diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -9,6 +9,7 @@
getting-started.md
spec-api.md
+ metadata.md
spec-injection.md
dev-info.md
sys-info.md
diff --git a/docs/metadata.md b/docs/metadata.md
--- a/docs/metadata.md
+++ b/docs/metadata.md
@@ -8,31 +8,44 @@
## Metadata requirements
MUST
-- the schema/vocabulary used *MUST* be specified with a persistant url
+- **the schema/vocabulary** used *MUST* be specified with a persistent url
(DublinCore, DOAP, CodeMeta, etc.)
-- the origin url *MUST* be defined depending on the schema you use:
```XML
-
-hal.archives-ouvertes.fr
-hal.archives-ouvertes.fr
-hal.archives-ouvertes.fr
+
+or
+
+or
+
```
+- **the url** representing the location of the source *MUST* be provided
+under the url tag. The url will be used for creating an origin object in the
+archive.
+```XML
+www.url-example.com
+or
+www.url-example.com
+or
+www.url-example.com
+```
+- **the external_identifier** *MUST* be provided as an identifier
+- **the name** of the software deposit *MUST* be provided
+[atom:title, codemeta:name, dcterms:title]
+- **the authors** of the software deposit *MUST* be provided
SHOULD
-- the external_identifier *SHOULD* match the Slug external-identifier in
+- **the external_identifier** *SHOULD* match the Slug external-identifier in
the header
-- the following metadata *SHOULD* be included using the correct terminology
-(depending on the schema you are using- the CodeMeta crosswalk table can
- help you identify the terms):
- - codemeta:name - the software artifact title
- - codemeta:description - short or long description of the software in the
- deposit
- - codemeta:license - the software license/s
- - codemeta:author - the software authors
+- **the description** of the software deposit *SHOULD* be provided
+[codemeta:description] - short or long description of the software
+- **the license/s** of the software deposit *SHOULD* be provided
+[codemeta:license]
+
MAY
- - other metadata *MAY* be added with terms defined by the schema in use.
+- other metadata *MAY* be added with terms defined by the schema in use.
## Examples
### Using only Atom
diff --git a/docs/spec-injection.md b/docs/spec-injection.md
--- a/docs/spec-injection.md
+++ b/docs/spec-injection.md
@@ -22,14 +22,15 @@
Some of those metadata will also be included in the `origin_metadata`
table.
-
- origin | https://hal.inria.fr/hal-id
--------------------------------------|----------------------------------------
- origin_visit | 1 :reception_date
- occurrence & occurrence_history | branch: client's version n° (e.g hal)
- revision | synthetic_revision (tarball)
- directory | upper level of the uncompressed archive
-
+```
+origin | https://hal.inria.fr/hal-id |
+------------------------------------|----------------------------------------|
+origin_visit | 1 :reception_date |
+origin_metadata | aggregated metadata |
+occurrence & occurrence_history | branch: client's version n° (e.g hal) |
+revision | synthetic_revision (tarball) |
+directory | upper level of the uncompressed archive|
+```
### Questions raised concerning injection
@@ -200,12 +201,19 @@
`origin_metadata` table before translation as part of the injection
process and an indexation process should be scheduled.
+- provider_id and tool_id are resolved by the prepare_metadata method in the
+loader-core
+
+- the origin_metadata entry is sent to storage by the send_origin_metadata
+method in the loader-core
+
+
origin_metadata table:
```
id bigint PK
origin bigint
discovery_date date
provider_id bigint FK // (from provider table)
+tool_id bigint FK // indexer_configuration_id tool used for extraction
metadata jsonb // before translation
-indexer_configuration_id bigint FK // tool used for extraction
```
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -71,7 +71,7 @@
"""
ADDITIONAL_CONFIG = {
- 'extraction_dir': ('str', '/tmp/swh-deposit/archive/')
+ 'extraction_dir': ('str', '/tmp/swh-deposit/archive/'),
}
def __init__(self):
@@ -120,6 +120,28 @@
"""Class in charge of aggregating metadata on a deposit.
"""
+ # Default descriptions of the metadata 'provider' and of the 'tool',
+ # presumably declared as (type, default value) pairs -- TODO confirm
+ # against the configuration loader.
+ # 'provider_name' and 'provider_url' are left empty here: they are
+ # overwritten for each deposit with the deposit client's last name and
+ # url when the deposit metadata is read.
+ ADDITIONAL_CONFIG = {
+ 'provider': ('dict', {
+ 'provider_name': '',
+ 'provider_type': 'deposit_client',
+ 'provider_url': '',
+ 'metadata': {
+ }
+ }),
+ 'tool': ('dict', {
+ 'tool_name': 'swh-deposit',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {
+ 'sword_version': '2'
+ }
+ })
+ }
+
+    def __init__(self):
+        """Initialize the parent class, then keep the configured metadata
+        provider and tool descriptions as instance attributes.
+
+        """
+        super().__init__()
+        config = self.config
+        self.provider = config['provider']
+        self.tool = config['tool']
+
def _aggregate_metadata(self, deposit, metadata_requests):
"""Retrieve and aggregates metadata information.
@@ -143,15 +165,15 @@
"""
data = {}
- metadata_requests = []
# Retrieve tarballs/metadata information
- metadata = self._aggregate_metadata(deposit, metadata_requests)
+ metadata = self._aggregate_metadata(deposit, requests)
# Read information metadata
data['origin'] = {
- 'type': deposit.collection.name,
- 'url': deposit.external_id,
+ 'type': 'deposit',
+ 'url': os.path.join(deposit.client.url.rstrip('/'),
+ deposit.external_id),
}
# revision
@@ -163,6 +185,10 @@
'email': deposit.client.email,
}
+ # metadata provider
+ self.provider['provider_name'] = deposit.client.last_name
+ self.provider['provider_url'] = deposit.client.url
+
revision_type = 'tar'
revision_msg = '%s: Deposit %s in collection %s' % (
fullname, deposit.id, deposit.collection.name)
@@ -188,6 +214,11 @@
data['occurrence'] = {
'branch': 'master'
}
+ data['origin_metadata'] = {
+ 'provider': self.provider,
+ 'tool': self.tool,
+ 'metadata': metadata
+ }
return data
diff --git a/swh/deposit/injection/loader.py b/swh/deposit/injection/loader.py
--- a/swh/deposit/injection/loader.py
+++ b/swh/deposit/injection/loader.py
@@ -135,14 +135,35 @@
visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
revision = metadata['revision']
occurrence = metadata['occurrence']
+ self.origin_metadata = metadata['origin_metadata']
+ self.prepare_metadata()
self.client.update_deposit_status(deposit_update_url, 'injecting')
+
super().prepare(tar_path=archive,
origin=origin,
visit_date=visit_date,
revision=revision,
occurrences=[occurrence])
+    def store_metadata(self):
+        """Store the origin_metadata during the load process.
+
+        Provider_id and tool_id are resolved during the prepare() method.
+
+        Raises:
+            Exception: any error raised while sending the origin_metadata
+                is logged, then re-raised unchanged.
+
+        """
+        origin_id = self.origin_id
+        visit_date = self.visit_date
+        provider_id = self.origin_metadata['provider']['provider_id']
+        tool_id = self.origin_metadata['tool']['tool_id']
+        metadata = self.origin_metadata['metadata']
+        try:
+            self.send_origin_metadata(origin_id, visit_date, provider_id,
+                                      tool_id, metadata)
+        except Exception:
+            # Restricted to Exception so that KeyboardInterrupt and
+            # SystemExit propagate without being logged as storage problems.
+            self.log.exception('Problem when storing origin_metadata')
+            raise
+
def post_load(self, success=True):
"""Updating the deposit's status according to its loading status.
diff --git a/swh/deposit/migrations/0006_depositclient_url.py b/swh/deposit/migrations/0006_depositclient_url.py
new file mode 100644
--- /dev/null
+++ b/swh/deposit/migrations/0006_depositclient_url.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.10.7 on 2017-11-07 13:12
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ # Adds the 'url' text field to the 'depositclient' model.
+
+ dependencies = [
+ ('deposit', '0005_auto_20171019_1436'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='depositclient',
+ name='url',
+ # With preserve_default=False the default below is only used to
+ # populate already existing rows during the migration; it is not
+ # kept as the field's default afterwards.
+ field=models.TextField(default='https://hal.archives-ouvertes.fr/'),
+ preserve_default=False,
+ ),
+ ]
diff --git a/swh/deposit/models.py b/swh/deposit/models.py
--- a/swh/deposit/models.py
+++ b/swh/deposit/models.py
@@ -72,6 +72,7 @@
"""
collections = ArrayField(models.IntegerField(), null=True)
objects = UserManager()
+ url = models.TextField(null=False)
class Meta:
db_table = 'deposit_client'
diff --git a/swh/deposit/tests/__init__.py b/swh/deposit/tests/__init__.py
--- a/swh/deposit/tests/__init__.py
+++ b/swh/deposit/tests/__init__.py
@@ -12,6 +12,20 @@
TEST_CONFIG = {
'max_upload_size': 500,
'extraction_dir': '/tmp/swh-deposit/test/extraction-dir',
+ 'provider': {
+ 'provider_name': '',
+ 'provider_type': 'deposit_client',
+ 'provider_url': '',
+ 'metadata': {
+ }
+ },
+ 'tool': {
+ 'tool_name': 'swh-deposit',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {
+ 'sword_version': '2'
+ }
+ }
}
diff --git a/swh/deposit/tests/api/test_deposit_read_metadata.py b/swh/deposit/tests/api/test_deposit_read_metadata.py
--- a/swh/deposit/tests/api/test_deposit_read_metadata.py
+++ b/swh/deposit/tests/api/test_deposit_read_metadata.py
@@ -37,8 +37,27 @@
expected_meta = {
'origin': {
- 'url': 'some-external-id',
- 'type': 'hal'
+ 'url': 'https://hal.test.fr/some-external-id',
+ 'type': 'deposit'
+ },
+ 'origin_metadata': {
+ 'metadata': {
+ '{http://www.w3.org/2005/Atom}external_identifier':
+ 'some-external-id'
+ },
+ 'provider': {
+ 'provider_name': '',
+ 'provider_type': 'deposit_client',
+ 'provider_url': 'https://hal.test.fr/',
+ 'metadata': {}
+ },
+ 'tool': {
+ 'tool_name': 'swh-deposit',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {
+ 'sword_version': '2'
+ }
+ }
},
'revision': {
'synthetic': True,
@@ -51,7 +70,10 @@
'fullname': '', 'email': '', 'name': ''
},
'date': None,
- 'metadata': {},
+ 'metadata': {
+ '{http://www.w3.org/2005/Atom}external_identifier':
+ 'some-external-id'
+ },
'type': 'tar'
},
'occurrence': {
diff --git a/swh/deposit/tests/common.py b/swh/deposit/tests/common.py
--- a/swh/deposit/tests/common.py
+++ b/swh/deposit/tests/common.py
@@ -16,7 +16,8 @@
from rest_framework import status
from swh.deposit.config import COL_IRI, EM_IRI, EDIT_SE_IRI
-from swh.deposit.models import DepositClient, DepositCollection
+from swh.deposit.models import DepositClient, DepositCollection, Deposit
+from swh.deposit.models import DepositRequest
from swh.deposit.models import DepositRequestType
from swh.deposit.parsers import parse_xml
from swh.deposit.settings.testing import MEDIA_ROOT
@@ -97,7 +98,7 @@
super().tearDown()
shutil.rmtree(self.root_path)
- def create_simple_binary_deposit(self, status_partial=False):
+ def create_simple_binary_deposit(self, status_partial=True):
response = self.client.post(
reverse(COL_IRI, args=[self.collection.name]),
content_type='application/zip',
@@ -160,12 +161,14 @@
deposit_request_types[deposit_request_type] = drt
_name = 'hal'
+ _url = 'https://hal.test.fr/'
# set collection up
_collection = DepositCollection(name=_name)
_collection.save()
# set user/client up
_client = DepositClient.objects.create_user(username=_name,
- password=_name)
+ password=_name,
+ url=_url)
_client.collections = [_collection.id]
_client.save()
@@ -225,6 +228,57 @@
anotherthing
"""
+ self.atom_entry_data2 = b"""
+
+ Awesome Compiler
+ urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
+ 1785io25c695
+ 2017-10-07T15:17:08Z
+ some awesome author
+ """
+
+ self.codemeta_entry_data0 = b"""
+
+ Awesome Compiler
+ urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
+ 1785io25c695
+ 2017-10-07T15:17:08Z
+ some awesome author
+ description
+ key-word 1
+ """
+
+ self.codemeta_entry_data1 = b"""
+
+ Composing a Web of Audio Applications
+ hal
+ hal-01243065
+ hal-01243065
+ https://hal-test.archives-ouvertes.fr/hal-01243065
+ test
+ DSP programming,Web
+ 2017-05-03T16:08:47+02:00
+ this is the description
+ 1
+ phpstorm
+ stable
+ php
+ python
+ C
+
+ GNU General Public License v3.0 only
+
+
+ CeCILL Free Software License Agreement v1.1
+
+
+ HAL
+ hal@ccsd.cnrs.fr
+
+"""
+
def create_deposit_with_status_rejected(self):
url = reverse(COL_IRI, args=[self.collection.name])
@@ -272,6 +326,30 @@
'{http://www.w3.org/2005/Atom}deposit_id']
return deposit_id
+    def create_deposit_partial_with_data_in_args(self, data):
+        """Post `data` as the single request of a new deposit, left in the
+        `partial` state, and return the identifier of that deposit.
+
+        Args:
+            data: atom entry
+
+        Returns:
+            deposit id
+
+        """
+        collection_url = reverse(COL_IRI, args=[self.collection.name])
+        response = self.client.post(
+            collection_url,
+            content_type='application/atom+xml;type=entry',
+            data=data,
+            HTTP_SLUG='external-id',
+            HTTP_IN_PROGRESS='true')
+
+        assert response.status_code == status.HTTP_201_CREATED
+        parsed_response = parse_xml(BytesIO(response.content))
+        return parsed_response['{http://www.w3.org/2005/Atom}deposit_id']
+
def _update_deposit_with_status(self, deposit_id, status_partial=False):
"""Add to a given deposit another archive and update its current
status to `ready` (by default).
@@ -308,3 +386,27 @@
deposit_id = self._update_deposit_with_status(
deposit_id, status_partial=True)
return deposit_id
+
+    def add_metadata_to_deposit(self, deposit_id, status_partial=False):
+        """Add metadata (the codemeta entry) to an existing deposit.
+
+        Args:
+            deposit_id: identifier of the deposit to update
+            status_partial: value sent as the In-Progress header
+
+        Returns:
+            the identifier of the updated deposit
+
+        """
+        # when
+        response = self.client.post(
+            reverse(EDIT_SE_IRI, args=[self.collection.name, deposit_id]),
+            content_type='application/atom+xml;type=entry',
+            data=self.codemeta_entry_data1,
+            HTTP_SLUG='external-id',
+            HTTP_IN_PROGRESS=status_partial)
+        assert response.status_code == status.HTTP_201_CREATED
+        # then
+        deposit = Deposit.objects.get(pk=deposit_id)
+        assert deposit is not None
+
+        deposit_requests = DepositRequest.objects.filter(deposit=deposit)
+        # Check by size: an identity comparison against a fresh list
+        # literal can never fail.
+        assert len(deposit_requests) > 0
+
+        for dr in deposit_requests:
+            if dr.type.name == 'metadata':
+                # Check the request being iterated, by value: its metadata
+                # must not be empty.
+                assert dr.metadata
+        return deposit_id
diff --git a/swh/deposit/tests/test_loader.py b/swh/deposit/tests/test_loader.py
--- a/swh/deposit/tests/test_loader.py
+++ b/swh/deposit/tests/test_loader.py
@@ -25,6 +25,10 @@
from .common import FileSystemCreationRoutine
+TOOL_ID = 99
+PROVIDER_ID = 12
+
+
class DepositLoaderInhibitsStorage:
"""Mixin class to inhibit the persistence and keep in memory the data
sent for storage.
@@ -38,11 +42,14 @@
self.state = {
'origin': [],
'origin_visit': [],
+ 'origin_metadata': [],
'content': [],
'directory': [],
'revision': [],
'release': [],
'occurrence': [],
+ 'tool': [],
+ 'provider': []
}
def _add(self, type, l):
@@ -73,6 +80,39 @@
self._add('origin_visit', [origin_visit])
return origin_visit
+    def send_origin_metadata(self, origin_id, visit_date, provider_id, tool_id,
+                             metadata):
+        """Keep the origin_metadata entry in the in-memory state instead of
+        sending it to storage, and return that entry.
+
+        """
+        entry = dict(
+            origin_id=origin_id,
+            visit_date=visit_date,
+            provider_id=provider_id,
+            tool_id=tool_id,
+            metadata=metadata,
+        )
+        self._add('origin_metadata', [entry])
+        return entry
+
+    def send_tool(self, tool):
+        """Keep the tool description in the in-memory state and return the
+        fixed TOOL_ID.
+
+        """
+        recorded = {
+            key: tool[key]
+            for key in ('tool_name', 'tool_version', 'tool_configuration')
+        }
+        self._add('tool', [recorded])
+        return TOOL_ID
+
+    def send_provider(self, provider):
+        """Keep the provider description in the in-memory state and return
+        the fixed PROVIDER_ID.
+
+        """
+        recorded = {
+            key: provider[key]
+            for key in ('provider_name', 'provider_type', 'provider_url',
+                        'metadata')
+        }
+        self._add('provider', [recorded])
+        return PROVIDER_ID
+
def maybe_load_contents(self, contents):
self._add('content', contents)
@@ -218,3 +258,62 @@
# FIXME enrich state introspection
# expected_revisions = {}
# self.assertRevisionsOk(expected_revisions)
+
+    @istest
+    def inject_deposit_verify_metadata(self):
+        """Load a deposit with metadata, test metadata integrity
+
+        The loader keeps what it would send to storage in memory; the
+        origin_metadata entry recorded there is compared with the metadata
+        posted beforehand, along with the resolved tool and provider ids.
+
+        """
+        self.deposit_metadata_id = self.add_metadata_to_deposit(
+            self.deposit_id)
+        args = [self.collection.name, self.deposit_metadata_id]
+
+        archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args)
+        deposit_meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args)
+        deposit_update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args)
+
+        # when
+        self.loader.load(archive_url=archive_url,
+                         deposit_meta_url=deposit_meta_url,
+                         deposit_update_url=deposit_update_url)
+
+        # then
+        # assertEqual is used rather than the deprecated assertEquals alias
+        self.assertEqual(len(self.loader.state['content']), 1)
+        self.assertEqual(len(self.loader.state['directory']), 1)
+        self.assertEqual(len(self.loader.state['revision']), 1)
+        self.assertEqual(len(self.loader.state['release']), 0)
+        self.assertEqual(len(self.loader.state['occurrence']), 1)
+        self.assertEqual(len(self.loader.state['origin_metadata']), 1)
+        self.assertEqual(len(self.loader.state['tool']), 1)
+        self.assertEqual(len(self.loader.state['provider']), 1)
+
+        atom = '{http://www.w3.org/2005/Atom}'
+        codemeta = '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}'
+        expected_origin_metadata = {
+            atom + 'author': {
+                atom + 'email': 'hal@ccsd.cnrs.fr',
+                atom + 'name': 'HAL'
+            },
+            codemeta + 'url':
+            'https://hal-test.archives-ouvertes.fr/hal-01243065',
+            codemeta + 'runtimePlatform': 'phpstorm',
+            codemeta + 'license': {
+                codemeta + 'name':
+                'CeCILL Free Software License Agreement v1.1'
+            },
+            codemeta + 'programmingLanguage': 'C',
+            codemeta + 'applicationCategory': 'test',
+            codemeta + 'dateCreated': '2017-05-03T16:08:47+02:00',
+            codemeta + 'version': 1,
+            atom + 'external_identifier': 'hal-01243065',
+            atom + 'title': 'Composing a Web of Audio Applications',
+            codemeta + 'description': 'this is the description',
+            atom + 'id': 'hal-01243065',
+            atom + 'client': 'hal',
+            codemeta + 'keywords': 'DSP programming,Web',
+            codemeta + 'developmentStatus': 'stable'
+        }
+        result = self.loader.state['origin_metadata'][0]
+        self.assertEqual(result['metadata'], expected_origin_metadata)
+        self.assertEqual(result['tool_id'], TOOL_ID)
+        self.assertEqual(result['provider_id'], PROVIDER_ID)