def _normalize_dates(self, deposit, metadata):
    """Compute the (author date, committer date) pair for a deposit.

    The author date is read from 'codemeta:dateCreated' and the
    committer date from 'codemeta:datePublished'. When only one of
    the two is provided, it is used for both. When neither is
    provided, the deposit's complete_date is used for both.

    Args:
        deposit (Deposit): Deposit model representation
        metadata (Dict): Metadata dict representation

    Returns:
        Tuple of author date, committer date, each passed through
        normalize_date.

    """
    created = metadata.get('codemeta:dateCreated')
    published = metadata.get('codemeta:datePublished')

    # Each date falls back first on its sibling, then on the date the
    # deposit was completed.
    author_date = created or published or deposit.complete_date
    commit_date = published or created or deposit.complete_date

    normalized_author_date = normalize_date(author_date)
    normalized_commit_date = normalize_date(commit_date)
    return normalized_author_date, normalized_commit_date
@@ -169,12 +200,13 @@ revision_type = 'tar' revision_msg = '%s: Deposit %s in collection %s' % ( fullname, deposit.id, deposit.collection.name) - complete_date = identifiers.normalize_timestamp(deposit.complete_date) + + author_date, commit_date = self._normalize_dates(deposit, metadata) data['revision'] = { 'synthetic': True, - 'date': complete_date, - 'committer_date': complete_date, + 'date': author_date, + 'committer_date': commit_date, 'author': author_committer, 'committer': author_committer, 'type': revision_type, diff --git a/swh/deposit/tests/api/test_deposit_read_metadata.py b/swh/deposit/tests/api/test_deposit_read_metadata.py --- a/swh/deposit/tests/api/test_deposit_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_read_metadata.py @@ -23,6 +23,43 @@ """Deposit access to read metadata information on deposit. """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.template_metadata = """ + + Composing a Web of Audio Applications + hal + hal-01243065 + hal-01243065 + https://hal-test.archives-ouvertes.fr/hal-01243065 + test + DSP programming + this is the description + 1 + phpstorm + stable + php + python + C + + GNU General Public License v3.0 only + + + CeCILL Free Software License Agreement v1.1 + + + HAL + hal@ccsd.cnrs.fr + + + Morane Gruenpeter + +%s +""" + def test_read_metadata(self): """Private metadata read api to existing deposit should return metadata @@ -50,6 +87,7 @@ 'metadata': { '@xmlns': ['http://www.w3.org/2005/Atom'], 'author': ['some awesome author', 'another one', 'no one'], + 'codemeta:dateCreated': '2017-10-07T15:17:08Z', 'external_identifier': 'some-external-id', 'url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id' @@ -70,15 +108,30 @@ }, 'revision': { 'synthetic': True, - 'committer_date': None, + 'committer_date': { + 'timestamp': { + 'seconds': 1507389428, + 'microseconds': 0 + }, + 'offset': 0, + 'negative_utc': False + }, 'message': 'hal: Deposit %s in collection hal' % 
deposit_id, 'author': SWH_PERSON, 'committer': SWH_PERSON, - 'date': None, + 'date': { + 'timestamp': { + 'seconds': 1507389428, + 'microseconds': 0 + }, + 'offset': 0, + 'negative_utc': False + }, 'metadata': { '@xmlns': ['http://www.w3.org/2005/Atom'], 'author': ['some awesome author', 'another one', 'no one'], 'external_identifier': 'some-external-id', + 'codemeta:dateCreated': '2017-10-07T15:17:08Z', 'url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id' }, @@ -135,6 +188,7 @@ 'metadata': { '@xmlns': ['http://www.w3.org/2005/Atom'], 'author': ['some awesome author', 'another one', 'no one'], + 'codemeta:dateCreated': '2017-10-07T15:17:08Z', 'external_identifier': 'some-external-id', 'url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id' @@ -155,8 +209,22 @@ }, 'revision': { 'synthetic': True, - 'date': None, - 'committer_date': None, + 'date': { + 'timestamp': { + 'seconds': 1507389428, + 'microseconds': 0 + }, + 'offset': 0, + 'negative_utc': False + }, + 'committer_date': { + 'timestamp': { + 'seconds': 1507389428, + 'microseconds': 0 + }, + 'offset': 0, + 'negative_utc': False + }, 'author': SWH_PERSON, 'committer': SWH_PERSON, 'type': 'tar', @@ -164,6 +232,7 @@ 'metadata': { '@xmlns': ['http://www.w3.org/2005/Atom'], 'author': ['some awesome author', 'another one', 'no one'], + 'codemeta:dateCreated': '2017-10-07T15:17:08Z', 'external_identifier': 'some-external-id', 'url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id' @@ -175,6 +244,393 @@ self.assertEqual(data, expected_meta) + def test_read_metadata_3(self): + """date(Created|Published) provided, uses author/committer date + + """ + # add metadata to the deposit with datePublished and dateCreated + codemeta_entry_data = self.template_metadata % """ + 2015-04-06T17:08:47+02:00 + 2017-05-03T16:08:47+02:00 +""" + + deposit_id = self.create_deposit_partial_with_data_in_args( + codemeta_entry_data) + + url = reverse(PRIVATE_GET_DEPOSIT_METADATA, + 
args=[self.collection.name, deposit_id]) + + response = self.client.get(url) + + self.assertEqual(response.status_code, + status.HTTP_200_OK) + self.assertEqual(response._headers['content-type'][1], + 'application/json') + data = response.json() + + expected_origin = { + 'type': 'deposit', + 'url': 'https://hal-test.archives-ouvertes.fr/hal-01243065' + } + expected_metadata = { + '@xmlns': 'http://www.w3.org/2005/Atom', + '@xmlns:codemeta': + 'https://doi.org/10.5063/SCHEMA/CODEMETA-2.0', + 'author': { + 'email': 'hal@ccsd.cnrs.fr', + 'name': 'HAL' + }, + 'client': 'hal', + 'codemeta:applicationCategory': 'test', + 'codemeta:author': { + 'codemeta:name': 'Morane Gruenpeter' + }, + 'codemeta:dateCreated': '2015-04-06T17:08:47+02:00', + 'codemeta:datePublished': '2017-05-03T16:08:47+02:00', + 'codemeta:description': 'this is the description', + 'codemeta:developmentStatus': 'stable', + 'codemeta:keywords': 'DSP programming', + 'codemeta:license': [ + { + 'codemeta:name': 'GNU General Public License v3.0 only' + }, + { + 'codemeta:name': + 'CeCILL Free Software License Agreement v1.1' + } + ], + 'codemeta:programmingLanguage': [ + 'php', 'python', 'C' + ], + 'codemeta:runtimePlatform': 'phpstorm', + 'codemeta:url': 'https://hal-test.archives-ouvertes.fr/hal-01243065', # noqa + 'codemeta:version': '1', + 'external_identifier': 'hal-01243065', + 'id': 'hal-01243065', + 'title': 'Composing a Web of Audio Applications' + } + + expected_origin_metadata = { + 'metadata': expected_metadata, + 'provider': { + 'metadata': {}, + 'provider_name': 'hal', + 'provider_type': 'deposit_client', + 'provider_url': 'https://hal-test.archives-ouvertes.fr/' + }, + 'tool': { + 'configuration': { + 'sword_version': '2' + }, + 'name': 'swh-deposit', + 'version': '0.0.1' + } + } + + expected_revision = { + 'author': { + 'email': 'robot@softwareheritage.org', + 'fullname': 'Software Heritage', + 'name': 'Software Heritage' + }, + 'committer': { + 'email': 'robot@softwareheritage.org', + 
'fullname': 'Software Heritage', + 'name': 'Software Heritage' + }, + 'committer_date': { + 'negative_utc': False, + 'offset': 120, + 'timestamp': { + 'microseconds': 0, + 'seconds': 1493820527 + } + }, + 'date': { + 'negative_utc': False, + 'offset': 120, + 'timestamp': { + 'microseconds': 0, + 'seconds': 1428332927 + } + }, + 'message': 'hal: Deposit %s in collection hal' % deposit_id, + 'metadata': expected_metadata, + 'synthetic': True, + 'type': 'tar' + } + + expected_meta = { + 'branch_name': 'master', + 'origin': expected_origin, + 'origin_metadata': expected_origin_metadata, + 'revision': expected_revision, + } + + self.assertEqual(data, expected_meta) + + def test_read_metadata_4(self): + """dateCreated/datePublished not provided, revision uses complete_date + + """ + codemeta_entry_data = self.template_metadata % '' + + deposit_id = self.create_deposit_partial_with_data_in_args( + codemeta_entry_data) + + # will use the deposit completed date as fallback date + deposit = Deposit.objects.get(pk=deposit_id) + deposit.complete_date = '2016-04-06' + deposit.save() + + url = reverse(PRIVATE_GET_DEPOSIT_METADATA, + args=[self.collection.name, deposit_id]) + + response = self.client.get(url) + + self.assertEqual(response.status_code, + status.HTTP_200_OK) + self.assertEqual(response._headers['content-type'][1], + 'application/json') + data = response.json() + + expected_origin = { + 'type': 'deposit', + 'url': 'https://hal-test.archives-ouvertes.fr/hal-01243065' + } + expected_metadata = { + '@xmlns': 'http://www.w3.org/2005/Atom', + '@xmlns:codemeta': + 'https://doi.org/10.5063/SCHEMA/CODEMETA-2.0', + 'author': { + 'email': 'hal@ccsd.cnrs.fr', + 'name': 'HAL' + }, + 'client': 'hal', + 'codemeta:applicationCategory': 'test', + 'codemeta:author': { + 'codemeta:name': 'Morane Gruenpeter' + }, + 'codemeta:description': 'this is the description', + 'codemeta:developmentStatus': 'stable', + 'codemeta:keywords': 'DSP programming', + 'codemeta:license': [ + { + 
'codemeta:name': 'GNU General Public License v3.0 only' + }, + { + 'codemeta:name': + 'CeCILL Free Software License Agreement v1.1' + } + ], + 'codemeta:programmingLanguage': [ + 'php', 'python', 'C' + ], + 'codemeta:runtimePlatform': 'phpstorm', + 'codemeta:url': 'https://hal-test.archives-ouvertes.fr/hal-01243065', # noqa + 'codemeta:version': '1', + 'external_identifier': 'hal-01243065', + 'id': 'hal-01243065', + 'title': 'Composing a Web of Audio Applications' + } + + expected_origin_metadata = { + 'metadata': expected_metadata, + 'provider': { + 'metadata': {}, + 'provider_name': 'hal', + 'provider_type': 'deposit_client', + 'provider_url': 'https://hal-test.archives-ouvertes.fr/' + }, + 'tool': { + 'configuration': { + 'sword_version': '2' + }, + 'name': 'swh-deposit', + 'version': '0.0.1' + } + } + + expected_revision = { + 'author': { + 'email': 'robot@softwareheritage.org', + 'fullname': 'Software Heritage', + 'name': 'Software Heritage' + }, + 'committer': { + 'email': 'robot@softwareheritage.org', + 'fullname': 'Software Heritage', + 'name': 'Software Heritage' + }, + 'committer_date': { + 'negative_utc': False, + 'offset': 0, + 'timestamp': { + 'microseconds': 0, + 'seconds': 1459900800 + } + }, + 'date': { + 'negative_utc': False, + 'offset': 0, + 'timestamp': { + 'microseconds': 0, + 'seconds': 1459900800 + } + }, + 'message': 'hal: Deposit %s in collection hal' % deposit_id, + 'metadata': expected_metadata, + 'synthetic': True, + 'type': 'tar' + } + + expected_meta = { + 'branch_name': 'master', + 'origin': expected_origin, + 'origin_metadata': expected_origin_metadata, + 'revision': expected_revision, + } + + self.assertEqual(data, expected_meta) + + def test_read_metadata_5(self): + """dateCreated/datePublished provided, revision uses author/committer + date + + If multiple dateCreated provided, the first occurrence (of + dateCreated) is selected. If multiple datePublished provided, + the first occurrence (of datePublished) is selected. 
+ + """ + # add metadata to the deposit with multiple datePublished/dateCreated + codemeta_entry_data = self.template_metadata % """ + 2015-04-06T17:08:47+02:00 + 2017-05-03T16:08:47+02:00 + 2016-04-06T17:08:47+02:00 + 2018-05-03T16:08:47+02:00 +""" + + deposit_id = self.create_deposit_partial_with_data_in_args( + codemeta_entry_data) + + url = reverse(PRIVATE_GET_DEPOSIT_METADATA, + args=[self.collection.name, deposit_id]) + + response = self.client.get(url) + + self.assertEqual(response.status_code, + status.HTTP_200_OK) + self.assertEqual(response._headers['content-type'][1], + 'application/json') + data = response.json() + + expected_origin = { + 'type': 'deposit', + 'url': 'https://hal-test.archives-ouvertes.fr/hal-01243065' + } + expected_metadata = { + '@xmlns': 'http://www.w3.org/2005/Atom', + '@xmlns:codemeta': + 'https://doi.org/10.5063/SCHEMA/CODEMETA-2.0', + 'author': { + 'email': 'hal@ccsd.cnrs.fr', + 'name': 'HAL' + }, + 'client': 'hal', + 'codemeta:applicationCategory': 'test', + 'codemeta:author': { + 'codemeta:name': 'Morane Gruenpeter' + }, + 'codemeta:dateCreated': [ + '2015-04-06T17:08:47+02:00', + '2016-04-06T17:08:47+02:00', + ], + 'codemeta:datePublished': [ + '2017-05-03T16:08:47+02:00', + '2018-05-03T16:08:47+02:00', + ], + 'codemeta:description': 'this is the description', + 'codemeta:developmentStatus': 'stable', + 'codemeta:keywords': 'DSP programming', + 'codemeta:license': [ + { + 'codemeta:name': 'GNU General Public License v3.0 only' + }, + { + 'codemeta:name': + 'CeCILL Free Software License Agreement v1.1' + } + ], + 'codemeta:programmingLanguage': [ + 'php', 'python', 'C' + ], + 'codemeta:runtimePlatform': 'phpstorm', + 'codemeta:url': 'https://hal-test.archives-ouvertes.fr/hal-01243065', # noqa + 'codemeta:version': '1', + 'external_identifier': 'hal-01243065', + 'id': 'hal-01243065', + 'title': 'Composing a Web of Audio Applications' + } + + expected_origin_metadata = { + 'metadata': expected_metadata, + 'provider': { + 
'metadata': {}, + 'provider_name': 'hal', + 'provider_type': 'deposit_client', + 'provider_url': 'https://hal-test.archives-ouvertes.fr/' + }, + 'tool': { + 'configuration': { + 'sword_version': '2' + }, + 'name': 'swh-deposit', + 'version': '0.0.1' + } + } + + expected_revision = { + 'author': { + 'email': 'robot@softwareheritage.org', + 'fullname': 'Software Heritage', + 'name': 'Software Heritage' + }, + 'committer': { + 'email': 'robot@softwareheritage.org', + 'fullname': 'Software Heritage', + 'name': 'Software Heritage' + }, + 'committer_date': { + 'negative_utc': False, + 'offset': 120, + 'timestamp': { + 'microseconds': 0, + 'seconds': 1493820527 + } + }, + 'date': { + 'negative_utc': False, + 'offset': 120, + 'timestamp': { + 'microseconds': 0, + 'seconds': 1428332927 + } + }, + 'message': 'hal: Deposit %s in collection hal' % deposit_id, + 'metadata': expected_metadata, + 'synthetic': True, + 'type': 'tar' + } + + expected_meta = { + 'branch_name': 'master', + 'origin': expected_origin, + 'origin_metadata': expected_origin_metadata, + 'revision': expected_revision, + } + + self.assertEqual(data, expected_meta) + def test_access_to_nonexisting_deposit_returns_404_response(self): """Read unknown collection should return a 404 response diff --git a/swh/deposit/tests/common.py b/swh/deposit/tests/common.py --- a/swh/deposit/tests/common.py +++ b/swh/deposit/tests/common.py @@ -346,6 +346,7 @@ another one no one + 2017-10-07T15:17:08Z """ self.atom_entry_data2 = b""" @@ -488,6 +489,9 @@ deposit id """ + if isinstance(data, str): + data = data.encode('utf-8') + response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', diff --git a/swh/deposit/tests/loader/test_loader.py b/swh/deposit/tests/loader/test_loader.py --- a/swh/deposit/tests/loader/test_loader.py +++ b/swh/deposit/tests/loader/test_loader.py @@ -59,12 +59,10 @@ # create the extraction dir used by the loader 
os.makedirs(TEST_LOADER_CONFIG['extraction_dir'], exist_ok=True) - # 1. create a deposit with archive and metadata - self.deposit_id = self.create_simple_binary_deposit() - # 2. Sets a basic client which accesses the test data + # Sets a basic client which accesses the test data loader_client = SWHDepositTestClient(self.client, config=CLIENT_TEST_CONFIG) - # 3. setup loader with that client + # Setup loader with that client self.loader = loader.DepositLoader(client=loader_client) self.storage = self.loader.storage @@ -77,7 +75,11 @@ """Load a deposit which is ready """ - args = [self.collection.name, self.deposit_id] + # create a deposit with archive and metadata + deposit_id = self.create_simple_binary_deposit() + self.update_binary_deposit(deposit_id, status_partial=False) + + args = [self.collection.name, deposit_id] archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args) deposit_meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args) @@ -100,9 +102,9 @@ """Load a deposit with metadata, test metadata integrity """ - self.deposit_metadata_id = self.add_metadata_to_deposit( - self.deposit_id) - args = [self.collection.name, self.deposit_metadata_id] + deposit_id = self.create_simple_binary_deposit() + self.add_metadata_to_deposit(deposit_id, status_partial=False) + args = [self.collection.name, deposit_id] archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args) deposit_meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args) @@ -157,7 +159,7 @@ self.assertOriginMetadataContains('deposit', origin_url, expected_origin_metadata) - deposit = Deposit.objects.get(pk=self.deposit_id) + deposit = Deposit.objects.get(pk=deposit_id) self.assertRegex(deposit.swh_id, r'^swh:1:dir:.*') self.assertEqual(deposit.swh_id_context, '%s;origin=%s' % ( diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -1,10 +1,11 @@ -# Copyright (C) 2018 The Software Heritage 
@patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x)
def test_normalize_date_0(mock_normalize):
    """A list of dates is reduced to its first element before parsing

    Note: normalize_timestamp is patched to the identity function, as
    swh.model.identifiers is already tested in swh.model

    """
    candidates = ['2017-10-12', 'date1']

    parsed = utils.normalize_date(candidates)

    assert str(parsed) == '2017-10-12 00:00:00+00:00'


@patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x)
def test_normalize_date_1(mock_normalize):
    """A full date and time string is parsed as is

    Note: normalize_timestamp is patched to the identity function, as
    swh.model.identifiers is already tested in swh.model

    """
    parsed = utils.normalize_date('2018-06-11 17:02:02')

    assert str(parsed) == '2018-06-11 17:02:02+00:00'


@patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x)
def test_normalize_date_doing_irrelevant_stuff(mock_normalize):
    """A year-only string is parsed as the first instant of that year

    Note: normalize_timestamp is patched to the identity function, as
    swh.model.identifiers is already tested in swh.model

    """
    parsed = utils.normalize_date('2017')

    assert str(parsed) == '2017-01-01 00:00:00+00:00'
def normalize_date(date):
    """Normalize date fields as expected by swh workers.

    If date is a list (or tuple), elect arbitrarily the first element
    of that sequence. An empty sequence is treated as a missing date
    (None) instead of raising an IndexError.

    If date is (then) a string, parse it through iso8601.parse_date
    to extract a datetime.

    Then normalize it through
    swh.model.identifiers.normalize_timestamp.

    Args:
        date: a date string, a datetime, None, or a list/tuple of
            those (as happens when a metadata field is provided
            multiple times)

    Returns
        The swh date object

    Raises
        iso8601.ParseError if date is (or starts with) a string which
        is not a valid ISO 8601 date

    """
    if isinstance(date, (list, tuple)):
        # Multiple values provided: only the first one is considered;
        # no value at all is equivalent to no date.
        date = date[0] if date else None
    if isinstance(date, str):
        date = iso8601.parse_date(date)

    return normalize_timestamp(date)