diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
index a6f4bd45..4141fdb5 100644
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,264 +1,241 @@
 # Copyright (C) 2017-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import json
 import os
 import shutil
 import tempfile

 from contextlib import contextmanager
-from dateutil import parser
 from django.http import FileResponse
 from rest_framework import status

 from swh.core import tarball
 from swh.model import identifiers
+from swh.deposit.utils import normalize_date

 from . import DepositReadMixin
 from ...config import SWH_PERSON, ARCHIVE_TYPE
 from ..common import SWHGetDepositAPI, SWHPrivateAPIView
 from ...models import Deposit


 @contextmanager
 def aggregate_tarballs(extraction_dir, archive_paths):
     """Aggregate multiple tarballs into one and return the new archive's
        path.

     Args:
         extraction_dir (path): Path to use for the tarballs computation
         archive_paths ([str]): Deposit's archive paths

     Returns:
         Tuple (directory to clean up, archive path (aggregated or not))

     """
     if len(archive_paths) > 1:  # need to rebuild one archive from multiple ones
         os.makedirs(extraction_dir, 0o755, exist_ok=True)
         dir_path = tempfile.mkdtemp(prefix='swh.deposit-', dir=extraction_dir)
         # root folder to build an aggregated tarball
         aggregated_tarball_rootdir = os.path.join(dir_path, 'aggregate')
         os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)

         # uncompress in a temporary location all archives
         for archive_path in archive_paths:
             tarball.uncompress(archive_path, aggregated_tarball_rootdir)

         # Aggregate into one big tarball the multiple smaller ones
         temp_tarpath = tarball.compress(
             aggregated_tarball_rootdir + '.zip', nature='zip',
             dirpath_or_files=aggregated_tarball_rootdir)

         # can already clean up temporary directory
         shutil.rmtree(aggregated_tarball_rootdir)

         try:
             yield temp_tarpath
         finally:
             shutil.rmtree(dir_path)
     else:  # only 1 archive, no need to do fancy actions (and no cleanup step)
         yield archive_paths[0]


 class SWHDepositReadArchives(SWHGetDepositAPI, SWHPrivateAPIView,
                              DepositReadMixin):
     """Dedicated class to read a deposit's raw archives content.

     Only GET is supported.

     """
     ADDITIONAL_CONFIG = {
         'extraction_dir': ('str', '/tmp/swh-deposit/archive/'),
     }

     def __init__(self):
         super().__init__()
         self.extraction_dir = self.config['extraction_dir']
         if not os.path.exists(self.extraction_dir):
             os.makedirs(self.extraction_dir)

     def process_get(self, req, collection_name, deposit_id):
         """Build a single tarball from the multiple archives received and
            stream that content to the client.

         Args:
             req (Request):
             collection_name (str): Collection owning the deposit
             deposit_id (id): Deposit concerned by the reading

         Returns:
             Tuple status, stream of content, content-type

         """
         archive_paths = [r.archive.path for r in self._deposit_requests(
             deposit_id, request_type=ARCHIVE_TYPE)]
         with aggregate_tarballs(self.extraction_dir, archive_paths) as path:
             return FileResponse(open(path, 'rb'),
                                 status=status.HTTP_200_OK,
                                 content_type='application/octet-stream')


 class SWHDepositReadMetadata(SWHGetDepositAPI, SWHPrivateAPIView,
                              DepositReadMixin):
     """Class in charge of aggregating metadata on a deposit.
""" ADDITIONAL_CONFIG = { 'provider': ('dict', { # 'provider_name': '', # those are not set since read from the # 'provider_url': '', # deposit's client 'provider_type': 'deposit_client', 'metadata': {} }), 'tool': ('dict', { 'name': 'swh-deposit', 'version': '0.0.1', 'configuration': { 'sword_version': '2' } }) } def __init__(self): super().__init__() self.provider = self.config['provider'] self.tool = self.config['tool'] def _retrieve_url(self, deposit, metadata): client_domain = deposit.client.domain for field in metadata: if 'url' in field: if client_domain in metadata[field]: return metadata[field] - def _prepare_date(self, date): - """Prepare date fields as normalized swh date - - If date is a list, elect arbitrarily the first element of that - list - - If date is (then) a string, parse it through - dateutil.parser.parse to extract a datetime. - - Then normalize it through - swh.model.identifiers.normalize_timestamp. - - Returns - The swh date object - - """ - if isinstance(date, list): - date = date[0] - if isinstance(date, str): - date = parser.parse(date) - - return identifiers.normalize_timestamp(date) - def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. Args: deposit (Deposit): Deposit model representation metadata (Dict): Metadata dict representation Returns: Tuple of author date, committer date. Those dates are swh normalized. """ commit_date = metadata.get('codemeta:datePublished') author_date = metadata.get('codemeta:dateCreated') if author_date and commit_date: pass elif commit_date: author_date = commit_date elif author_date: commit_date = author_date else: author_date = deposit.complete_date commit_date = deposit.complete_date return ( - self._prepare_date(author_date), - self._prepare_date(commit_date) + normalize_date(author_date), + normalize_date(commit_date) ) def metadata_read(self, deposit): """Read and aggregate multiple data on deposit into one unified data dictionary. Args: deposit (Deposit): Deposit concerned by the data aggregation. Returns: Dictionary of data representing the deposit to inject in swh. 
""" data = {} metadata = self._metadata_get(deposit) # create origin_url from metadata only after deposit_check validates it origin_url = self._retrieve_url(deposit, metadata) # Read information metadata data['origin'] = { 'type': 'deposit', 'url': origin_url } # revision fullname = deposit.client.username author_committer = SWH_PERSON # metadata provider self.provider['provider_name'] = deposit.client.last_name self.provider['provider_url'] = deposit.client.provider_url revision_type = 'tar' revision_msg = '%s: Deposit %s in collection %s' % ( fullname, deposit.id, deposit.collection.name) author_date, commit_date = self._normalize_dates(deposit, metadata) data['revision'] = { 'synthetic': True, 'date': author_date, 'committer_date': commit_date, 'author': author_committer, 'committer': author_committer, 'type': revision_type, 'message': revision_msg, 'metadata': metadata, } if deposit.parent: swh_persistent_id = deposit.parent.swh_id persistent_identifier = identifiers.parse_persistent_identifier( swh_persistent_id) parent_revision = persistent_identifier.object_id data['revision']['parents'] = [parent_revision] data['branch_name'] = 'master' data['origin_metadata'] = { 'provider': self.provider, 'tool': self.tool, 'metadata': metadata } return data def process_get(self, req, collection_name, deposit_id): deposit = Deposit.objects.get(pk=deposit_id) data = self.metadata_read(deposit) d = {} if data: d = json.dumps(data) return status.HTTP_200_OK, d, 'application/json' diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py index aa3ecfa7..1fff7f60 100644 --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -1,132 +1,182 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest +from unittest.mock import patch from swh.deposit import utils class UtilsTestCase(unittest.TestCase): """Utils library """ def test_merge(self): """Calling utils.merge on dicts should merge without losing information """ d0 = { 'author': 'someone', 'license': [['gpl2']], 'a': 1 } d1 = { 'author': ['author0', {'name': 'author1'}], 'license': [['gpl3']], 'b': { '1': '2' } } d2 = { 'author': map(lambda x: x, ['else']), 'license': 'mit', 'b': { '2': '3', } } d3 = { 'author': (v for v in ['no one']), } actual_merge = utils.merge(d0, d1, d2, d3) expected_merge = { 'a': 1, 'license': [['gpl2'], ['gpl3'], 'mit'], 'author': [ 'someone', 'author0', {'name': 'author1'}, 'else', 'no one'], 'b': { '1': '2', '2': '3', } } self.assertEqual(actual_merge, expected_merge) def test_merge_2(self): d0 = { 'license': 'gpl2', 'runtime': { 'os': 'unix derivative' } } d1 = { 'license': 'gpl3', 'runtime': 'GNU/Linux' } expected = { 'license': ['gpl2', 'gpl3'], 'runtime': [ { 'os': 'unix derivative' }, 'GNU/Linux' ], } actual = utils.merge(d0, d1) self.assertEqual(actual, expected) def test_merge_edge_cases(self): input_dict = { 'license': ['gpl2', 'gpl3'], 'runtime': [ { 'os': 'unix derivative' }, 'GNU/Linux' ], } # against empty dict actual = utils.merge(input_dict, {}) self.assertEqual(actual, input_dict) # against oneself actual = utils.merge(input_dict, input_dict, input_dict) self.assertEqual(input_dict, input_dict) def test_merge_one_dict(self): """Merge one dict should result in the same dict value """ input_and_expected = 
diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py
index aa3ecfa7..1fff7f60 100644
--- a/swh/deposit/tests/test_utils.py
+++ b/swh/deposit/tests/test_utils.py
@@ -1,132 +1,182 @@
-# Copyright (C) 2018 The Software Heritage developers
+# Copyright (C) 2018-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import unittest

+from unittest.mock import patch
+
 from swh.deposit import utils


 class UtilsTestCase(unittest.TestCase):
     """Utils library

     """
     def test_merge(self):
         """Calling utils.merge on dicts should merge without losing
            information

         """
         d0 = {
             'author': 'someone',
             'license': [['gpl2']],
             'a': 1
         }
         d1 = {
             'author': ['author0', {'name': 'author1'}],
             'license': [['gpl3']],
             'b': {
                 '1': '2'
             }
         }
         d2 = {
             'author': map(lambda x: x, ['else']),
             'license': 'mit',
             'b': {
                 '2': '3',
             }
         }
         d3 = {
             'author': (v for v in ['no one']),
         }

         actual_merge = utils.merge(d0, d1, d2, d3)

         expected_merge = {
             'a': 1,
             'license': [['gpl2'], ['gpl3'], 'mit'],
             'author': [
                 'someone', 'author0', {'name': 'author1'}, 'else', 'no one'],
             'b': {
                 '1': '2',
                 '2': '3',
             }
         }
         self.assertEqual(actual_merge, expected_merge)

     def test_merge_2(self):
         d0 = {
             'license': 'gpl2',
             'runtime': {
                 'os': 'unix derivative'
             }
         }
         d1 = {
             'license': 'gpl3',
             'runtime': 'GNU/Linux'
         }
         expected = {
             'license': ['gpl2', 'gpl3'],
             'runtime': [
                 {
                     'os': 'unix derivative'
                 },
                 'GNU/Linux'
             ],
         }
         actual = utils.merge(d0, d1)
         self.assertEqual(actual, expected)

     def test_merge_edge_cases(self):
         input_dict = {
             'license': ['gpl2', 'gpl3'],
             'runtime': [
                 {
                     'os': 'unix derivative'
                 },
                 'GNU/Linux'
             ],
         }
         # against empty dict
         actual = utils.merge(input_dict, {})
         self.assertEqual(actual, input_dict)

         # against oneself
         actual = utils.merge(input_dict, input_dict, input_dict)
         self.assertEqual(actual, input_dict)

     def test_merge_one_dict(self):
         """Merge one dict should result in the same dict value

         """
         input_and_expected = {'anything': 'really'}
         actual = utils.merge(input_and_expected)
         self.assertEqual(actual, input_and_expected)

     def test_merge_raise(self):
         """Calling utils.merge with any non-dict argument should raise

         """
         d0 = {
             'author': 'someone',
             'a': 1
         }
         d1 = ['not a dict']

         with self.assertRaises(ValueError):
             utils.merge(d0, d1)

         with self.assertRaises(ValueError):
             utils.merge(d1, d0)

         with self.assertRaises(ValueError):
             utils.merge(d1)

         self.assertEqual(utils.merge(d0), d0)
+
+
+@patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x)
+def test_normalize_date_0(mock_normalize):
+    """When date is a list, choose the first date and normalize it
+
+    Note: We do not test swh.model.identifiers which is already tested
+    in swh.model
+
+    """
+    actual_date = utils.normalize_date(['2017-10-12', 'date1'])
+
+    expected_date = '2017-10-12 00:00:00'
+
+    assert str(actual_date) == expected_date
+
+
+@patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x)
+def test_normalize_date_1(mock_normalize):
+    """Providing a date in a reasonable format, everything is fine
+
+    Note: We do not test swh.model.identifiers which is already tested
+    in swh.model
+
+    """
+    actual_date = utils.normalize_date('2018-06-11 17:02:02')
+
+    # a fully specified date round-trips through dateutil unchanged
+    expected_date = '2018-06-11 17:02:02'
+
+    assert str(actual_date) == expected_date
+
+
+@patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x)
+def test_normalize_date_doing_irrelevant_stuff(mock_normalize):
+    """Providing a date in an unknown format, it's completely off
+
+    Note: We do not test swh.model.identifiers which is already tested
+    in swh.model
+
+    """
+    actual_date = utils.normalize_date('2017')
+
+    # dateutil completes the bare year with the current month and day,
+    # so this expectation only holds on the day the test was written
+    expected_date = '2017-04-16 00:00:00'
+
+    assert str(actual_date) == expected_date
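
The added tests stub `normalize_timestamp` with an identity `side_effect`, so only the list election and the dateutil parsing inside `normalize_date` are exercised, as the docstrings note. The same pattern outside a test runner, as a minimal sketch with an illustrative date:

    from unittest.mock import patch

    from swh.deposit import utils

    # Patch the name as imported into swh.deposit.utils, not in swh.model:
    with patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x):
        # Only the parsing layer runs; the result is the bare datetime.
        assert str(utils.normalize_date('2018-06-11 17:02:02')) == \
            '2018-06-11 17:02:02'
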
""" if isinstance(value, (list, map, GeneratorType)): vals = value else: vals = [value] for v in vals: if v in existing_val: continue existing_val.append(v) return existing_val d = {} for data in dicts: if not isinstance(data, dict): raise ValueError( 'dicts is supposed to be a variable arguments of dict') for key, value in data.items(): existing_val = d.get(key) if not existing_val: d[key] = value continue if isinstance(existing_val, (list, map, GeneratorType)): new_val = _extend(existing_val, value) elif isinstance(existing_val, dict): if isinstance(value, dict): new_val = merge(existing_val, value) else: new_val = _extend([existing_val], value) else: new_val = _extend([existing_val], value) d[key] = new_val return d + + +def normalize_date(date): + """Normalize date fields as expected by swh workers. + + If date is a list, elect arbitrarily the first element of that + list + + If date is (then) a string, parse it through + dateutil.parser.parse to extract a datetime. + + Then normalize it through + swh.model.identifiers.normalize_timestamp. + + Returns + The swh date object + + """ + if isinstance(date, list): + date = date[0] + if isinstance(date, str): + date = parser.parse(date) + + return normalize_timestamp(date)