@contextmanager
def aggregate_tarballs(extraction_dir, archive_paths):
    """Provide one archive path standing for all the deposit's archives.

    When at most one archive path is given, the first path is yielded
    untouched and nothing is created nor cleaned up (an empty input
    raises IndexError, as it always did).

    When several paths are given, every archive is uncompressed into a
    common temporary folder below `extraction_dir`, that folder is
    compressed as one zip archive and the value returned by
    `tarball.compress` (presumably the new archive's path -- to confirm
    against swh.core.tarball) is yielded. The temporary working
    directory is removed when the context exits, including when the
    extraction or the compression fails.

    Args:
        extraction_dir (path): Path to use for the tarballs computation
        archive_paths ([str]): Deposit's archive paths (any iterable)

    Yields:
        Path to the archive to read (aggregated or not)

    """
    # Materialize so that generators are accepted as well as lists.
    archive_paths = list(archive_paths)

    if len(archive_paths) <= 1:
        # only 1 archive, no need to do fancy actions (and no cleanup step)
        yield archive_paths[0]
        return

    # need to rebuild one archive from multiple ones
    os.makedirs(extraction_dir, 0o755, exist_ok=True)
    dir_path = tempfile.mkdtemp(prefix='swh.deposit-', dir=extraction_dir)
    try:
        # root folder to build an aggregated tarball
        aggregated_tarball_rootdir = os.path.join(dir_path, 'aggregate')
        os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)

        # uncompress in a temporary location all archives
        for archive_path in archive_paths:
            tarball.uncompress(archive_path, aggregated_tarball_rootdir)

        # Aggregate into one big tarball the multiple smaller ones
        temp_tarpath = tarball.compress(
            aggregated_tarball_rootdir + '.zip',
            nature='zip',
            dirpath_or_files=aggregated_tarball_rootdir)

        # can already clean up the uncompressed tree
        shutil.rmtree(aggregated_tarball_rootdir)

        yield temp_tarpath
    finally:
        # Runs on success, on failure inside the `with` body and on
        # failure during extraction/compression: no directory is leaked.
        shutil.rmtree(dir_path)
class SWHDepositReadArchives(SWHGetDepositAPI, SWHPrivateAPIView):
    """Dedicated class to read a deposit's raw archives content.

    Only GET is supported.

    """
    ADDITIONAL_CONFIG = {
        'extraction_dir': ('str', '/tmp/swh-deposit/archive/'),
    }

    def __init__(self):
        super().__init__()
        self.extraction_dir = self.config['extraction_dir']
        # exist_ok avoids the check-then-create race between two
        # instances starting at the same time.
        os.makedirs(self.extraction_dir, exist_ok=True)

    def retrieve_archives(self, deposit_id):
        """Given a deposit identifier, returns its associated archives' path.

        Yields:
            path to deposited archives, ordered by deposit request id

        """
        deposit = Deposit.objects.get(pk=deposit_id)
        deposit_requests = DepositRequest.objects.filter(
            deposit=deposit,
            type=self.deposit_request_types['archive']).order_by('id')

        for deposit_request in deposit_requests:
            yield deposit_request.archive.path

    def process_get(self, req, collection_name, deposit_id):
        """Build a unique tarball from the multiple received and stream that
           content to the client.

        Args:
            req (Request): the incoming request (not read here)
            collection_name (str): Collection owning the deposit
            deposit_id (id): Deposit concerned by the reading

        Returns:
            FileResponse over the archive's binary content, with status
            200 and content type 'application/octet-stream'

        """
        archive_paths = list(self.retrieve_archives(deposit_id))
        with aggregate_tarballs(self.extraction_dir, archive_paths) as path:
            # NOTE(review): the context exits on `return`, so for an
            # aggregated archive the temporary directory is removed
            # while the response still holds the open file object;
            # presumably this relies on an open descriptor remaining
            # readable after removal -- to confirm on the target
            # platform.
            return FileResponse(open(path, 'rb'),
                                status=status.HTTP_200_OK,
                                content_type='application/octet-stream')


class SWHDepositReadMetadata(SWHGetDepositAPI, SWHPrivateAPIView):
    """Class in charge of aggregating metadata on a deposit.

    """
    ADDITIONAL_CONFIG = {
        'provider': ('dict', {
            # 'provider_name': '',  # those are not set since read from the
            # 'provider_url': '',   # deposit's client
            'provider_type': 'deposit_client',
            'metadata': {}
        }),
        'tool': ('dict', {
            'tool_name': 'swh-deposit',
            'tool_version': '0.0.1',
            'tool_configuration': {
                'sword_version': '2'
            }
        })
    }

    def __init__(self):
        super().__init__()
        self.provider = self.config['provider']
        self.tool = self.config['tool']

    def _aggregate_metadata(self, deposit, metadata_requests):
        """Retrieve and aggregates metadata information.

        The requests' metadata dictionaries are merged in iteration
        order: for a key present in several requests, the last request
        wins.

        Args:
            deposit (Deposit): Deposit concerned (not read here)
            metadata_requests ([DepositRequest]): requests whose
                `metadata` attribute is merged

        Returns:
            Dictionary of the merged metadata

        """
        metadata = {}
        for req in metadata_requests:
            metadata.update(req.metadata)

        return metadata

    def aggregate(self, deposit, requests):
        """Aggregate multiple data on deposit into one unified data dictionary.

        Args:
            deposit (Deposit): Deposit concerned by the data aggregation.
            requests ([DepositRequest]): List of associated requests which
                                         need aggregation.

        Returns:
            Dictionary of data representing the deposit to inject in swh,
            with keys 'origin', 'revision', 'occurrence' and
            'origin_metadata'.

        """
        # Retrieve tarballs/metadata information
        metadata = self._aggregate_metadata(deposit, requests)
        client = deposit.client

        # An origin url is not a filesystem path: join it with '/'
        # explicitly rather than with the platform's path separator.
        origin_url = '%s/%s' % (client.url.rstrip('/'), deposit.external_id)

        # revision
        fullname = client.get_full_name()
        complete_date = identifiers.normalize_timestamp(deposit.complete_date)
        revision = {
            'synthetic': True,
            'date': complete_date,
            'committer_date': complete_date,
            # Copies, so that a consumer altering the result can neither
            # corrupt the module-level SWH_PERSON nor change author and
            # committer at once.
            'author': dict(SWH_PERSON),
            'committer': dict(SWH_PERSON),
            'type': 'tar',
            'message': '%s: Deposit %s in collection %s' % (
                fullname, deposit.id, deposit.collection.name),
            'metadata': metadata,
        }
        if deposit.parent:
            revision['parents'] = [deposit.parent.swh_id]

        # metadata provider: completed on a copy so that the configured
        # dictionary is not altered by the deposit being read
        provider = dict(self.provider)
        provider['provider_name'] = client.last_name
        provider['provider_url'] = client.url

        return {
            'origin': {
                'type': 'deposit',
                'url': origin_url,
            },
            'revision': revision,
            'occurrence': {
                'branch': 'master'
            },
            'origin_metadata': {
                'provider': provider,
                'tool': self.tool,
                'metadata': metadata
            },
        }

    def process_get(self, req, collection_name, deposit_id):
        """Read the deposit's aggregated metadata as json.

        Args:
            req (Request): the incoming request (not read here)
            collection_name (str): Collection owning the deposit
            deposit_id (id): Deposit concerned by the reading

        Returns:
            Tuple status, json-serialized data, content-type

        """
        deposit = Deposit.objects.get(pk=deposit_id)
        # Ordered by id, as for the archives: the merge is
        # last-one-wins, so the iteration order must be deterministic.
        requests = DepositRequest.objects.filter(
            deposit=deposit,
            type=self.deposit_request_types['metadata']).order_by('id')

        data = self.aggregate(deposit, requests)
        # Always a json string, even for empty data
        return status.HTTP_200_OK, json.dumps(data or {}), 'application/json'
def setup_django_for(platform):
    """Initialize django (and thus the db access) for a command line tool.

    Used by tools such as swh.deposit.create_user or
    swh.deposit.scheduler.cli.

    Note:
        Do not import any django related module prior to this function
        call. Otherwise, this will raise an
        django.core.exceptions.ImproperlyConfigured error message.

    Args:
        platform (str): the platform the scheduling is running

    Raises:
        ValueError in case of wrong platform inputs.

    """
    if platform in AUTHORIZED_PLATFORMS:
        settings_module = 'swh.deposit.settings.%s' % platform
        # setdefault: an already exported DJANGO_SETTINGS_MODULE is kept
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', settings_module)

        import django
        django.setup()
        return

    raise ValueError('Platform should be one of %s' % AUTHORIZED_PLATFORMS)


class SWHDefaultConfig(SWHConfig):
    """Mixin intended to enrich views with SWH configuration.

    The configuration is parsed from file (with the class's
    ADDITIONAL_CONFIG), then overridden by the keyword arguments given
    at instantiation. A scheduler backend is attached only when the
    'checks' entry is true.

    """
    CONFIG_BASE_FILENAME = 'deposit/server'

    DEFAULT_CONFIG = {
        'max_upload_size': ('int', 209715200),
        'checks': ('bool', True),
    }

    ADDITIONAL_CONFIG = {}

    def __init__(self, **config):
        super().__init__()
        self.config = self.parse_config_file(
            additional_configs=[self.ADDITIONAL_CONFIG])
        # explicit keyword arguments take precedence over the file
        for key, value in config.items():
            self.config[key] = value
        self.log = logging.getLogger('swh.deposit')

        if not self.config['checks']:
            return

        # imported lazily: only needed when checks are activated
        from swh.scheduler.backend import SchedulerBackend
        self.scheduler = SchedulerBackend()
class DepositReadMetadataTest(APITestCase, WithAuthTestCase, BasicTestCase,
                              CommonCreationRoutine):
    """Deposit access to read metadata information on deposit.

    """
    def _get_metadata(self, collection_name, deposit_id):
        """Query the private metadata read endpoint, return the response.

        """
        url = reverse(PRIVATE_GET_DEPOSIT_METADATA,
                      args=[collection_name, deposit_id])
        return self.client.get(url)

    def _expected_meta(self, deposit_id, parents=None):
        """Build the payload expected for a deposit of collection 'hal'
           with external id 'some-external-id'.

        Args:
            deposit_id: identifier of the deposit read
            parents: when given, expected 'parents' of the revision

        """
        metadata = {
            '{http://www.w3.org/2005/Atom}external_identifier':
            'some-external-id'
        }
        revision = {
            'synthetic': True,
            'date': None,
            'committer_date': None,
            'author': SWH_PERSON,
            'committer': SWH_PERSON,
            'type': 'tar',
            'message': ': Deposit %s in collection hal' % deposit_id,
            'metadata': dict(metadata),
        }
        if parents is not None:
            revision['parents'] = parents

        return {
            'origin': {
                'url': 'https://hal.test.fr/some-external-id',
                'type': 'deposit'
            },
            'origin_metadata': {
                'metadata': dict(metadata),
                'provider': {
                    'provider_name': '',
                    'provider_type': 'deposit_client',
                    'provider_url': 'https://hal.test.fr/',
                    'metadata': {}
                },
                'tool': {
                    'tool_name': 'swh-deposit',
                    'tool_version': '0.0.1',
                    'tool_configuration': {
                        'sword_version': '2'
                    }
                }
            },
            'revision': revision,
            'occurrence': {
                'branch': 'master'
            }
        }

    @istest
    def read_metadata(self):
        """Private metadata read api to existing deposit should return metadata

        """
        deposit_id = self.create_deposit_partial()

        response = self._get_metadata(self.collection.name, deposit_id)

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response['Content-Type'], 'application/json')
        data = json.loads(response.content.decode('utf-8'))

        self.assertEqual(data, self._expected_meta(deposit_id))

    @istest
    def read_metadata_revision_with_parent(self):
        """Private read metadata to a deposit (with parent) returns metadata

        """
        swh_id = 'da78a9d4cf1d5d29873693fd496142e3a18c20fa'
        deposit_id1 = self.create_deposit_with_status(
            status=DEPOSIT_STATUS_LOAD_SUCCESS,
            external_id='some-external-id',
            swh_id=swh_id)

        deposit_parent = Deposit.objects.get(pk=deposit_id1)
        self.assertEqual(deposit_parent.swh_id, swh_id)
        self.assertEqual(deposit_parent.external_id, 'some-external-id')
        self.assertEqual(deposit_parent.status, DEPOSIT_STATUS_LOAD_SUCCESS)

        deposit_id = self.create_deposit_partial(
            external_id='some-external-id')

        deposit = Deposit.objects.get(pk=deposit_id)
        self.assertEqual(deposit.external_id, 'some-external-id')
        self.assertEqual(deposit.swh_id, None)
        self.assertEqual(deposit.parent, deposit_parent)
        self.assertEqual(deposit.status, DEPOSIT_STATUS_PARTIAL)

        response = self._get_metadata(self.collection.name, deposit_id)

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response['Content-Type'], 'application/json')
        data = json.loads(response.content.decode('utf-8'))

        self.assertEqual(
            data, self._expected_meta(deposit_id, parents=[swh_id]))

    @istest
    def access_to_nonexisting_deposit_returns_404_response(self):
        """Read unknown deposit should return a 404 response

        """
        unknown_id = '999'

        response = self._get_metadata(self.collection.name, unknown_id)

        self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
        self.assertIn('Deposit with id %s does not exist' % unknown_id,
                      response.content.decode('utf-8'))

    @istest
    def access_to_nonexisting_collection_returns_404_response(self):
        """Read unknown collection should return a 404 response

        """
        collection_name = 'non-existing'
        deposit_id = self.create_deposit_partial()

        response = self._get_metadata(collection_name, deposit_id)

        self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
        self.assertIn('Unknown collection name %s' % collection_name,
                      response.content.decode('utf-8'),)