diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
index 56f1dbd2..b73d6f82 100644
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,229 +1,228 @@
 # Copyright (C) 2017-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import os
 import shutil
 import tempfile
 
 from contextlib import contextmanager
 from django.http import FileResponse
 from rest_framework import status
 
 from swh.core import tarball
 from swh.model import identifiers
 from swh.deposit.utils import normalize_date
 
 from . import DepositReadMixin, SWHPrivateAPIView
 from ...config import SWH_PERSON, ARCHIVE_TYPE
 from ..common import SWHGetDepositAPI
 from ...models import Deposit
 
 
 @contextmanager
 def aggregate_tarballs(extraction_dir, archive_paths):
     """Aggregate multiple tarballs into one and returns this new archive's
        path.
 
     Args:
         extraction_dir (path): Path to use for the tarballs computation
         archive_paths ([str]): Deposit's archive paths
 
-    Returns:
-        Tuple (directory to clean up, archive path (aggregated or not))
+    Yields:
+        Path to the aggregated zip archive (removed when the context exits)
 
     """
     # rebuild one zip archive from (possibly) multiple ones
     os.makedirs(extraction_dir, 0o755, exist_ok=True)
     dir_path = tempfile.mkdtemp(prefix='swh.deposit-', dir=extraction_dir)
 
     # root folder to build an aggregated tarball
     aggregated_tarball_rootdir = os.path.join(dir_path, 'aggregate')
     os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)
 
     # uncompress in a temporary location all archives
     for archive_path in archive_paths:
         tarball.uncompress(archive_path, aggregated_tarball_rootdir)
 
     # Aggregate into one big tarball the multiple smaller ones
-    temp_tarpath = tarball.compress(
-        aggregated_tarball_rootdir + '.zip',
-        nature='zip',
-        dirpath_or_files=aggregated_tarball_rootdir)
-
+    # make_archive adds the '.zip' suffix and returns the archive path
+    temp_tarpath = shutil.make_archive(
+        aggregated_tarball_rootdir, 'zip',
+        aggregated_tarball_rootdir)
     # can already clean up temporary directory
     shutil.rmtree(aggregated_tarball_rootdir)
 
     try:
         yield temp_tarpath
     finally:
         shutil.rmtree(dir_path)
 
 
 class SWHDepositReadArchives(SWHPrivateAPIView, SWHGetDepositAPI,
                              DepositReadMixin):
     """Dedicated class to read a deposit's raw archives content.
 
     Only GET is supported.
 
     """
     ADDITIONAL_CONFIG = {
         'extraction_dir': ('str', '/tmp/swh-deposit/archive/'),
     }
 
     def __init__(self):
         super().__init__()
         self.extraction_dir = self.config['extraction_dir']
         if not os.path.exists(self.extraction_dir):
             os.makedirs(self.extraction_dir)
 
     def process_get(self, req, collection_name, deposit_id):
         """Build a unique tarball from the multiple received and stream that
            content to the client.
 
         Args:
             req (Request):
             collection_name (str): Collection owning the deposit
             deposit_id (id): Deposit concerned by the reading
 
         Returns:
             Tuple status, stream of content, content-type
 
         """
         archive_paths = [r.archive.path for r in self._deposit_requests(
             deposit_id, request_type=ARCHIVE_TYPE)]
         with aggregate_tarballs(self.extraction_dir, archive_paths) as path:
             return FileResponse(open(path, 'rb'),
                                 status=status.HTTP_200_OK,
                                 content_type='application/octet-stream')
 
 
 class SWHDepositReadMetadata(SWHPrivateAPIView, SWHGetDepositAPI,
                              DepositReadMixin):
     """Class in charge of aggregating metadata on a deposit.
 
  """
     ADDITIONAL_CONFIG = {
         'provider': ('dict', {
             # 'provider_name': '', # those are not set since read from the
             # 'provider_url': '', # deposit's client
             'provider_type': 'deposit_client',
             'metadata': {}
         }),
         'tool': ('dict', {
             'name': 'swh-deposit',
             'version': '0.0.1',
             'configuration': {
                 'sword_version': '2'
             }
         })
     }
 
     def __init__(self):
         super().__init__()
         self.provider = self.config['provider']
         self.tool = self.config['tool']
 
     def _normalize_dates(self, deposit, metadata):
         """Normalize the date to use as a tuple of author date, committer date
            from the incoming metadata.
 
         Args:
             deposit (Deposit): Deposit model representation
             metadata (Dict): Metadata dict representation
 
         Returns:
             Tuple of author date, committer date. Those dates are
             swh normalized.
 
         """
         commit_date = metadata.get('codemeta:datePublished')
         author_date = metadata.get('codemeta:dateCreated')
 
         if author_date and commit_date:
             pass
         elif commit_date:
             author_date = commit_date
         elif author_date:
             commit_date = author_date
         else:
             author_date = deposit.complete_date
             commit_date = deposit.complete_date
         return (
             normalize_date(author_date),
             normalize_date(commit_date)
         )
 
     def metadata_read(self, deposit):
         """Read and aggregate multiple data on deposit into one unified data
            dictionary.
 
         Args:
             deposit (Deposit): Deposit concerned by the data aggregation.
 
         Returns:
             Dictionary of data representing the deposit to inject in swh.
 
         """
         metadata = self._metadata_get(deposit)
         # Read information metadata
         data = {
             'origin': {
                 'type': 'deposit',
                 'url': deposit.origin_url,
             }
         }
 
         # revision
 
         fullname = deposit.client.username
         author_committer = SWH_PERSON
 
         # metadata provider
         self.provider['provider_name'] = deposit.client.last_name
         self.provider['provider_url'] = deposit.client.provider_url
 
         revision_type = 'tar'
         revision_msg = '%s: Deposit %s in collection %s' % (
             fullname, deposit.id, deposit.collection.name)
 
         author_date, commit_date = self._normalize_dates(deposit, metadata)
 
         data['revision'] = {
             'synthetic': True,
             'date': author_date,
             'committer_date': commit_date,
             'author': author_committer,
             'committer': author_committer,
             'type': revision_type,
             'message': revision_msg,
             'metadata': metadata,
         }
 
         if deposit.parent:
             swh_persistent_id = deposit.parent.swh_id
             persistent_identifier = identifiers.parse_persistent_identifier(
                 swh_persistent_id)
             parent_revision = persistent_identifier.object_id
 
             data['revision']['parents'] = [parent_revision]
 
         data['branch_name'] = 'master'
         data['origin_metadata'] = {
             'provider': self.provider,
             'tool': self.tool,
             'metadata': metadata
         }
 
         return data
 
     def process_get(self, req, collection_name, deposit_id):
         deposit = Deposit.objects.get(pk=deposit_id)
         data = self.metadata_read(deposit)
         d = {}
         if data:
             d = json.dumps(data)
 
         return status.HTTP_200_OK, d, 'application/json'
diff --git a/swh/deposit/tests/api/test_deposit_private_read_archive.py b/swh/deposit/tests/api/test_deposit_private_read_archive.py
index d41ebef1..23942963 100644
--- a/swh/deposit/tests/api/test_deposit_private_read_archive.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_archive.py
@@ -1,111 +1,86 @@
 # Copyright (C) 2017-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import hashlib
-import shutil
+import io
+import zipfile
 
 from django.urls import reverse
-from os import listdir, path, mkdir
 from rest_framework import status
 
-from swh.core import tarball
 from swh.deposit.config import PRIVATE_GET_RAW_CONTENT, EM_IRI
-
 from swh.deposit.tests.common import create_arborescence_archive
 
 
 PRIVATE_GET_RAW_CONTENT_NC = PRIVATE_GET_RAW_CONTENT + '-nc'
 
 
 def private_get_raw_url_endpoints(collection, deposit):
     """There are 2 endpoints to check (one with collection, one without)"""
     return [
         reverse(PRIVATE_GET_RAW_CONTENT, args=[collection.name, deposit.id]),
         reverse(PRIVATE_GET_RAW_CONTENT_NC, args=[deposit.id])
     ]
 
 
 def test_access_to_existing_deposit_with_one_archive(
         authenticated_client, deposit_collection, complete_deposit,
         sample_archive):
     """Access to deposit should stream a 200 response with its raw content
 
     """
     deposit = complete_deposit
 
     for url in private_get_raw_url_endpoints(deposit_collection, deposit):
         r = authenticated_client.get(url)
         assert r.status_code == status.HTTP_200_OK
         assert r._headers['content-type'][1] == 'application/octet-stream'
 
         # read the stream
         data = b''.join(r.streaming_content)
-        actual_sha1 = hashlib.sha1(data).hexdigest()
-        assert actual_sha1 == sample_archive['sha1sum']
+        # inspect the zip content (member order is not guaranteed)
+        with zipfile.ZipFile(io.BytesIO(data)) as zfile:
+            assert sorted(zfile.namelist()) == ['file1']
+            assert zfile.read('file1') == b'some content in file'
 
 
 def test_access_to_existing_deposit_with_multiple_archives(
         tmp_path, authenticated_client, deposit_collection, partial_deposit,
         sample_archive):
     """Access to deposit should stream a 200 response with its raw contents
 
     """
     deposit = partial_deposit
     archive2 = create_arborescence_archive(
-        tmp_path, 'archive2', 'file2', b'some content in file')
+        tmp_path, 'archive2', 'file2', b'some other content in file')
 
     # Add a second archive to deposit
     update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id])
     response = authenticated_client.post(
         update_uri,
         content_type='application/zip',  # as zip
         data=archive2['data'],
         # + headers
         CONTENT_LENGTH=archive2['length'],
         HTTP_SLUG=deposit.external_id,
         HTTP_CONTENT_MD5=archive2['md5sum'],
         HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
         HTTP_IN_PROGRESS='false',
         HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % (
             archive2['name'], ))
 
     assert response.status_code == status.HTTP_201_CREATED
 
     for url in private_get_raw_url_endpoints(deposit_collection, deposit):
         r = authenticated_client.get(url)
         assert r.status_code == status.HTTP_200_OK
         assert r._headers['content-type'][1] == 'application/octet-stream'
 
         # read the stream
         data = b''.join(r.streaming_content)
-        actual_sha1 = hashlib.sha1(data).hexdigest()
-        check_tarball_consistency(
-            tmp_path, sample_archive, archive2, actual_sha1)
-
-
-def check_tarball_consistency(tmp_path, archive, archive2, actual_sha1):
-    """Check the tarballs are ok
-
-    """
-    workdir = path.join(tmp_path, 'workdir')
-    mkdir(workdir)
-    lst = set(listdir(workdir))
-    assert lst == set()
-    tarball.uncompress(archive['path'], dest=workdir)
-    assert listdir(workdir) == ['file1']
-    tarball.uncompress(archive2['path'], dest=workdir)
-    lst = set(listdir(workdir))
-    assert lst == {'file1', 'file2'}
-
-    new_path = workdir + '.zip'
-    tarball.compress(new_path, 'zip', workdir)
-    with open(new_path, 'rb') as f:
-        h = hashlib.sha1(f.read()).hexdigest()
-
-    assert actual_sha1 == h
-    assert actual_sha1 != archive['sha1sum']
-    assert actual_sha1 != archive2['sha1sum']
-
-    shutil.rmtree(workdir)
+        # inspect the zip content (member order is not guaranteed)
+        with zipfile.ZipFile(io.BytesIO(data)) as zfile:
+            assert sorted(zfile.namelist()) == ['file1', 'file2']
+            assert zfile.read('file1') == b'some content in file'
+            assert zfile.read('file2') == b'some other content in file'