@contextmanager
def aggregate_tarballs(extraction_dir, archive_paths):
    """Aggregate multiple archives into a single zip and yield its path.

    Every archive is uncompressed into the same scratch folder, which is then
    re-packed as one zip archive. The scratch area lives in a fresh temporary
    directory created under ``extraction_dir``; that temporary directory (and
    therefore the yielded archive) is removed when the context exits, whether
    normally or because of an error.

    Args:
        extraction_dir (path): Directory under which the temporary working
            directory is created (created itself if missing)
        archive_paths ([str]): Deposit's archive paths

    Yields:
        Path to the aggregated zip archive. The file only exists for the
        duration of the ``with`` block.

    """
    os.makedirs(extraction_dir, 0o755, exist_ok=True)
    dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir)
    # Everything past this point runs under try/finally so that a failure
    # while uncompressing or re-packing does not leak the temporary directory.
    try:
        # root folder to build an aggregated tarball
        aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate")
        os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)

        # uncompress in a temporary location all archives
        for archive_path in archive_paths:
            tarball.uncompress(archive_path, aggregated_tarball_rootdir)

        # Aggregate into one big archive the multiple smaller ones
        temp_tarpath = shutil.make_archive(
            aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir
        )

        # the uncompressed tree is no longer needed once the zip is built
        shutil.rmtree(aggregated_tarball_rootdir)

        yield temp_tarpath
    finally:
        shutil.rmtree(dir_path)
class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin):
    """Class in charge of aggregating metadata on a deposit.

    """

    def _normalize_dates(self, deposit, metadata):
        """Normalize the date to use as a tuple of author date, committer date
           from the incoming metadata.

        The author date is read from ``codemeta:dateCreated`` and the
        committer date from ``codemeta:datePublished``. When only one of the
        two is provided, it is used for both; when neither is provided, the
        deposit's ``complete_date`` is used for both.

        Args:
            deposit (Deposit): Deposit model representation
            metadata (Dict): Metadata dict representation

        Returns:
            Tuple of author date, committer date. Those dates are
            swh normalized.

        """
        published = metadata.get("codemeta:datePublished")
        created = metadata.get("codemeta:dateCreated")

        # Each date falls back on the other one, then on the completion date
        author_date = created or published or deposit.complete_date
        commit_date = published or created or deposit.complete_date
        return (normalize_date(author_date), normalize_date(commit_date))

    def metadata_read(self, deposit: Deposit) -> Dict[str, Any]:
        """Read and aggregate multiple deposit information into one unified dictionary.

        Args:
            deposit: Deposit to retrieve information from

        Returns:
            Dictionary of deposit information read by the deposit loader, with the
            following keys:

                **origin** (Dict): Information about the origin

                **metadata_raw** (List[str]): List of raw metadata received for the
                  deposit

                **metadata_dict** (Dict): Deposit aggregated metadata into one dict

                **provider** (Dict): the metadata provider information about the
                  deposit client

                **tool** (Dict): the tool information (``self.tool``)

                **deposit** (Dict): deposit information relevant to build the revision
                  (author_date, committer_date, etc...)

        Raises:
            ValueError: if the deposit has a parent deposit without swhid

        """
        metadata, raw_metadata = self._metadata_get(deposit)
        author_date, commit_date = self._normalize_dates(deposit, metadata)

        if deposit.parent:
            parent_swhid = deposit.parent.swhid
            # Explicit check rather than ``assert``: assertions are stripped
            # when python runs with -O
            if parent_swhid is None:
                raise ValueError(
                    "Parent deposit of deposit %s has no swhid" % deposit.id
                )
            swhid = identifiers.parse_swhid(parent_swhid)
            parents = [swhid.object_id]
        else:
            parents = []

        return {
            "origin": {"type": "deposit", "url": deposit.origin_url},
            "provider": {
                "provider_name": deposit.client.last_name,
                "provider_url": deposit.client.provider_url,
                "provider_type": MetadataAuthorityType.DEPOSIT_CLIENT.value,
                "metadata": {},
            },
            "tool": self.tool,
            "metadata_raw": raw_metadata,
            "metadata_dict": metadata,
            "deposit": {
                "id": deposit.id,
                "client": deposit.client.username,
                "collection": deposit.collection.name,
                "author": SWH_PERSON,
                "author_date": author_date,
                "committer": SWH_PERSON,
                "committer_date": commit_date,
                "revision_parents": parents,
            },
        }

    def process_get(
        self, request, collection_name: str, deposit_id: int
    ) -> Tuple[int, Dict, str]:
        """Read the aggregated metadata of the deposit ``deposit_id``.

        Args:
            request (Request):
            collection_name: Collection owning the deposit
            deposit_id: Deposit concerned by the reading

        Returns:
            Tuple status, aggregated metadata dict, content-type

        """
        deposit = Deposit.objects.get(pk=deposit_id)
        data = self.metadata_read(deposit)
        return status.HTTP_200_OK, data or {}, "application/json"
def test_read_metadata(
    authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
    """Private metadata read api to existing deposit should return metadata

    """
    deposit = partial_deposit
    deposit.external_id = "some-external-id"
    deposit.save()

    metadata_xml_atoms = [
        atom_dataset[atom_key] for atom_key in ["entry-data2", "entry-data3"]
    ]
    metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
    for atom_xml in metadata_xml_atoms:
        deposit = update_deposit_with_metadata(
            authenticated_client, deposit_collection, deposit, atom_xml,
        )

    # both endpoints (with and without collection) must return the same data
    for url in private_get_raw_url_endpoints(deposit_collection, deposit):
        response = authenticated_client.get(url)

        assert response.status_code == status.HTTP_200_OK
        # public header access instead of the private ``_headers`` mapping
        assert response["content-type"] == "application/json"
        actual_data = response.json()
        assert actual_data == {
            "origin": {
                "type": "deposit",
                "url": "https://hal-test.archives-ouvertes.fr/some-external-id",
            },
            "metadata_raw": metadata_xml_atoms,
            "metadata_dict": utils.merge(*metadata_xml_raws),
            "provider": {
                "metadata": {},
                "provider_name": "",
                "provider_type": "deposit_client",
                "provider_url": "https://hal-test.archives-ouvertes.fr/",
            },
            "tool": {
                "configuration": {"sword_version": "2"},
                "name": "swh-deposit",
                "version": __version__,
            },
            "deposit": {
                "author": SWH_PERSON,
                "committer": SWH_PERSON,
                "committer_date": {
                    "negative_utc": False,
                    "offset": 0,
                    "timestamp": {"microseconds": 0, "seconds": 1507389428},
                },
                "author_date": {
                    "negative_utc": False,
                    "offset": 0,
                    "timestamp": {"microseconds": 0, "seconds": 1507389428},
                },
                "client": "test",
                "id": deposit.id,
                "collection": "test",
                "revision_parents": [],
            },
        }
utils.merge(*metadata_xml_raws), + "provider": { + "metadata": {}, + "provider_name": "", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + }, + "tool": { + "configuration": {"sword_version": "2"}, + "name": "swh-deposit", + "version": __version__, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [rev_id], }, } def test_read_metadata_3( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """date(Created|Published) provided, uses author/committer date """ deposit = partial_deposit deposit.external_id = "hal-01243065" deposit.save() # add metadata to the deposit with datePublished and dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 """ ) metadata_xml_atoms = [ atom_dataset["entry-data2"], atom_dataset["entry-data3"], codemeta_entry_data, ] metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms] for atom_xml in metadata_xml_atoms: update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, atom_xml, ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" - data = response.json() - assert data == { + actual_data = response.json() + assert actual_data == { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/hal-01243065", }, - "origin_metadata": { - "metadata_raw": metadata_xml_atoms, - "metadata_dict": utils.merge(*metadata_xml_raws), - "provider": { - "metadata": {}, - "provider_name": 
def test_read_metadata_4(
    authenticated_client, deposit_collection, atom_dataset, partial_deposit
):
    """dateCreated/datePublished not provided, revision uses complete_date

    """
    deposit = partial_deposit
    codemeta_entry_data = atom_dataset["metadata"] % ""

    deposit = update_deposit_with_metadata(
        authenticated_client, deposit_collection, deposit, codemeta_entry_data
    )

    # will use the deposit completed date as fallback date
    deposit.complete_date = "2016-04-06"
    deposit.save()

    # both endpoints (with and without collection) must return the same data
    for url in private_get_raw_url_endpoints(deposit_collection, deposit):
        response = authenticated_client.get(url)

        assert response.status_code == status.HTTP_200_OK
        # public header access instead of the private ``_headers`` mapping
        assert response["content-type"] == "application/json"
        actual_data = response.json()

        assert actual_data == {
            "origin": {
                "type": "deposit",
                "url": "https://hal-test.archives-ouvertes.fr/external-id-partial",
            },
            "metadata_raw": [codemeta_entry_data],
            "metadata_dict": parse_xml(codemeta_entry_data),
            "provider": {
                "metadata": {},
                "provider_name": "",
                "provider_type": "deposit_client",
                "provider_url": "https://hal-test.archives-ouvertes.fr/",
            },
            "tool": {
                "configuration": {"sword_version": "2"},
                "name": "swh-deposit",
                "version": __version__,
            },
            "deposit": {
                "author": SWH_PERSON,
                "committer": SWH_PERSON,
                "committer_date": {
                    "negative_utc": False,
                    "offset": 0,
                    "timestamp": {"microseconds": 0, "seconds": 1459900800},
                },
                "author_date": {
                    "negative_utc": False,
                    "offset": 0,
                    "timestamp": {"microseconds": 0, "seconds": 1459900800},
                },
                "client": deposit_collection.name,
                "id": deposit.id,
                "collection": deposit_collection.name,
                "revision_parents": [],
            },
        }
""" deposit = partial_deposit # add metadata to the deposit with multiple datePublished/dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 2016-04-06T17:08:47+02:00 2018-05-03T16:08:47+02:00 """ ) deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" - data = response.json() + actual_data = response.json() - expected_origin = { - "type": "deposit", - "url": "https://hal-test.archives-ouvertes.fr/external-id-partial", - } - - expected_origin_metadata = { + assert actual_data == { + "origin": { + "type": "deposit", + "url": "https://hal-test.archives-ouvertes.fr/external-id-partial", + }, "metadata_raw": [codemeta_entry_data], "metadata_dict": parse_xml(codemeta_entry_data), "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, - } - - expected_deposit_info = { - "author": SWH_PERSON, - "committer": SWH_PERSON, - "committer_date": { - "negative_utc": False, - "offset": 120, - "timestamp": {"microseconds": 0, "seconds": 1493820527}, - }, - "author_date": { - "negative_utc": False, - "offset": 120, - "timestamp": {"microseconds": 0, "seconds": 1428332927}, + "deposit": { + "author": SWH_PERSON, + "committer": SWH_PERSON, + "committer_date": { + "negative_utc": False, + "offset": 120, + "timestamp": {"microseconds": 0, "seconds": 1493820527}, + }, + "author_date": { + "negative_utc": False, + "offset": 120, + "timestamp": {"microseconds": 0, "seconds": 1428332927}, + }, + "client": deposit_collection.name, + "id": deposit.id, + 
def test_access_to_nonexisting_deposit_returns_404_response(
    authenticated_client, deposit_collection,
):
    """Read unknown deposit should return a 404 response

    """
    unknown_id = 999

    # Precondition: the deposit must really be unknown. Unlike a
    # try/except around ``Deposit.objects.get`` ending in ``assert True``,
    # this fails loudly if a deposit with that id exists.
    assert not Deposit.objects.filter(pk=unknown_id).exists()

    # both endpoints (with and without collection) must answer 404
    for url in private_get_raw_url_endpoints(deposit_collection, unknown_id):
        response = authenticated_client.get(url)
        assert response.status_code == status.HTTP_404_NOT_FOUND
        msg = "Deposit with id %s does not exist" % unknown_id
        assert msg in response.content.decode("utf-8")