diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py
index 307fada2..136e3bdf 100644
--- a/swh/deposit/api/private/__init__.py
+++ b/swh/deposit/api/private/__init__.py
@@ -1,91 +1,87 @@
-# Copyright (C) 2017-2021  The Software Heritage developers
+# Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, Optional, Tuple
 
 from rest_framework.permissions import AllowAny
 from rest_framework.views import APIView
 
-from swh.deposit import utils
-
 from ...config import METADATA_TYPE, APIConfig
 from ...models import Deposit, DepositRequest
 
 
 class DepositReadMixin:
     """Deposit Read mixin
 
     """
 
     def _deposit_requests(self, deposit: Deposit, request_type: str):
         """Given a deposit, yields its associated deposit_request
 
         Args:
             deposit: Deposit to list requests for
             request_type: 'archive' or 'metadata'
 
         Yields:
-            deposit requests of type request_type associated to the deposit
+            deposit requests of type request_type associated with the
+            deposit, most recent first
 
         """
         deposit_requests = DepositRequest.objects.filter(
             type=request_type, deposit=deposit
-        ).order_by("id")
+        ).order_by("-id")
 
         for deposit_request in deposit_requests:
             yield deposit_request
 
-    def _metadata_get(self, deposit: Deposit) -> Tuple[Dict[str, Any], List[str]]:
-        """Given a deposit, retrieve all metadata requests into one Dict and
-        returns both that aggregated metadata dict and the list of
-        raw_metdadata.
+    def _metadata_get(self, deposit: Deposit) -> Tuple[Dict[str, Any], Optional[bytes]]:
+        """Given a deposit, retrieve its most recent metadata request and
+        return both that request's metadata dict and its raw metadata, or
+        ({}, None) if no request carries raw metadata.
 
         Args:
             deposit: The deposit instance to extract metadata from
 
         Returns:
-            Tuple of aggregated metadata dict, list of raw_metadata
+            Tuple of last metadata dict and last raw_metadata
 
         """
-        metadata: List[Dict[str, Any]] = []
-        raw_metadata: List[str] = []
         for deposit_request in self._deposit_requests(
             deposit, request_type=METADATA_TYPE
         ):
-            metadata.append(deposit_request.metadata)
-            raw_metadata.append(deposit_request.raw_metadata)
+            if deposit_request.raw_metadata is not None:
+                return (deposit_request.metadata, deposit_request.raw_metadata)
 
-        aggregated_metadata = utils.merge(*metadata)
-        return (aggregated_metadata, raw_metadata)
+        return ({}, None)
 
 
 class APIPrivateView(APIConfig, APIView):
     """Mixin intended as private api (so no authentication) based API view
     (for the private ones).
 
     """
 
     def __init__(self):
         super().__init__()
         self.authentication_classes = ()
         self.permission_classes = (AllowAny,)
 
     def checks(self, req, collection_name, deposit=None):
         """Override default checks implementation to allow empty collection.
 
         """
         headers = self._read_headers(req)
         self.additional_checks(req, headers, collection_name, deposit)
 
         return {"headers": headers}
 
     def get(
         self, request, collection_name=None, deposit_id=None, *args, **kwargs,
     ):
         return super().get(request, collection_name, deposit_id)
 
     def put(
         self, request, collection_name=None, deposit_id=None, *args, **kwargs,
     ):
         return super().put(request, collection_name, deposit_id)
""" archive_path = archive_request.archive.path if not known_archive_format(archive_path): return False, MANDATORY_ARCHIVE_UNSUPPORTED try: if zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path) as zipfile_: files = zipfile_.namelist() elif tarfile.is_tarfile(archive_path): with tarfile.open(archive_path) as tarfile_: files = tarfile_.getnames() else: return False, MANDATORY_ARCHIVE_UNSUPPORTED except Exception: return False, MANDATORY_ARCHIVE_UNREADABLE if len(files) > 1: return True, None element = files[0] if PATTERN_ARCHIVE_EXTENSION.match(element): # archive in archive! return False, MANDATORY_ARCHIVE_INVALID return True, None def process_get( self, req: Request, collection_name: str, deposit: Deposit ) -> Tuple[int, Dict, str]: """Trigger the checks on the deposit archives and then on the deposit metadata. If any problems (or warnings) are raised, the deposit status and status detail are updated accordingly. If all checks are ok, the deposit status is updated to the 'verified' status (details updated with warning if any) and a loading task is scheduled for the deposit to be ingested. Otherwise, the deposit is marked as 'rejected' with the error details. A json response is returned to the caller with the deposit checks. Args: req: Client request collection_name: Collection owning the deposit deposit: Deposit concerned by the reading Returns: Tuple (status, json response, content-type) """ metadata, _ = self._metadata_get(deposit) details_dict: Dict = {} # will check each deposit's associated request (both of type # archive and metadata) for errors archives_status_ok, details = self._check_deposit_archives(deposit) if not archives_status_ok: assert details is not None details_dict.update(details) metadata_status_ok, details = check_metadata(metadata) # Ensure in case of error, we do have the rejection details assert metadata_status_ok or (not metadata_status_ok and details is not None) # we can have warnings even if checks are ok (e.g. 
missing suggested field) details_dict.update(details or {}) deposit_status_ok = archives_status_ok and metadata_status_ok # if any details_dict arose, the deposit is rejected deposit.status = ( DEPOSIT_STATUS_VERIFIED if deposit_status_ok else DEPOSIT_STATUS_REJECTED ) response: Dict = { "status": deposit.status, } if details_dict: deposit.status_detail = details_dict response["details"] = details_dict # Deposit ok, then we schedule the deposit loading task (if not already done) if deposit_status_ok and not deposit.load_task_id and self.config["checks"]: url = deposit.origin_url task = create_oneshot_task_dict( "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 ) load_task_id = self.scheduler.create_tasks([task])[0]["id"] deposit.load_task_id = load_task_id deposit.save() return status.HTTP_200_OK, response, "application/json" diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index a87d3db2..ced1c912 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,207 +1,207 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from contextlib import contextmanager import os import shutil import tempfile from typing import Any, Dict, Tuple from rest_framework import status from swh.core import tarball from swh.deposit.utils import normalize_date from swh.model.hashutil import hash_to_hex from swh.model.model import MetadataAuthorityType from swh.model.swhids import CoreSWHID from . import APIPrivateView, DepositReadMixin from ...config import ARCHIVE_TYPE, SWH_PERSON from ...models import Deposit from ..common import APIGet @contextmanager def aggregate_tarballs(extraction_dir, archive_paths): """Aggregate multiple tarballs into one and returns this new archive's path. Args: extraction_dir (path): Path to use for the tarballs computation archive_paths ([str]): Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) """ # rebuild one zip archive from (possibly) multiple ones os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir) # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) # uncompress in a temporary location all archives for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones temp_tarpath = shutil.make_archive( aggregated_tarball_rootdir, "tar", aggregated_tarball_rootdir ) # can already clean up temporary directory shutil.rmtree(aggregated_tarball_rootdir) try: yield temp_tarpath finally: shutil.rmtree(dir_path) class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ def __init__(self): super().__init__() self.extraction_dir = self.config["extraction_dir"] if not os.path.exists(self.extraction_dir): os.makedirs(self.extraction_dir) def process_get( self, request, collection_name: str, deposit: Deposit ) -> Tuple[int, Any, str]: """Build a unique tarball from the multiple received and stream that content to the client. 
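For reviewers, the "single archive file" rule that `_check_archive` enforces can be reproduced standalone: an archive whose only member is itself an archive is exactly what `MANDATORY_ARCHIVE_INVALID` rejects. A sketch, assuming a POSIX tmpdir and with illustrative file names:

    import tarfile
    import tempfile

    # Build a tarball containing nothing but another archive: this is the
    # "archive in archive" shape flagged as MANDATORY_ARCHIVE_INVALID.
    with tempfile.NamedTemporaryFile(suffix=".zip") as inner, \
         tempfile.NamedTemporaryFile(suffix=".tar") as outer:
        with tarfile.open(outer.name, "w") as tar:
            tar.add(inner.name, arcname="payload.zip")
        with tarfile.open(outer.name) as tar:
            files = tar.getnames()
        # a single member whose name matches PATTERN_ARCHIVE_EXTENSION
        assert files == ["payload.zip"]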
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
index a87d3db2..ced1c912 100644
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,207 +1,207 @@
 # Copyright (C) 2017-2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from contextlib import contextmanager
 import os
 import shutil
 import tempfile
 from typing import Any, Dict, Tuple
 
 from rest_framework import status
 
 from swh.core import tarball
 from swh.deposit.utils import normalize_date
 from swh.model.hashutil import hash_to_hex
 from swh.model.model import MetadataAuthorityType
 from swh.model.swhids import CoreSWHID
 
 from . import APIPrivateView, DepositReadMixin
 from ...config import ARCHIVE_TYPE, SWH_PERSON
 from ...models import Deposit
 from ..common import APIGet
 
 
 @contextmanager
 def aggregate_tarballs(extraction_dir, archive_paths):
     """Aggregate multiple tarballs into one and returns this new archive's
        path.
 
     Args:
         extraction_dir (path): Path to use for the tarballs computation
         archive_paths ([str]): Deposit's archive paths
 
     Returns:
         Tuple (directory to clean up, archive path (aggregated or not))
 
     """
     # rebuild one zip archive from (possibly) multiple ones
     os.makedirs(extraction_dir, 0o755, exist_ok=True)
     dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir)
 
     # root folder to build an aggregated tarball
     aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate")
     os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)
 
     # uncompress in a temporary location all archives
     for archive_path in archive_paths:
         tarball.uncompress(archive_path, aggregated_tarball_rootdir)
 
     # Aggregate into one big tarball the multiple smaller ones
     temp_tarpath = shutil.make_archive(
         aggregated_tarball_rootdir, "tar", aggregated_tarball_rootdir
     )
     # can already clean up temporary directory
     shutil.rmtree(aggregated_tarball_rootdir)
 
     try:
         yield temp_tarpath
     finally:
         shutil.rmtree(dir_path)
 
 
 class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin):
     """Dedicated class to read a deposit's raw archives content.
 
     Only GET is supported.
 
     """
 
     def __init__(self):
         super().__init__()
         self.extraction_dir = self.config["extraction_dir"]
         if not os.path.exists(self.extraction_dir):
             os.makedirs(self.extraction_dir)
 
     def process_get(
         self, request, collection_name: str, deposit: Deposit
     ) -> Tuple[int, Any, str]:
         """Build a unique tarball from the multiple received and stream that
            content to the client.
 
         Args:
             request (Request):
             collection_name: Collection owning the deposit
             deposit: Deposit concerned by the reading
 
         Returns:
             Tuple status, stream of content, content-type
 
         """
         archive_paths = [
             r.archive.path
             for r in self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)
         ]
         return (
             status.HTTP_200_OK,
             aggregate_tarballs(self.extraction_dir, archive_paths),
             "swh/generator",
         )
 
 
 class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin):
     """Class in charge of aggregating metadata on a deposit.
 
     """
 
     def _normalize_dates(self, deposit, metadata):
         """Normalize the date to use as a tuple of author date, committer date
            from the incoming metadata.
 
         Args:
             deposit (Deposit): Deposit model representation
             metadata (Dict): Metadata dict representation
 
         Returns:
             Tuple of author date, committer date. Those dates are
             swh normalized.
 
         """
         commit_date = metadata.get("codemeta:datePublished")
         author_date = metadata.get("codemeta:dateCreated")
 
         if author_date and commit_date:
             pass
         elif commit_date:
             author_date = commit_date
         elif author_date:
             commit_date = author_date
         else:
             author_date = deposit.complete_date
             commit_date = deposit.complete_date
 
         return (normalize_date(author_date), normalize_date(commit_date))
 
     def metadata_read(self, deposit: Deposit) -> Dict[str, Any]:
         """Read and aggregate multiple deposit information into one unified
         dictionary.
 
         Args:
             deposit: Deposit to retrieve information from
 
         Returns:
             Dictionary of deposit information read by the deposit loader, with
             the following keys:
 
                 **origin** (Dict): Information about the origin
 
-                **metadata_raw** (List[str]): List of raw metadata received for the
+                **metadata_raw** (str): Raw metadata received for the
                 deposit
 
-                **metadata_dict** (Dict): Deposit aggregated metadata into one dict
+                **metadata_dict** (Dict): Metadata dict from the last deposit request
 
                 **provider** (Dict): the metadata provider information about the
                 deposit client
 
                 **tool** (Dict): the deposit information
 
                 **deposit** (Dict): deposit information relevant to build the revision
                 (author_date, committer_date, etc...)
 
         """
         metadata, raw_metadata = self._metadata_get(deposit)
         author_date, commit_date = self._normalize_dates(deposit, metadata)
 
         if deposit.parent:
             parent_swhid = deposit.parent.swhid
             assert parent_swhid is not None
             swhid = CoreSWHID.from_string(parent_swhid)
             parent_revision = hash_to_hex(swhid.object_id)
             parents = [parent_revision]
         else:
             parents = []
 
         release_notes = metadata.get("codemeta:releaseNotes")
         if isinstance(release_notes, list):
             release_notes = "\n\n".join(release_notes)
         if not release_notes:
             release_notes = None
 
         return {
             "origin": {"type": "deposit", "url": deposit.origin_url},
             "provider": {
                 "provider_name": deposit.client.last_name,
                 "provider_url": deposit.client.provider_url,
                 "provider_type": MetadataAuthorityType.DEPOSIT_CLIENT.value,
                 "metadata": {},
             },
             "tool": self.tool,
             "metadata_raw": raw_metadata,
             "metadata_dict": metadata,
             "deposit": {
                 "id": deposit.id,
                 "client": deposit.client.username,
                 "collection": deposit.collection.name,
                 "author": SWH_PERSON,
                 "author_date": author_date,
                 "committer": SWH_PERSON,
                 "committer_date": commit_date,
                 "revision_parents": parents,
                 "release_notes": release_notes,
             },
         }
 
     def process_get(
         self, request, collection_name: str, deposit: Deposit
     ) -> Tuple[int, Dict, str]:
         data = self.metadata_read(deposit)
         return status.HTTP_200_OK, data if data else {}, "application/json"
diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
index 77d5198f..42a9fc38 100644
--- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
@@ -1,459 +1,433 @@
-# Copyright (C) 2017-2021  The Software Heritage developers
+# Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from django.urls import reverse_lazy as reverse
 from rest_framework import status
 
-from swh.deposit import __version__, utils
+from swh.deposit import __version__
 from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA, SE_IRI, SWH_PERSON
 from swh.deposit.models import Deposit
 from swh.deposit.parsers import parse_xml
 
 PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc"
 
 
 def private_get_raw_url_endpoints(collection, deposit):
     """There are 2 endpoints to check (one with collection, one without)"""
     deposit_id = deposit if isinstance(deposit, int) else deposit.id
     return [
         reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]),
         reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]),
     ]
 
 
 def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata):
     # update deposit's metadata
     response = authenticated_client.post(
         reverse(SE_IRI, args=[collection.name, deposit.id]),
         content_type="application/atom+xml;type=entry",
         data=metadata,
         HTTP_SLUG=deposit.external_id,
         HTTP_IN_PROGRESS=True,
     )
     assert response.status_code == status.HTTP_201_CREATED
     return deposit
 
 
 def test_read_metadata(
     authenticated_client, deposit_collection, partial_deposit, atom_dataset
 ):
     """Private metadata read api to existing deposit should return metadata
 
     """
     deposit = partial_deposit
     deposit.external_id = "some-external-id"
     deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
     deposit.save()
-    metadata_xml_atoms = [
-        atom_dataset[atom_key] for atom_key in ["entry-data2", "entry-data3"]
-    ]
-    metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
-    for atom_xml in metadata_xml_atoms:
-        deposit = update_deposit_with_metadata(
-            authenticated_client, deposit_collection, deposit, atom_xml,
-        )
+    metadata_xml_raw = atom_dataset["entry-data2"]
+    deposit = update_deposit_with_metadata(
+        authenticated_client, deposit_collection, deposit, metadata_xml_raw,
+    )
 
     for url in private_get_raw_url_endpoints(deposit_collection, deposit):
         response = authenticated_client.get(url)
 
         assert response.status_code == status.HTTP_200_OK
         assert response["content-type"] == "application/json"
         actual_data = response.json()
         assert actual_data == {
             "origin": {
                 "type": "deposit",
                 "url": "https://hal-test.archives-ouvertes.fr/some-external-id",
             },
-            "metadata_raw": metadata_xml_atoms,
-            "metadata_dict": utils.merge(*metadata_xml_raws),
+            "metadata_raw": metadata_xml_raw,
+            "metadata_dict": parse_xml(metadata_xml_raw),
             "provider": {
                 "metadata": {},
                 "provider_name": "",
                 "provider_type": "deposit_client",
                 "provider_url": "https://hal-test.archives-ouvertes.fr/",
             },
             "tool": {
                 "configuration": {"sword_version": "2"},
                 "name": "swh-deposit",
                 "version": __version__,
             },
             "deposit": {
                 "author": SWH_PERSON,
                 "committer": SWH_PERSON,
                 "committer_date": {
                     "offset": 0,
                     "timestamp": {"microseconds": 0, "seconds": 1507389428},
                 },
                 "author_date": {
                     "offset": 0,
                     "timestamp": {"microseconds": 0, "seconds": 1507389428},
                 },
                 "client": "test",
                 "id": deposit.id,
                 "collection": "test",
                 "revision_parents": [],
                 "release_notes": "This is the release of October 7th, 2017.",
             },
         }
 
 
 def test_read_metadata_revision_with_parent(
     authenticated_client, deposit_collection, partial_deposit, atom_dataset
 ):
     """Private read metadata to a deposit (with parent) returns metadata
 
     """
     deposit = partial_deposit
     deposit.external_id = "some-external-id"
     deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
     deposit.save()
-    metadata_xml_atoms = [
-        atom_dataset[atom_key] for atom_key in ["entry-data2", "entry-data3"]
-    ]
-    metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
-    for atom_xml in metadata_xml_atoms:
-        deposit = update_deposit_with_metadata(
-            authenticated_client, deposit_collection, deposit, atom_xml,
-        )
+    metadata_xml_raw = atom_dataset["entry-data2"]
+    deposit = update_deposit_with_metadata(
+        authenticated_client, deposit_collection, deposit, metadata_xml_raw,
+    )
 
     rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa"
     swhid = "swh:1:rev:%s" % rev_id
     fake_parent = Deposit(
         swhid=swhid, client=deposit.client, collection=deposit.collection
     )
     fake_parent.save()
     deposit.parent = fake_parent
     deposit.save()
 
     for url in private_get_raw_url_endpoints(deposit_collection, deposit):
         response = authenticated_client.get(url)
 
         assert response.status_code == status.HTTP_200_OK
         assert response["content-type"] == "application/json"
         actual_data = response.json()
         assert actual_data == {
             "origin": {
                 "type": "deposit",
                 "url": "https://hal-test.archives-ouvertes.fr/some-external-id",
             },
-            "metadata_raw": metadata_xml_atoms,
-            "metadata_dict": utils.merge(*metadata_xml_raws),
+            "metadata_raw": metadata_xml_raw,
+            "metadata_dict": parse_xml(metadata_xml_raw),
             "provider": {
                 "metadata": {},
                 "provider_name": "",
                 "provider_type": "deposit_client",
                 "provider_url": "https://hal-test.archives-ouvertes.fr/",
             },
             "tool": {
                 "configuration": {"sword_version": "2"},
                 "name": "swh-deposit",
                 "version": __version__,
             },
             "deposit": {
                 "author": SWH_PERSON,
                 "committer": SWH_PERSON,
                 "committer_date": {
                     "offset": 0,
                     "timestamp": {"microseconds": 0, "seconds": 1507389428},
                 },
                 "author_date": {
                     "offset": 0,
                     "timestamp": {"microseconds": 0, "seconds": 1507389428},
                 },
                 "client": "test",
                 "id": deposit.id,
                 "collection": "test",
                 "revision_parents": [rev_id],
                 "release_notes": "This is the release of October 7th, 2017.",
             },
         }
 
 
 def test_read_metadata_3(
     authenticated_client, deposit_collection, partial_deposit, atom_dataset
 ):
     """date(Created|Published) provided, uses author/committer date
 
     """
     deposit = partial_deposit
     deposit.external_id = "hal-01243065"
     deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
     deposit.save()
-    # add metadata to the deposit with datePublished and dateCreated
-    codemeta_entry_data = (
-        atom_dataset["metadata"]
-        % """
-    <codemeta:dateCreated>2015-04-06T17:08:47+02:00</codemeta:dateCreated>
-    <codemeta:datePublished>2017-05-03T16:08:47+02:00</codemeta:datePublished>
-"""
+    metadata_xml_raw = atom_dataset["entry-data3"]
+    update_deposit_with_metadata(
+        authenticated_client, deposit_collection, deposit, metadata_xml_raw,
     )
-    metadata_xml_atoms = [
-        atom_dataset["entry-data2"],
-        atom_dataset["entry-data3"],
-        codemeta_entry_data,
-    ]
-    metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
-    for atom_xml in metadata_xml_atoms:
-        update_deposit_with_metadata(
-            authenticated_client, deposit_collection, deposit, atom_xml,
-        )
 
     for url in private_get_raw_url_endpoints(deposit_collection, deposit):
         response = authenticated_client.get(url)
 
         assert response.status_code == status.HTTP_200_OK
         assert response["content-type"] == "application/json"
         actual_data = response.json()
         assert actual_data == {
             "origin": {
                 "type": "deposit",
                 "url": "https://hal-test.archives-ouvertes.fr/hal-01243065",
             },
-            "metadata_raw": metadata_xml_atoms,
-            "metadata_dict": utils.merge(*metadata_xml_raws),
+            "metadata_raw": metadata_xml_raw,
+            "metadata_dict": parse_xml(metadata_xml_raw),
             "provider": {
                 "metadata": {},
                 "provider_name": "",
                 "provider_type": "deposit_client",
                 "provider_url": "https://hal-test.archives-ouvertes.fr/",
             },
             "tool": {
                 "configuration": {"sword_version": "2"},
                 "name": "swh-deposit",
                 "version": __version__,
             },
             "deposit": {
                 "author": SWH_PERSON,
                 "committer": SWH_PERSON,
                 "committer_date": {
                     "offset": 120,
                     "timestamp": {"microseconds": 0, "seconds": 1493820527},
                 },
                 "author_date": {
                     "offset": 0,
                     "timestamp": {"microseconds": 0, "seconds": 1507389428},
                 },
                 "client": deposit_collection.name,
                 "id": deposit.id,
                 "collection": deposit_collection.name,
                 "revision_parents": [],
                 "release_notes": "This is the release of October 7th, 2017.",
             },
         }
 
 
 def test_read_metadata_4(
     authenticated_client, deposit_collection, atom_dataset, partial_deposit
 ):
     """dateCreated/datePublished not provided, revision uses complete_date
 
     """
     deposit = partial_deposit
     codemeta_entry_data = atom_dataset["metadata"] % ""
     deposit = update_deposit_with_metadata(
         authenticated_client, deposit_collection, deposit, codemeta_entry_data
     )
 
     # will use the deposit completed date as fallback date
     deposit.complete_date = "2016-04-06"
     deposit.save()
 
     for url in private_get_raw_url_endpoints(deposit_collection, deposit):
         response = authenticated_client.get(url)
 
         assert response.status_code == status.HTTP_200_OK
         assert response["content-type"] == "application/json"
         actual_data = response.json()
 
         assert actual_data == {
             "origin": {"type": "deposit", "url": None,},
-            "metadata_raw": [codemeta_entry_data],
+            "metadata_raw": codemeta_entry_data,
             "metadata_dict": parse_xml(codemeta_entry_data),
             "provider": {
                 "metadata": {},
                 "provider_name": "",
                 "provider_type": "deposit_client",
                 "provider_url": "https://hal-test.archives-ouvertes.fr/",
             },
             "tool": {
                 "configuration": {"sword_version": "2"},
                 "name": "swh-deposit",
                 "version": __version__,
             },
             "deposit": {
                 "author": SWH_PERSON,
                 "committer": SWH_PERSON,
                 "committer_date": {
                     "offset": 0,
                     "timestamp": {"microseconds": 0, "seconds": 1459900800},
                 },
                 "author_date": {
                     "offset": 0,
                     "timestamp": {"microseconds": 0, "seconds": 1459900800},
                 },
                 "client": deposit_collection.name,
                 "id": deposit.id,
                 "collection": deposit_collection.name,
                 "revision_parents": [],
                 "release_notes": None,
             },
         }
 
 
 def test_read_metadata_5(
     authenticated_client, deposit_collection, partial_deposit, atom_dataset
 ):
     """dateCreated/datePublished provided, revision uses author/committer date
 
     If multiple dateCreated provided, the first occurrence (of dateCreated)
     is selected. If multiple datePublished provided, the first occurrence
     (of datePublished) is selected.
 
     """
     deposit = partial_deposit
     # add metadata to the deposit with multiple datePublished/dateCreated
     codemeta_entry_data = (
         atom_dataset["metadata"]
         % """
     <codemeta:dateCreated>2015-04-06T17:08:47+02:00</codemeta:dateCreated>
     <codemeta:datePublished>2017-05-03T16:08:47+02:00</codemeta:datePublished>
     <codemeta:dateCreated>2016-04-06T17:08:47+02:00</codemeta:dateCreated>
     <codemeta:datePublished>2018-05-03T16:08:47+02:00</codemeta:datePublished>
 """
     )
     deposit = update_deposit_with_metadata(
         authenticated_client, deposit_collection, deposit, codemeta_entry_data
     )
 
     for url in private_get_raw_url_endpoints(deposit_collection, deposit):
         response = authenticated_client.get(url)
 
         assert response.status_code == status.HTTP_200_OK
         assert response["content-type"] == "application/json"
         actual_data = response.json()
         assert actual_data == {
             "origin": {
                 "type": "deposit",
                 "url": "https://hal-test.archives-ouvertes.fr/hal-01243065",
             },
-            "metadata_raw": [codemeta_entry_data],
+            "metadata_raw": codemeta_entry_data,
             "metadata_dict": parse_xml(codemeta_entry_data),
             "provider": {
                 "metadata": {},
                 "provider_name": "",
                 "provider_type": "deposit_client",
                 "provider_url": "https://hal-test.archives-ouvertes.fr/",
             },
             "tool": {
                 "configuration": {"sword_version": "2"},
                 "name": "swh-deposit",
                 "version": __version__,
             },
             "deposit": {
                 "author": SWH_PERSON,
                 "committer": SWH_PERSON,
                 "committer_date": {
                     "offset": 120,
                     "timestamp": {"microseconds": 0, "seconds": 1493820527},
                 },
                 "author_date": {
                     "offset": 120,
                     "timestamp": {"microseconds": 0, "seconds": 1428332927},
                 },
                 "client": deposit_collection.name,
                 "id": deposit.id,
                 "collection": deposit_collection.name,
                 "revision_parents": [],
                 "release_notes": None,
             },
         }
 
 
 def test_access_to_nonexisting_deposit_returns_404_response(
     authenticated_client, deposit_collection,
 ):
     """Read unknown collection should return a 404 response
 
     """
     unknown_id = 999
     try:
         Deposit.objects.get(pk=unknown_id)
     except Deposit.DoesNotExist:
         assert True
 
     for url in private_get_raw_url_endpoints(deposit_collection, unknown_id):
         response = authenticated_client.get(url)
         assert response.status_code == status.HTTP_404_NOT_FOUND
         msg = "Deposit %s does not exist" % unknown_id
         assert msg in response.content.decode("utf-8")
 
 
 def test_read_metadata_multiple_release_notes(
     authenticated_client, deposit_collection, partial_deposit, atom_dataset
 ):
     """Private metadata read api to existing deposit should return metadata
 
     """
     deposit = partial_deposit
     deposit.external_id = "some-external-id"
     deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
     deposit.save()
-    metadata_xml_atoms = [
-        atom_dataset[atom_key] for atom_key in ["entry-data-multiple-release-notes"]
-    ]
-    metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
-    for atom_xml in metadata_xml_atoms:
-        deposit = update_deposit_with_metadata(
-            authenticated_client, deposit_collection, deposit, atom_xml,
-        )
+    metadata_xml_raw = atom_dataset["entry-data-multiple-release-notes"]
+    deposit = update_deposit_with_metadata(
+        authenticated_client, deposit_collection, deposit, metadata_xml_raw,
+    )
 
     for url in private_get_raw_url_endpoints(deposit_collection, deposit):
         response = authenticated_client.get(url)
 
         assert response.status_code == status.HTTP_200_OK
         assert response["content-type"] == "application/json"
         actual_data = response.json()
         assert actual_data == {
             "origin": {
                 "type": "deposit",
                 "url": "https://hal-test.archives-ouvertes.fr/some-external-id",
             },
-            "metadata_raw": metadata_xml_atoms,
-            "metadata_dict": utils.merge(*metadata_xml_raws),
+            "metadata_raw": metadata_xml_raw,
+            "metadata_dict": parse_xml(metadata_xml_raw),
             "provider": {
                 "metadata": {},
                 "provider_name": "",
                 "provider_type": "deposit_client",
                 "provider_url": "https://hal-test.archives-ouvertes.fr/",
             },
             "tool": {
                 "configuration": {"sword_version": "2"},
                 "name": "swh-deposit",
                 "version": __version__,
             },
             "deposit": {
                 "author": SWH_PERSON,
                 "committer": SWH_PERSON,
                 "committer_date": {
                     "offset": 0,
                     "timestamp": {"microseconds": 0, "seconds": 1507389428},
                 },
                 "author_date": {
                     "offset": 0,
                     "timestamp": {"microseconds": 0, "seconds": 1507389428},
                 },
                 "client": "test",
                 "id": deposit.id,
                 "collection": "test",
                 "revision_parents": [],
                 "release_notes": (
                     "This is the release of October 7th, 2017.\n\n"
                     "It fixes some bugs."
                 ),
             },
         }
diff --git a/swh/deposit/tests/data/atom/entry-data2.xml b/swh/deposit/tests/data/atom/entry-data2.xml
index 35eb7208..aaeba602 100644
--- a/swh/deposit/tests/data/atom/entry-data2.xml
+++ b/swh/deposit/tests/data/atom/entry-data2.xml
@@ -1,12 +1,18 @@
     https://hal-test.archives-ouvertes.fr/some-external-id
     some awesome author
+
+    another one
+    no one
+    2017-10-07T15:17:08Z
+
+    This is the release of October 7th, 2017.
diff --git a/swh/deposit/tests/data/atom/entry-data3.xml b/swh/deposit/tests/data/atom/entry-data3.xml
index 7780ab1f..a76f2b19 100644
--- a/swh/deposit/tests/data/atom/entry-data3.xml
+++ b/swh/deposit/tests/data/atom/entry-data3.xml
@@ -1,7 +1,13 @@
-
+
+    https://hal-test.archives-ouvertes.fr/some-external-id
+    some awesome author
+    another one
     no one
     2017-10-07T15:17:08Z
     This is the release of October 7th, 2017.
+    <codemeta:dateCreated>2015-04-06T17:08:47+02:00</codemeta:dateCreated>
+    <codemeta:datePublished>2017-05-03T16:08:47+02:00</codemeta:datePublished>
+
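The expected `metadata_dict` values in the tests above are whatever `parse_xml` produces for the same document. Since that helper is a thin wrapper around xmltodict with namespace collapsing, its output shape can be illustrated directly; a sketch using only the two namespaces needed here:

    import xmltodict

    namespaces = {
        "http://www.w3.org/2005/Atom": "atom",
        "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta",
    }
    doc = b"""<entry xmlns="http://www.w3.org/2005/Atom"
      xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0">
      <codemeta:releaseNotes>notes</codemeta:releaseNotes>
    </entry>"""
    data = xmltodict.parse(
        doc, namespaces=namespaces, process_namespaces=True, dict_constructor=dict
    )
    # namespaced element names come back prefixed:
    assert data["atom:entry"]["codemeta:releaseNotes"] == "notes"

swh.deposit.utils.parse_xml additionally unwraps the "atom:entry" key, which is why the tests can compare the endpoint payload against parse_xml(...) directly.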
("swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49", {"origin": None},), ( "swh:1:snp:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=http://blah", {"origin": "http://blah", "path": None}, ), ( "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path", {"origin": None, "path": b"/path"}, ), ( "swh:1:rev:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;visit=swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa { "origin": None, "path": None, "snapshot": CoreSWHID.from_string( "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" ), }, ), ( "swh:1:rel:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa { "origin": None, "path": None, "directory": CoreSWHID.from_string( "swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" ), }, ), ], ) def test_compute_metadata_context(swhid: str, expected_metadata_context): assert expected_metadata_context == utils.compute_metadata_context( QualifiedSWHID.from_string(swhid) ) def test_parse_swh_reference_origin(xml_with_origin_reference): url = "https://url" xml_data = xml_with_origin_reference.format(url=url) metadata = parse_xml(xml_data) actual_origin = utils.parse_swh_reference(metadata) assert actual_origin == url @pytest.fixture def xml_swh_deposit_template(): xml_data = """ {swh_deposit} """ return xml_data.strip() @pytest.mark.parametrize( "xml_ref", [ "", "", "", """""", ], ) def test_parse_swh_reference_empty(xml_swh_deposit_template, xml_ref): xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref) metadata = utils.parse_xml(xml_body) assert utils.parse_swh_reference(metadata) is None @pytest.fixture def xml_with_swhid(atom_dataset): return atom_dataset["entry-data-with-swhid"] @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/", # noqa "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", ], ) def test_parse_swh_reference_swhid(swhid, xml_with_swhid): xml_data = xml_with_swhid.format(swhid=swhid) metadata = utils.parse_xml(xml_data) actual_swhid = utils.parse_swh_reference(metadata) assert actual_swhid is not None expected_swhid = QualifiedSWHID.from_string(swhid) assert actual_swhid == expected_swhid @pytest.mark.parametrize( "invalid_swhid", [ # incorrect length "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235" # noqa # visit qualifier should be a core SWHID with type, "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa # anchor qualifier should be a core SWHID with type one of "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04", # noqa ], ) def 
test_parse_swh_reference_invalid_swhid(invalid_swhid, xml_with_swhid): """Unparsable swhid should raise """ xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid) metadata = utils.parse_xml(xml_invalid_swhid) with pytest.raises(ValidationError): utils.parse_swh_reference(metadata) @pytest.mark.parametrize( "xml_ref", [ "", "", "", ], ) def test_parse_swh_metatada_provenance_empty(xml_swh_deposit_template, xml_ref): xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref) metadata = utils.parse_xml(xml_body) assert utils.parse_swh_metadata_provenance(metadata) is None @pytest.fixture def xml_with_metadata_provenance(atom_dataset): return atom_dataset["entry-data-with-metadata-provenance"] def test_parse_swh_metadata_provenance2(xml_with_metadata_provenance): xml_data = xml_with_metadata_provenance.format(url="https://url.org/metadata/url") metadata = utils.parse_xml(xml_data) actual_url = utils.parse_swh_metadata_provenance(metadata) assert actual_url == "https://url.org/metadata/url" diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py index 0ad2c1af..1adb7258 100644 --- a/swh/deposit/utils.py +++ b/swh/deposit/utils.py @@ -1,291 +1,241 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging -from types import GeneratorType from typing import Any, Dict, Optional, Union import iso8601 import xmltodict from swh.model.exceptions import ValidationError from swh.model.model import TimestampWithTimezone from swh.model.swhids import ExtendedSWHID, ObjectType, QualifiedSWHID logger = logging.getLogger(__name__) def parse_xml(stream, encoding="utf-8"): namespaces = { "http://www.w3.org/2005/Atom": "atom", "http://www.w3.org/2007/app": "app", "http://purl.org/dc/terms/": "dc", "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta", "http://purl.org/net/sword/terms/": "sword", "https://www.softwareheritage.org/schema/2018/deposit": "swh", "https://schema.org/": "schema", } data = xmltodict.parse( stream, encoding=encoding, namespaces=namespaces, process_namespaces=True, dict_constructor=dict, ) if "atom:entry" in data: data = data["atom:entry"] return data -def merge(*dicts): - """Given an iterator of dicts, merge them losing no information. - - Args: - *dicts: arguments are all supposed to be dict to merge into one - - Returns: - dict merged without losing information - - """ - - def _extend(existing_val, value): - """Given an existing value and a value (as potential lists), merge - them together without repetition. 
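The `compute_metadata_context` cases above rely on `QualifiedSWHID` exposing each qualifier as a typed attribute. A short illustration of that API as the tests exercise it (requires swh.model installed; the bytes rendering of `path` is taken from the expectations above):

    from swh.model.swhids import QualifiedSWHID

    swhid = QualifiedSWHID.from_string(
        "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path"
    )
    assert swhid.origin is None    # origin qualifier absent
    assert swhid.path == b"/path"  # path qualifier decodes to bytes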
- - """ - if isinstance(value, (list, map, GeneratorType)): - vals = value - else: - vals = [value] - for v in vals: - if v in existing_val: - continue - existing_val.append(v) - return existing_val - - d = {} - for data in dicts: - if not isinstance(data, dict): - raise ValueError("dicts is supposed to be a variable arguments of dict") - - for key, value in data.items(): - existing_val = d.get(key) - if not existing_val: - d[key] = value - continue - if isinstance(existing_val, (list, map, GeneratorType)): - new_val = _extend(existing_val, value) - elif isinstance(existing_val, dict): - if isinstance(value, dict): - new_val = merge(existing_val, value) - else: - new_val = _extend([existing_val], value) - else: - new_val = _extend([existing_val], value) - d[key] = new_val - return d - - def normalize_date(date): """Normalize date fields as expected by swh workers. If date is a list, elect arbitrarily the first element of that list If date is (then) a string, parse it through dateutil.parser.parse to extract a datetime. Then normalize it through :class:`swh.model.model.TimestampWithTimezone` Returns The swh date object """ if isinstance(date, list): date = date[0] if isinstance(date, str): date = iso8601.parse_date(date) tstz = TimestampWithTimezone.from_dict(date) return { "timestamp": tstz.timestamp.to_dict(), "offset": tstz.offset_minutes(), } def compute_metadata_context(swhid_reference: QualifiedSWHID) -> Dict[str, Any]: """Given a SWHID object, determine the context as a dict. """ metadata_context: Dict[str, Any] = {"origin": None} if swhid_reference.qualifiers(): metadata_context = { "origin": swhid_reference.origin, "path": swhid_reference.path, } snapshot = swhid_reference.visit if snapshot: metadata_context["snapshot"] = snapshot anchor = swhid_reference.anchor if anchor: metadata_context[anchor.object_type.name.lower()] = anchor return metadata_context ALLOWED_QUALIFIERS_NODE_TYPE = ( ObjectType.SNAPSHOT, ObjectType.REVISION, ObjectType.RELEASE, ObjectType.DIRECTORY, ) def parse_swh_metadata_provenance( metadata: Dict, ) -> Optional[Union[QualifiedSWHID, str]]: """Parse swh metadata-provenance within the metadata dict reference if found, None otherwise. .. code-block:: xml https://url.org/metadata/url Args: metadata: result of parsing an Atom document with :func:`parse_xml` Raises: ValidationError in case of invalid xml Returns: Either the metadata provenance url if any or None otherwise """ swh_deposit = metadata.get("swh:deposit") if not swh_deposit: return None swh_metadata_provenance = swh_deposit.get("swh:metadata-provenance") if not swh_metadata_provenance: return None return swh_metadata_provenance.get("schema:url") def parse_swh_reference(metadata: Dict,) -> Optional[Union[QualifiedSWHID, str]]: """Parse swh reference within the metadata dict (or origin) reference if found, None otherwise. .. code-block:: xml or: .. code-block:: xml Args: metadata: result of parsing an Atom document with :func:`parse_xml` Raises: ValidationError in case the swhid referenced (if any) is invalid Returns: Either swhid or origin reference if any. None otherwise. 
""" # noqa swh_deposit = metadata.get("swh:deposit") if not swh_deposit: return None swh_reference = swh_deposit.get("swh:reference") if not swh_reference: return None swh_origin = swh_reference.get("swh:origin") if swh_origin: url = swh_origin.get("@url") if url: return url swh_object = swh_reference.get("swh:object") if not swh_object: return None swhid = swh_object.get("@swhid") if not swhid: return None swhid_reference = QualifiedSWHID.from_string(swhid) if swhid_reference.qualifiers(): anchor = swhid_reference.anchor if anchor: if anchor.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE: error_msg = ( "anchor qualifier should be a core SWHID with type one of " f"{', '.join(t.name.lower() for t in ALLOWED_QUALIFIERS_NODE_TYPE)}" ) raise ValidationError(error_msg) visit = swhid_reference.visit if visit: if visit.object_type != ObjectType.SNAPSHOT: raise ValidationError( f"visit qualifier should be a core SWHID with type snp, " f"not {visit.object_type.value}" ) if ( visit and anchor and visit.object_type == ObjectType.SNAPSHOT and anchor.object_type == ObjectType.SNAPSHOT ): logger.warn( "SWHID use of both anchor and visit targeting " f"a snapshot: {swhid_reference}" ) raise ValidationError( "'anchor=swh:1:snp:' is not supported when 'visit' is also provided." ) return swhid_reference def extended_swhid_from_qualified(swhid: QualifiedSWHID) -> ExtendedSWHID: """Used to get the target of a metadata object from a , as the latter uses a QualifiedSWHID.""" return ExtendedSWHID.from_string(str(swhid).split(";")[0]) def to_header_link(link: str, link_name: str) -> str: """Build a single header link. >>> link_next = to_header_link("next-url", "next") >>> link_next '; rel="next"' >>> ','.join([link_next, to_header_link("prev-url", "prev")]) '; rel="next",; rel="prev"' """ return f'<{link}>; rel="{link_name}"'