diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py --- a/swh/deposit/api/checks.py +++ b/swh/deposit/api/checks.py @@ -15,10 +15,11 @@ """ from typing import Dict, Optional, Tuple +from xml.etree import ElementTree import iso8601 -from swh.deposit.utils import normalize_date, parse_swh_metadata_provenance +from swh.deposit.utils import NAMESPACES, normalize_date, parse_swh_metadata_provenance MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" INVALID_DATE_FORMAT = "Invalid date format" @@ -27,7 +28,7 @@ METADATA_PROVENANCE_KEY = "swh:metadata-provenance" -def check_metadata(metadata: Dict) -> Tuple[bool, Optional[Dict]]: +def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]: """Check metadata for mandatory field presence and date format. Args: @@ -47,9 +48,9 @@ ("atom:author", "codemeta:author"): False, } - for field, value in metadata.items(): - for possible_names in alternate_fields: - if field in possible_names: + for possible_names in alternate_fields: + for possible_name in possible_names: + if metadata.find(possible_name, namespaces=NAMESPACES) is not None: alternate_fields[possible_names] = True continue @@ -68,18 +69,17 @@ fields = [] - commit_date = metadata.get("codemeta:datePublished") - author_date = metadata.get("codemeta:dateCreated") - - if commit_date: + for commit_date in metadata.findall( + "codemeta:datePublished", namespaces=NAMESPACES + ): try: - normalize_date(commit_date) + normalize_date(commit_date.text) except iso8601.iso8601.ParseError: fields.append("codemeta:datePublished") - if author_date: + for author_date in metadata.findall("codemeta:dateCreated", namespaces=NAMESPACES): try: - normalize_date(author_date) + normalize_date(author_date.text) except iso8601.iso8601.ParseError: fields.append("codemeta:dateCreated") diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -9,6 +9,7 @@ import json from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union import uuid +from xml.etree import ElementTree import attr from django.core.files.uploadedfile import UploadedFile @@ -27,7 +28,7 @@ from swh.deposit.api.converters import convert_status_detail from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication from swh.deposit.models import Deposit -from swh.deposit.utils import compute_metadata_context +from swh.deposit.utils import NAMESPACES, compute_metadata_context from swh.model import hashutil from swh.model.model import ( MetadataAuthority, @@ -316,6 +317,8 @@ metadata = deposit_request_data.get(METADATA_KEY) if metadata: + # TODO: remove non-raw metadata? we don't use these anymore except in + # manual queries to the deposit DB raw_metadata = deposit_request_data[RAW_METADATA_KEY] deposit_request = DepositRequest( type=METADATA_TYPE, @@ -498,14 +501,23 @@ archive=filehandler.name, ) - def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]: - """Given a metadata stream, reads the metadata and returns both the - parsed and the raw metadata. + def _read_metadata( + self, metadata_stream + ) -> Tuple[bytes, Dict[str, Any], ElementTree.Element]: + """ + Given a metadata stream, reads the metadata and returns the metadata in three + forms: + * verbatim (as raw bytes), for archival in long-term storage + * parsed as a Python dict, for archival in postgresql's jsonb type + * parsed as ElementTree, to extract information immediately """ raw_metadata = metadata_stream.read() - metadata = parse_xml(raw_metadata) - return raw_metadata, metadata + metadata_dict = parse_xml(raw_metadata) + metadata_tree = ElementTree.fromstring(raw_metadata) + # TODO: remove metadata_dict? we don't use it anymore, except in manual + # queries to the deposit DB + return raw_metadata, metadata_dict, metadata_tree def _multipart_upload( self, @@ -592,7 +604,9 @@ self._check_file_md5sum(filehandler, headers.content_md5sum) try: - raw_metadata, metadata = self._read_metadata(data["application/atom+xml"]) + raw_metadata, metadata_dict, metadata_tree = self._read_metadata( + data["application/atom+xml"] + ) except ParserError: raise DepositError( PARSING_ERROR, @@ -601,7 +615,7 @@ "Please ensure your metadata file is correctly formatted.", ) - self._set_deposit_origin_from_metadata(deposit, metadata, headers) + self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers) # actual storage of data self._deposit_put( @@ -609,7 +623,7 @@ ) deposit_request_data = { ARCHIVE_KEY: filehandler, - METADATA_KEY: metadata, + METADATA_KEY: metadata_dict, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( @@ -628,7 +642,8 @@ self, deposit: Deposit, swhid_reference: Union[str, QualifiedSWHID], - metadata: Dict, + metadata_dict: Dict, + metadata_tree: ElementTree.Element, raw_metadata: bytes, deposit_origin: Optional[str] = None, ) -> Tuple[ExtendedSWHID, Deposit, DepositRequest]: @@ -644,8 +659,10 @@ Args: deposit: Deposit reference swhid_reference: The swhid or the origin to attach metadata information to - metadata: Full dict of metadata to check for validity (parsed out of - raw_metadata) + metadata_dict: Full dict of metadata for storage in the deposit DB as jsonb + (parsed out of raw_metadata) + metadata_tree: Full element tree of metadata to check for validity + (parsed out of raw_metadata) raw_metadata: The actual raw metadata to send in the storage metadata deposit_origin: Optional deposit origin url to use if any (e.g. deposit update scenario provides one) @@ -658,7 +675,7 @@ Tuple of target swhid, deposit, and deposit request """ - metadata_ok, error_details = check_metadata(metadata) + metadata_ok, error_details = check_metadata(metadata_tree) if not metadata_ok: assert error_details, "Details should be set when a failure occurs" raise DepositError( @@ -675,7 +692,7 @@ # replace metadata within the deposit backend deposit_request_data = { - METADATA_KEY: metadata, + METADATA_KEY: metadata_dict, RAW_METADATA_KEY: raw_metadata, } @@ -818,7 +835,9 @@ ) try: - raw_metadata, metadata = self._read_metadata(metadata_stream) + raw_metadata, metadata_dict, metadata_tree = self._read_metadata( + metadata_stream + ) except ParserError: raise DepositError( BAD_REQUEST, @@ -827,16 +846,16 @@ "Please ensure your metadata file is correctly formatted.", ) - if metadata is None: + if metadata_dict is None: raise DepositError( BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc ) - self._set_deposit_origin_from_metadata(deposit, metadata, headers) + self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers) # Determine if we are in the metadata-only deposit case try: - swhid_ref = parse_swh_reference(metadata) + swhid_ref = parse_swh_reference(metadata_tree) except ValidationError as e: raise DepositError( PARSING_ERROR, "Invalid SWHID reference", str(e), @@ -855,7 +874,7 @@ if swhid_ref is not None: deposit.save() # We need a deposit id target_swhid, depo, depo_request = self._store_metadata_deposit( - deposit, swhid_ref, metadata, raw_metadata + deposit, swhid_ref, metadata_dict, metadata_tree, raw_metadata ) deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS @@ -879,7 +898,7 @@ self._deposit_request_put( deposit, - {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, + {METADATA_KEY: metadata_dict, RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) @@ -892,10 +911,14 @@ ) def _set_deposit_origin_from_metadata(self, deposit, metadata, headers): - create_origin = metadata.get("swh:deposit", {}).get("swh:create_origin") - add_to_origin = metadata.get("swh:deposit", {}).get("swh:add_to_origin") + create_origin = metadata.find( + "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES + ) + add_to_origin = metadata.find( + "swh:deposit/swh:add_to_origin/swh:origin", namespaces=NAMESPACES + ) - if create_origin and add_to_origin: + if create_origin is not None and add_to_origin is not None: raise DepositError( BAD_REQUEST, " and are mutually exclusive, " @@ -903,13 +926,13 @@ "origin.", ) - if create_origin: - origin_url = create_origin["swh:origin"]["@url"] + if create_origin is not None: + origin_url = create_origin.attrib["url"] check_client_origin(deposit.client, origin_url) deposit.origin_url = origin_url - if add_to_origin: - origin_url = add_to_origin["swh:origin"]["@url"] + if add_to_origin is not None: + origin_url = add_to_origin.attrib["url"] check_client_origin(deposit.client, origin_url) deposit.parent = ( Deposit.objects.filter( @@ -922,7 +945,10 @@ ) deposit.origin_url = origin_url - if "atom:external_identifier" in metadata: + external_identifier_element = metadata.find( + "atom:external_identifier", namespaces=NAMESPACES + ) + if external_identifier_element is not None: # Deprecated tag. # When clients stopped using it, this should raise an error # unconditionally @@ -934,7 +960,7 @@ " and from now on.", ) - if headers.slug and metadata["atom:external_identifier"] != headers.slug: + if headers.slug and external_identifier_element.text != headers.slug: raise DepositError( BAD_REQUEST, "The tag and Slug header are deprecated, " diff --git a/swh/deposit/api/edit.py b/swh/deposit/api/edit.py --- a/swh/deposit/api/edit.py +++ b/swh/deposit/api/edit.py @@ -108,7 +108,9 @@ ) try: - raw_metadata, metadata = self._read_metadata(request.data) + raw_metadata, metadata_dict, metadata_tree = self._read_metadata( + request.data + ) except ParserError: raise DepositError( BAD_REQUEST, @@ -117,7 +119,7 @@ "Please ensure your metadata file is correctly formatted.", ) - if not metadata: + if not metadata_dict: raise DepositError( BAD_REQUEST, "Empty body request is not supported", @@ -128,7 +130,8 @@ _, deposit, deposit_request = self._store_metadata_deposit( deposit, QualifiedSWHID.from_string(swhid), - metadata, + metadata_dict, + metadata_tree, raw_metadata, deposit.origin_url, ) diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py --- a/swh/deposit/api/private/__init__.py +++ b/swh/deposit/api/private/__init__.py @@ -3,7 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, Optional, Tuple +from typing import Optional from rest_framework.permissions import AllowAny from rest_framework.views import APIView @@ -36,24 +36,20 @@ for deposit_request in deposit_requests: yield deposit_request - def _metadata_get(self, deposit: Deposit) -> Tuple[Dict[str, Any], Optional[bytes]]: - """Given a deposit, retrieve all metadata requests into one Dict and returns both that - aggregated metadata dict and the list of raw_metdadata. + def _metadata_get(self, deposit: Deposit) -> Optional[bytes]: + """Retrieve the last non-empty raw metadata object for that deposit, if any Args: deposit: The deposit instance to extract metadata from - Returns: - Tuple of last metadata dict and last raw_metadata - """ for deposit_request in self._deposit_requests( deposit, request_type=METADATA_TYPE ): if deposit_request.raw_metadata is not None: - return (deposit_request.metadata, deposit_request.raw_metadata) + return deposit_request.raw_metadata - return ({}, None) + return None class APIPrivateView(APIConfig, APIView): diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -8,6 +8,7 @@ from shutil import get_unpack_formats import tarfile from typing import Dict, Optional, Tuple +from xml.etree import ElementTree import zipfile from rest_framework import status @@ -151,7 +152,7 @@ Tuple (status, json response, content-type) """ - metadata, _ = self._metadata_get(deposit) + raw_metadata = self._metadata_get(deposit) details_dict: Dict = {} # will check each deposit's associated request (both of type # archive and metadata) for errors @@ -160,11 +161,18 @@ assert details is not None details_dict.update(details) - metadata_status_ok, details = check_metadata(metadata) - # Ensure in case of error, we do have the rejection details - assert metadata_status_ok or (not metadata_status_ok and details is not None) - # we can have warnings even if checks are ok (e.g. missing suggested field) - details_dict.update(details or {}) + if raw_metadata is None: + metadata_status_ok = False + details_dict["metadata"] = [{"summary": "Missing Atom document"}] + else: + metadata_tree = ElementTree.fromstring(raw_metadata) + metadata_status_ok, details = check_metadata(metadata_tree) + # Ensure in case of error, we do have the rejection details + assert metadata_status_ok or ( + not metadata_status_ok and details is not None + ) + # we can have warnings even if checks are ok (e.g. missing suggested field) + details_dict.update(details or {}) deposit_status_ok = archives_status_ok and metadata_status_ok # if any details_dict arose, the deposit is rejected diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,12 +7,13 @@ import os import shutil import tempfile -from typing import Any, Dict, Tuple +from typing import Any, Dict, Optional, Tuple +from xml.etree import ElementTree from rest_framework import status from swh.core import tarball -from swh.deposit.utils import normalize_date +from swh.deposit.utils import NAMESPACES, normalize_date from swh.model.hashutil import hash_to_hex from swh.model.model import MetadataAuthorityType from swh.model.swhids import CoreSWHID @@ -20,6 +21,7 @@ from . import APIPrivateView, DepositReadMixin from ...config import ARCHIVE_TYPE, SWH_PERSON from ...models import Deposit +from ...utils import parse_xml from ..common import APIGet @@ -105,31 +107,33 @@ """ - def _normalize_dates(self, deposit, metadata): + def _parse_dates( + self, deposit: Deposit, metadata: ElementTree.Element + ) -> Tuple[dict, dict]: """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. - Args: - deposit (Deposit): Deposit model representation - metadata (Dict): Metadata dict representation - Returns: Tuple of author date, committer date. Those dates are swh normalized. """ - commit_date = metadata.get("codemeta:datePublished") - author_date = metadata.get("codemeta:dateCreated") - - if author_date and commit_date: - pass - elif commit_date: - author_date = commit_date - elif author_date: - commit_date = author_date + commit_date_elt = metadata.find("codemeta:datePublished", namespaces=NAMESPACES) + author_date_elt = metadata.find("codemeta:dateCreated", namespaces=NAMESPACES) + + author_date: Any + commit_date: Any + + if author_date_elt is None and commit_date_elt is None: + author_date = commit_date = deposit.complete_date + elif commit_date_elt is None: + author_date = commit_date = author_date_elt.text # type: ignore + elif author_date_elt is None: + author_date = commit_date = commit_date_elt.text else: - author_date = deposit.complete_date - commit_date = deposit.complete_date + author_date = author_date_elt.text + commit_date = commit_date_elt.text + return (normalize_date(author_date), normalize_date(commit_date)) def metadata_read(self, deposit: Deposit) -> Dict[str, Any]: @@ -158,8 +162,14 @@ (author_date, committer_date, etc...) """ - metadata, raw_metadata = self._metadata_get(deposit) - author_date, commit_date = self._normalize_dates(deposit, metadata) + raw_metadata = self._metadata_get(deposit) + author_date: Optional[dict] + commit_date: Optional[dict] + if raw_metadata: + metadata_tree = ElementTree.fromstring(raw_metadata) + author_date, commit_date = self._parse_dates(deposit, metadata_tree) + else: + author_date = commit_date = None if deposit.parent: parent_swhid = deposit.parent.swhid @@ -170,10 +180,15 @@ else: parents = [] - release_notes = metadata.get("codemeta:releaseNotes") - if isinstance(release_notes, list): - release_notes = "\n\n".join(release_notes) - if not release_notes: + release_notes_elements = metadata_tree.findall( + "codemeta:releaseNotes", namespaces=NAMESPACES + ) + release_notes: Optional[str] + if release_notes_elements: + release_notes = "\n\n".join( + element.text for element in release_notes_elements if element.text + ) + else: release_notes = None return { @@ -186,7 +201,7 @@ }, "tool": self.tool, "metadata_raw": raw_metadata, - "metadata_dict": metadata, + "metadata_dict": parse_xml(raw_metadata), "deposit": { "id": deposit.id, "client": deposit.client.username, diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py --- a/swh/deposit/cli/client.py +++ b/swh/deposit/cli/client.py @@ -595,13 +595,15 @@ """Deposit metadata only upload """ + from xml.etree import ElementTree + from swh.deposit.client import PublicApiDepositClient - from swh.deposit.utils import parse_swh_reference, parse_xml + from swh.deposit.utils import parse_swh_reference # Parse to check for a swhid presence within the metadata file with open(metadata_path, "r") as f: metadata_raw = f.read() - actual_swhid = parse_swh_reference(parse_xml(metadata_raw)) + actual_swhid = parse_swh_reference(ElementTree.fromstring(metadata_raw)) if not actual_swhid: raise InputError("A SWHID must be provided for a metadata-only deposit") diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py --- a/swh/deposit/tests/api/test_checks.py +++ b/swh/deposit/tests/api/test_checks.py @@ -3,7 +3,9 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import textwrap from typing import Any, Dict +from xml.etree import ElementTree import pytest @@ -19,42 +21,77 @@ } } +XMLNS = """xmlns="http://www.w3.org/2005/Atom" + xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit" + xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" + xmlns:schema="http://schema.org/" +""" + +PROVENANCE_XML = """ + + + some-metadata-provenance-url + + +""" + +_parameters1 = [ + textwrap.dedent(metadata_ok) + for (metadata_ok,) in [ + ( + f""" + + something + something-else + foo + someone + {PROVENANCE_XML} + + """, + ), + ( + f""" + + something + something-else + foo + no one + {PROVENANCE_XML} + + """, + ), + ( + f""" + + some url + bar + no one + {PROVENANCE_XML} + + """, + ), + ( + f""" + + some url + some id + nar + no one + 2020-12-21 + 2020-12-21 + {PROVENANCE_XML} + + """, + ), + ] +] + @pytest.mark.parametrize( - "metadata_ok", - [ - { - "atom:url": "something", - "atom:external_identifier": "something-else", - "atom:name": "foo", - "atom:author": "someone", - **METADATA_PROVENANCE_DICT, - }, - { - "atom:url": "some url", - "atom:external_identifier": "some id", - "atom:title": "bar", - "atom:author": "no one", - **METADATA_PROVENANCE_DICT, - }, - { - "atom:url": "some url", - "codemeta:name": "bar", - "codemeta:author": "no one", - **METADATA_PROVENANCE_DICT, - }, - { - "atom:url": "some url", - "atom:external_identifier": "some id", - "atom:title": "bar", - "atom:author": "no one", - "codemeta:datePublished": "2020-12-21", - "codemeta:dateCreated": "2020-12-21", - }, - ], + "metadata_ok", _parameters1, ) def test_api_checks_check_metadata_ok(metadata_ok, swh_checks_deposit): - actual_check, detail = check_metadata(metadata_ok) + actual_check, detail = check_metadata(ElementTree.fromstring(metadata_ok)) assert actual_check is True, f"Unexpected result: {detail}" if "swh:deposit" in metadata_ok: # no missing suggested field @@ -71,117 +108,143 @@ } -@pytest.mark.parametrize( - "metadata_ko,expected_summary", - [ +_parameters2 = [ + (textwrap.dedent(metadata_ko), expected_summary) + for (metadata_ko, expected_summary) in [ ( - { - "atom:url": "something", - "atom:external_identifier": "something-else", - "atom:author": "someone", - **METADATA_PROVENANCE_DICT, - }, + f""" + + something + something-else + someone + {PROVENANCE_XML} + + """, { "summary": "Mandatory fields are missing", "fields": ["atom:name or atom:title or codemeta:name"], }, ), ( - { - "atom:url": "something", - "atom:external_identifier": "something-else", - "atom:title": "foobar", - **METADATA_PROVENANCE_DICT, - }, + f""" + + something + something-else + foobar + {PROVENANCE_XML} + + """, { "summary": "Mandatory fields are missing", "fields": ["atom:author or codemeta:author"], }, ), ( - { - "atom:url": "something", - "atom:external_identifier": "something-else", - "codemeta:title": "bar", - "atom:author": "someone", - **METADATA_PROVENANCE_DICT, - }, + f""" + + something + something-else + bar + someone + {PROVENANCE_XML} + + """, { "summary": "Mandatory fields are missing", "fields": ["atom:name or atom:title or codemeta:name"], }, ), ( - { - "atom:url": "something", - "atom:external_identifier": "something-else", - "atom:title": "foobar", - "author": "foo", - **METADATA_PROVENANCE_DICT, - }, + f""" + + something + something-else + foobar + foo + {PROVENANCE_XML} + + """, { "summary": "Mandatory fields are missing", "fields": ["atom:author or codemeta:author"], }, ), ( - { - "atom:url": "something", - "atom:external_identifier": "something-else", - "atom:title": "foobar", - "atom:authorblahblah": "foo", - **METADATA_PROVENANCE_DICT, - }, + f""" + + something + something-else + bar + someone + {PROVENANCE_XML} + + """, { "summary": "Mandatory fields are missing", "fields": ["atom:author or codemeta:author"], }, ), ( + f""" + + something + something-else + bar + someone + 2020-aa-21 + 2020-12-bb + {PROVENANCE_XML} + + """, { - "atom:url": "something", - "atom:external_identifier": "something-else", - "atom:author": "someone", - **METADATA_PROVENANCE_DICT, - }, - { - "summary": "Mandatory fields are missing", - "fields": ["atom:name or atom:title or codemeta:name"], + "summary": "Invalid date format", + "fields": ["codemeta:datePublished", "codemeta:dateCreated"], }, ), - ], -) + ] +] + + +@pytest.mark.parametrize("metadata_ko,expected_summary", _parameters2) def test_api_checks_check_metadata_ko( metadata_ko, expected_summary, swh_checks_deposit ): - actual_check, error_detail = check_metadata(metadata_ko) + actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert error_detail == {"metadata": [expected_summary]} -@pytest.mark.parametrize( - "metadata_ko,expected_invalid_summary", - [ +_parameters3 = [ + (textwrap.dedent(metadata_ko), expected_summary) + for (metadata_ko, expected_summary) in [ ( - { - "atom:url": "some url", - "atom:external_identifier": "some id", - "atom:title": "bar", - "atom:author": "no one", - "codemeta:datePublished": "2020-aa-21", - "codemeta:dateCreated": "2020-12-bb", - }, + f""" + + some url + someid + bar + no one + 2020-aa-21 + 2020-12-bb + + """, { "summary": "Invalid date format", "fields": ["codemeta:datePublished", "codemeta:dateCreated"], }, ), - ], -) + ] +] + + +@pytest.mark.parametrize("metadata_ko,expected_invalid_summary", _parameters3) def test_api_checks_check_metadata_fields_ko_and_missing_suggested_fields( metadata_ko, expected_invalid_summary, swh_checks_deposit ): - actual_check, error_detail = check_metadata(metadata_ko) + actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert error_detail == { "metadata": [expected_invalid_summary] diff --git a/swh/deposit/tests/api/test_collection_add_to_origin.py b/swh/deposit/tests/api/test_collection_add_to_origin.py --- a/swh/deposit/tests/api/test_collection_add_to_origin.py +++ b/swh/deposit/tests/api/test_collection_add_to_origin.py @@ -39,7 +39,7 @@ data=atom_dataset["entry-data-with-add-to-origin"] % origin_url, ) - assert response.status_code == status.HTTP_201_CREATED + assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] @@ -105,7 +105,7 @@ data=atom_dataset["entry-data0"] % origin_url, ) - assert response.status_code == status.HTTP_403_FORBIDDEN + assert response.status_code == status.HTTP_403_FORBIDDEN, response.content.decode() assert b"must start with" in response.content diff --git a/swh/deposit/tests/api/test_collection_post_multipart.py b/swh/deposit/tests/api/test_collection_post_multipart.py --- a/swh/deposit/tests/api/test_collection_post_multipart.py +++ b/swh/deposit/tests/api/test_collection_post_multipart.py @@ -42,7 +42,6 @@ HTTP_IN_PROGRESS="false", ) - print(response.content.decode()) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py --- a/swh/deposit/tests/api/test_deposit_private_check.py +++ b/swh/deposit/tests/api/test_deposit_private_check.py @@ -7,11 +7,7 @@ import pytest from rest_framework import status -from swh.deposit.api.checks import ( - MANDATORY_FIELDS_MISSING, - METADATA_PROVENANCE_KEY, - SUGGESTED_FIELDS_MISSING, -) +from swh.deposit.api.checks import METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING from swh.deposit.api.private.deposit_check import ( MANDATORY_ARCHIVE_INVALID, MANDATORY_ARCHIVE_MISSING, @@ -142,18 +138,9 @@ assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED # metadata check failure - assert len(details["metadata"]) == 2 + assert len(details["metadata"]) == 1 mandatory = details["metadata"][0] - assert mandatory["summary"] == MANDATORY_FIELDS_MISSING - assert set(mandatory["fields"]) == set( - [ - "atom:author or codemeta:author", - "atom:name or atom:title or codemeta:name", - ] - ) - suggested = details["metadata"][1] - assert suggested["summary"] == SUGGESTED_FIELDS_MISSING - assert set(suggested["fields"]) == set([METADATA_PROVENANCE_KEY]) + assert mandatory["summary"] == "Missing Atom document" deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py --- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -10,6 +10,7 @@ import logging import os from unittest.mock import MagicMock +from xml.etree import ElementTree import pytest import yaml @@ -201,7 +202,7 @@ == "meta-prov-url" ) - checks_ok, detail = check_metadata(actual_metadata) + checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml)) assert checks_ok is True assert detail is None @@ -227,7 +228,7 @@ assert actual_metadata.get("codemeta:identifier") is None assert actual_metadata.get("swh:deposit") is None - checks_ok, detail = check_metadata(actual_metadata) + checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml)) assert checks_ok is True assert detail == { diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -3,10 +3,11 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from xml.etree import ElementTree + import pytest from swh.deposit import utils -from swh.deposit.parsers import parse_xml from swh.model.exceptions import ValidationError from swh.model.swhids import CoreSWHID, QualifiedSWHID @@ -103,7 +104,7 @@ def test_parse_swh_reference_origin(xml_with_origin_reference): url = "https://url" xml_data = xml_with_origin_reference.format(url=url) - metadata = parse_xml(xml_data) + metadata = ElementTree.fromstring(xml_data) actual_origin = utils.parse_swh_reference(metadata) assert actual_origin == url @@ -133,7 +134,7 @@ ) def test_parse_swh_reference_empty(xml_swh_deposit_template, xml_ref): xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref) - metadata = utils.parse_xml(xml_body) + metadata = ElementTree.fromstring(xml_body) assert utils.parse_swh_reference(metadata) is None @@ -156,7 +157,7 @@ ) def test_parse_swh_reference_swhid(swhid, xml_with_swhid): xml_data = xml_with_swhid.format(swhid=swhid) - metadata = utils.parse_xml(xml_data) + metadata = ElementTree.fromstring(xml_data) actual_swhid = utils.parse_swh_reference(metadata) assert actual_swhid is not None @@ -182,7 +183,7 @@ """ xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid) - metadata = utils.parse_xml(xml_invalid_swhid) + metadata = ElementTree.fromstring(xml_invalid_swhid) with pytest.raises(ValidationError): utils.parse_swh_reference(metadata) @@ -198,7 +199,7 @@ ) def test_parse_swh_metatada_provenance_empty(xml_swh_deposit_template, xml_ref): xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref) - metadata = utils.parse_xml(xml_body) + metadata = ElementTree.fromstring(xml_body) assert utils.parse_swh_metadata_provenance(metadata) is None @@ -210,7 +211,7 @@ def test_parse_swh_metadata_provenance2(xml_with_metadata_provenance): xml_data = xml_with_metadata_provenance.format(url="https://url.org/metadata/url") - metadata = utils.parse_xml(xml_data) + metadata = ElementTree.fromstring(xml_data) actual_url = utils.parse_swh_metadata_provenance(metadata) diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py --- a/swh/deposit/utils.py +++ b/swh/deposit/utils.py @@ -5,6 +5,7 @@ import logging from typing import Any, Dict, Optional, Union +from xml.etree import ElementTree import iso8601 import xmltodict @@ -16,21 +17,22 @@ logger = logging.getLogger(__name__) -def parse_xml(stream, encoding="utf-8"): - namespaces = { - "http://www.w3.org/2005/Atom": "atom", - "http://www.w3.org/2007/app": "app", - "http://purl.org/dc/terms/": "dc", - "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta", - "http://purl.org/net/sword/terms/": "sword", - "https://www.softwareheritage.org/schema/2018/deposit": "swh", - "http://schema.org/": "schema", - } +NAMESPACES = { + "atom": "http://www.w3.org/2005/Atom", + "app": "http://www.w3.org/2007/app", + "dc": "http://purl.org/dc/terms/", + "codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", + "sword": "http://purl.org/net/sword/terms/", + "swh": "https://www.softwareheritage.org/schema/2018/deposit", + "schema": "http://schema.org/", +} + +def parse_xml(stream, encoding="utf-8"): data = xmltodict.parse( stream, encoding=encoding, - namespaces=namespaces, + namespaces={uri: prefix for (prefix, uri) in NAMESPACES.items()}, process_namespaces=True, dict_constructor=dict, ) @@ -98,7 +100,7 @@ def parse_swh_metadata_provenance( - metadata: Dict, + metadata: ElementTree.Element, ) -> Optional[Union[QualifiedSWHID, str]]: """Parse swh metadata-provenance within the metadata dict reference if found, None otherwise. @@ -121,19 +123,17 @@ Either the metadata provenance url if any or None otherwise """ - - swh_deposit = metadata.get("swh:deposit") - if not swh_deposit: - return None - - swh_metadata_provenance = swh_deposit.get("swh:metadata-provenance") - if not swh_metadata_provenance: - return None - - return swh_metadata_provenance.get("schema:url") + url_element = metadata.find( + "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES + ) + if url_element is not None: + return url_element.text + return None -def parse_swh_reference(metadata: Dict,) -> Optional[Union[QualifiedSWHID, str]]: +def parse_swh_reference( + metadata: ElementTree.Element, +) -> Optional[Union[QualifiedSWHID, str]]: """Parse swh reference within the metadata dict (or origin) reference if found, None otherwise. @@ -155,7 +155,7 @@ Args: - metadata: result of parsing an Atom document with :func:`parse_xml` + metadata: result of parsing an Atom document Raises: ValidationError in case the swhid referenced (if any) is invalid @@ -164,27 +164,21 @@ Either swhid or origin reference if any. None otherwise. """ # noqa - swh_deposit = metadata.get("swh:deposit") - if not swh_deposit: - return None - - swh_reference = swh_deposit.get("swh:reference") - if not swh_reference: - return None - - swh_origin = swh_reference.get("swh:origin") - if swh_origin: - url = swh_origin.get("@url") - if url: - return url + ref_origin = metadata.find( + "swh:deposit/swh:reference/swh:origin[@url]", namespaces=NAMESPACES + ) + if ref_origin is not None: + return ref_origin.attrib["url"] - swh_object = swh_reference.get("swh:object") - if not swh_object: + ref_object = metadata.find( + "swh:deposit/swh:reference/swh:object[@swhid]", namespaces=NAMESPACES + ) + if ref_object is None: return None - - swhid = swh_object.get("@swhid") + swhid = ref_object.attrib["swhid"] if not swhid: return None + swhid_reference = QualifiedSWHID.from_string(swhid) if swhid_reference.qualifiers():