diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -66,7 +66,8 @@ ParserError, ) from ..models import DepositClient, DepositCollection, DepositRequest -from ..parsers import parse_swh_reference, parse_xml +from ..parsers import parse_xml +from ..utils import parse_swh_reference ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py --- a/swh/deposit/parsers.py +++ b/swh/deposit/parsers.py @@ -9,7 +9,6 @@ """ import logging -from typing import Dict, Optional, Union from xml.parsers.expat import ExpatError from django.conf import settings @@ -17,15 +16,6 @@ from swh.deposit.errors import ParserError from swh.deposit.utils import parse_xml as _parse_xml -from swh.model.exceptions import ValidationError -from swh.model.identifiers import ( - DIRECTORY, - RELEASE, - REVISION, - SNAPSHOT, - SWHID, - parse_swhid, -) logger = logging.getLogger(__name__) @@ -102,93 +92,3 @@ return SWHXMLParser().parse(raw_content) except ExpatError as e: raise ParserError(str(e)) - - -ALLOWED_QUALIFIERS_NODE_TYPE = (SNAPSHOT, REVISION, RELEASE, DIRECTORY) - - -def parse_swh_reference(metadata: Dict) -> Optional[Union[str, SWHID]]: - """Parse swh reference within the metadata dict (or origin) reference if found, None - otherwise. - - - - - - - - or: - - - - - - - Raises: - ValidationError in case the swhid referenced (if any) is invalid - - Returns: - Either swhid or origin reference if any. None otherwise. 
- - """ # noqa - visit_swhid = None - anchor_swhid = None - - swh_deposit = metadata.get("swh:deposit") - if not swh_deposit: - return None - - swh_reference = swh_deposit.get("swh:reference") - if not swh_reference: - return None - - swh_origin = swh_reference.get("swh:origin") - if swh_origin: - url = swh_origin.get("@url") - if url: - return url - - swh_object = swh_reference.get("swh:object") - if not swh_object: - return None - - swhid = swh_object.get("@swhid") - if not swhid: - return None - swhid_reference = parse_swhid(swhid) - - if swhid_reference.metadata: - anchor = swhid_reference.metadata.get("anchor") - if anchor: - anchor_swhid = parse_swhid(anchor) - if anchor_swhid.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE: - error_msg = ( - "anchor qualifier should be a core SWHID with type one of " - f" {', '.join(ALLOWED_QUALIFIERS_NODE_TYPE)}" - ) - raise ValidationError(error_msg) - - visit = swhid_reference.metadata.get("visit") - if visit: - visit_swhid = parse_swhid(visit) - if visit_swhid.object_type != SNAPSHOT: - raise ValidationError( - f"visit qualifier should be a core SWHID with type {SNAPSHOT}" - ) - - if ( - visit_swhid - and anchor_swhid - and visit_swhid.object_type == SNAPSHOT - and anchor_swhid.object_type == SNAPSHOT - ): - logger.warn( - "SWHID use of both anchor and visit targeting " - f"a snapshot: {swhid_reference}" - ) - raise ValidationError( - "'anchor=swh:1:snp:' is not supported when 'visit' is also provided." 
- ) - - return swhid_reference diff --git a/swh/deposit/tests/api/test_parsers.py b/swh/deposit/tests/api/test_parsers.py --- a/swh/deposit/tests/api/test_parsers.py +++ b/swh/deposit/tests/api/test_parsers.py @@ -6,11 +6,7 @@ from collections import OrderedDict import io -import pytest - -from swh.deposit.parsers import SWHXMLParser, parse_swh_reference, parse_xml -from swh.model.exceptions import ValidationError -from swh.model.identifiers import parse_swhid +from swh.deposit.parsers import SWHXMLParser def test_parsing_without_duplicates(): @@ -131,112 +127,3 @@ ] ) assert expected_dict == actual_result - - -@pytest.fixture -def xml_with_origin_reference(): - xml_data = """ - - - - - - - - """ - return xml_data.strip() - - -def test_parse_swh_reference_origin(xml_with_origin_reference): - url = "https://url" - xml_data = xml_with_origin_reference.format(url=url) - metadata = parse_xml(xml_data) - - actual_origin = parse_swh_reference(metadata) - assert actual_origin == url - - -@pytest.fixture -def xml_with_empty_reference(): - xml_data = """ - - - {swh_reference} - - - """ - return xml_data.strip() - - -@pytest.mark.parametrize( - "xml_ref", - [ - "", - "", - "", - """""", - ], -) -def test_parse_swh_reference_empty(xml_with_empty_reference, xml_ref): - xml_body = xml_with_empty_reference.format(swh_reference=xml_ref) - metadata = parse_xml(xml_body) - - assert parse_swh_reference(metadata) is None - - -@pytest.fixture -def xml_with_swhid(atom_dataset): - return atom_dataset["entry-data-with-swhid"] - - -@pytest.mark.parametrize( - "swhid", - [ - "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/", # noqa - "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa - 
"swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa - "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa - "swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa - "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", - ], -) -def test_parse_swh_reference_swhid(swhid, xml_with_swhid): - xml_data = xml_with_swhid.format(swhid=swhid) - metadata = parse_xml(xml_data) - - actual_swhid = parse_swh_reference(metadata) - assert actual_swhid is not None - - expected_swhid = parse_swhid(swhid) - assert actual_swhid == expected_swhid - - -@pytest.mark.parametrize( - "invalid_swhid,error_msg", - [ - ("swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235", "Unexpected length"), - ( - "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa - "visit qualifier should be a core SWHID with type", - ), - ( - "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa - "anchor qualifier should be a core SWHID with type one of", - ), - ( - "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04", # noqa - "anchor=swh:1:snp", - ), - ], -) -def test_parse_swh_reference_invalid_swhid(invalid_swhid, error_msg, xml_with_swhid): - """Unparsable swhid should raise - - """ - xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid) - metadata = parse_xml(xml_invalid_swhid) - - with pytest.raises(ValidationError, match=error_msg): - parse_swh_reference(metadata) diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -9,10 +9,28 @@ import pytest from swh.deposit 
import utils +from swh.deposit.parsers import parse_xml +from swh.model.exceptions import ValidationError from swh.model.identifiers import SWHID, parse_swhid from swh.model.model import MetadataTargetType +@pytest.fixture +def xml_with_origin_reference(): + xml_data = """ + + + + + + + + """ + return xml_data.strip() + + def test_merge(): """Calling utils.merge on dicts should merge without losing information @@ -198,3 +216,96 @@ assert object_type == expected_type assert metadata_context == expected_metadata_context + + +def test_parse_swh_reference_origin(xml_with_origin_reference): + url = "https://url" + xml_data = xml_with_origin_reference.format(url=url) + metadata = parse_xml(xml_data) + + actual_origin = utils.parse_swh_reference(metadata) + assert actual_origin == url + + +@pytest.fixture +def xml_with_empty_reference(): + xml_data = """ + + + {swh_reference} + + + """ + return xml_data.strip() + + +@pytest.mark.parametrize( + "xml_ref", + [ + "", + "", + "", + """""", + ], +) +def test_parse_swh_reference_empty(xml_with_empty_reference, xml_ref): + xml_body = xml_with_empty_reference.format(swh_reference=xml_ref) + metadata = utils.parse_xml(xml_body) + + assert utils.parse_swh_reference(metadata) is None + + +@pytest.fixture +def xml_with_swhid(atom_dataset): + return atom_dataset["entry-data-with-swhid"] + + +@pytest.mark.parametrize( + "swhid", + [ + "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/", # noqa + "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa + "swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa + 
"swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa + "swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa + "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", + ], +) +def test_parse_swh_reference_swhid(swhid, xml_with_swhid): + xml_data = xml_with_swhid.format(swhid=swhid) + metadata = utils.parse_xml(xml_data) + + actual_swhid = utils.parse_swh_reference(metadata) + assert actual_swhid is not None + + expected_swhid = parse_swhid(swhid) + assert actual_swhid == expected_swhid + + +@pytest.mark.parametrize( + "invalid_swhid,error_msg", + [ + ("swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235", "Unexpected length"), + ( + "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa + "visit qualifier should be a core SWHID with type", + ), + ( + "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa + "anchor qualifier should be a core SWHID with type one of", + ), + ( + "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04", # noqa + "anchor=swh:1:snp", + ), + ], +) +def test_parse_swh_reference_invalid_swhid(invalid_swhid, error_msg, xml_with_swhid): + """Unparsable swhid should raise + + """ + xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid) + metadata = utils.parse_xml(xml_invalid_swhid) + + with pytest.raises(ValidationError, match=error_msg): + utils.parse_swh_reference(metadata) diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py --- a/swh/deposit/utils.py +++ b/swh/deposit/utils.py @@ -3,15 +3,27 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import logging from types import GeneratorType -from 
def parse_swh_reference(metadata: Dict) -> Optional[Union[str, SWHID]]:
    """Extract the origin url or the SWHID referenced by a deposit's metadata.

    The reference is looked up under
    ``metadata["swh:deposit"]["swh:reference"]``, which may hold either:

    - ``"swh:origin"``: a mapping whose ``"@url"`` entry is the origin url, or
    - ``"swh:object"``: a mapping whose ``"@swhid"`` entry is a SWHID string,
      optionally carrying ``anchor`` and ``visit`` qualifiers.

    A non-empty origin url takes precedence over a SWHID.

    Args:
        metadata: deposit metadata as a nested dict (presumably the output of
            ``parse_xml``; confirm against callers)

    Raises:
        ValidationError: when the ``anchor`` qualifier has a type outside
            ``ALLOWED_QUALIFIERS_NODE_TYPE``, when the ``visit`` qualifier is
            not a snapshot, or when both qualifiers target a snapshot.
            ``parse_swhid`` may also raise on a malformed SWHID.

    Returns:
        The origin url, the parsed SWHID, or None when no reference is set.

    """
    swh_deposit = metadata.get("swh:deposit")
    if not swh_deposit:
        return None

    swh_reference = swh_deposit.get("swh:reference")
    if not swh_reference:
        return None

    swh_origin = swh_reference.get("swh:origin")
    if swh_origin:
        url = swh_origin.get("@url")
        if url:
            return url
        # an origin element without url falls through to the swhid lookup

    swh_object = swh_reference.get("swh:object")
    if not swh_object:
        return None

    swhid = swh_object.get("@swhid")
    if not swhid:
        return None
    swhid_reference = parse_swhid(swhid)

    qualifiers = swhid_reference.metadata
    if not qualifiers:
        # core SWHID without qualifiers: nothing more to validate
        return swhid_reference

    anchor_swhid = None
    anchor = qualifiers.get("anchor")
    if anchor:
        anchor_swhid = parse_swhid(anchor)
        if anchor_swhid.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE:
            error_msg = (
                "anchor qualifier should be a core SWHID with type one of "
                f" {', '.join(ALLOWED_QUALIFIERS_NODE_TYPE)}"
            )
            raise ValidationError(error_msg)

    visit_swhid = None
    visit = qualifiers.get("visit")
    if visit:
        visit_swhid = parse_swhid(visit)
        if visit_swhid.object_type != SNAPSHOT:
            raise ValidationError(
                f"visit qualifier should be a core SWHID with type {SNAPSHOT}"
            )

    # When set, visit_swhid is necessarily a snapshot (checked just above), so
    # only the anchor type still needs testing here.
    if visit_swhid and anchor_swhid and anchor_swhid.object_type == SNAPSHOT:
        # Logger.warn is a deprecated alias of Logger.warning
        logger.warning(
            "SWHID use of both anchor and visit targeting a snapshot: %s",
            swhid_reference,
        )
        raise ValidationError(
            "'anchor=swh:1:snp:' is not supported when 'visit' is also provided."
        )

    return swhid_reference