diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -66,7 +66,8 @@
ParserError,
)
from ..models import DepositClient, DepositCollection, DepositRequest
-from ..parsers import parse_swh_reference, parse_xml
+from ..parsers import parse_xml
+from ..utils import parse_swh_reference
ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"]
ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"]
diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py
--- a/swh/deposit/parsers.py
+++ b/swh/deposit/parsers.py
@@ -9,7 +9,6 @@
"""
import logging
-from typing import Dict, Optional, Union
from xml.parsers.expat import ExpatError
from django.conf import settings
@@ -17,15 +16,6 @@
from swh.deposit.errors import ParserError
from swh.deposit.utils import parse_xml as _parse_xml
-from swh.model.exceptions import ValidationError
-from swh.model.identifiers import (
- DIRECTORY,
- RELEASE,
- REVISION,
- SNAPSHOT,
- SWHID,
- parse_swhid,
-)
logger = logging.getLogger(__name__)
@@ -102,93 +92,3 @@
return SWHXMLParser().parse(raw_content)
except ExpatError as e:
raise ParserError(str(e))
-
-
-ALLOWED_QUALIFIERS_NODE_TYPE = (SNAPSHOT, REVISION, RELEASE, DIRECTORY)
-
-
-def parse_swh_reference(metadata: Dict) -> Optional[Union[str, SWHID]]:
- """Parse swh reference within the metadata dict (or origin) reference if found, None
- otherwise.
-
-
-
-
-
-
-
- or:
-
-
-
-
-
-
- Raises:
- ValidationError in case the swhid referenced (if any) is invalid
-
- Returns:
- Either swhid or origin reference if any. None otherwise.
-
- """ # noqa
- visit_swhid = None
- anchor_swhid = None
-
- swh_deposit = metadata.get("swh:deposit")
- if not swh_deposit:
- return None
-
- swh_reference = swh_deposit.get("swh:reference")
- if not swh_reference:
- return None
-
- swh_origin = swh_reference.get("swh:origin")
- if swh_origin:
- url = swh_origin.get("@url")
- if url:
- return url
-
- swh_object = swh_reference.get("swh:object")
- if not swh_object:
- return None
-
- swhid = swh_object.get("@swhid")
- if not swhid:
- return None
- swhid_reference = parse_swhid(swhid)
-
- if swhid_reference.metadata:
- anchor = swhid_reference.metadata.get("anchor")
- if anchor:
- anchor_swhid = parse_swhid(anchor)
- if anchor_swhid.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE:
- error_msg = (
- "anchor qualifier should be a core SWHID with type one of "
- f" {', '.join(ALLOWED_QUALIFIERS_NODE_TYPE)}"
- )
- raise ValidationError(error_msg)
-
- visit = swhid_reference.metadata.get("visit")
- if visit:
- visit_swhid = parse_swhid(visit)
- if visit_swhid.object_type != SNAPSHOT:
- raise ValidationError(
- f"visit qualifier should be a core SWHID with type {SNAPSHOT}"
- )
-
- if (
- visit_swhid
- and anchor_swhid
- and visit_swhid.object_type == SNAPSHOT
- and anchor_swhid.object_type == SNAPSHOT
- ):
- logger.warn(
- "SWHID use of both anchor and visit targeting "
- f"a snapshot: {swhid_reference}"
- )
- raise ValidationError(
- "'anchor=swh:1:snp:' is not supported when 'visit' is also provided."
- )
-
- return swhid_reference
diff --git a/swh/deposit/tests/api/test_parsers.py b/swh/deposit/tests/api/test_parsers.py
--- a/swh/deposit/tests/api/test_parsers.py
+++ b/swh/deposit/tests/api/test_parsers.py
@@ -6,11 +6,7 @@
from collections import OrderedDict
import io
-import pytest
-
-from swh.deposit.parsers import SWHXMLParser, parse_swh_reference, parse_xml
-from swh.model.exceptions import ValidationError
-from swh.model.identifiers import parse_swhid
+from swh.deposit.parsers import SWHXMLParser
def test_parsing_without_duplicates():
@@ -131,112 +127,3 @@
]
)
assert expected_dict == actual_result
-
-
-@pytest.fixture
-def xml_with_origin_reference():
- xml_data = """
-
-
-
-
-
-
-
- """
- return xml_data.strip()
-
-
-def test_parse_swh_reference_origin(xml_with_origin_reference):
- url = "https://url"
- xml_data = xml_with_origin_reference.format(url=url)
- metadata = parse_xml(xml_data)
-
- actual_origin = parse_swh_reference(metadata)
- assert actual_origin == url
-
-
-@pytest.fixture
-def xml_with_empty_reference():
- xml_data = """
-
-
- {swh_reference}
-
-
- """
- return xml_data.strip()
-
-
-@pytest.mark.parametrize(
- "xml_ref",
- [
- "",
- "",
- "",
- """""",
- ],
-)
-def test_parse_swh_reference_empty(xml_with_empty_reference, xml_ref):
- xml_body = xml_with_empty_reference.format(swh_reference=xml_ref)
- metadata = parse_xml(xml_body)
-
- assert parse_swh_reference(metadata) is None
-
-
-@pytest.fixture
-def xml_with_swhid(atom_dataset):
- return atom_dataset["entry-data-with-swhid"]
-
-
-@pytest.mark.parametrize(
- "swhid",
- [
- "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/", # noqa
- "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
- "swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
- "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
- "swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
- "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49",
- ],
-)
-def test_parse_swh_reference_swhid(swhid, xml_with_swhid):
- xml_data = xml_with_swhid.format(swhid=swhid)
- metadata = parse_xml(xml_data)
-
- actual_swhid = parse_swh_reference(metadata)
- assert actual_swhid is not None
-
- expected_swhid = parse_swhid(swhid)
- assert actual_swhid == expected_swhid
-
-
-@pytest.mark.parametrize(
- "invalid_swhid,error_msg",
- [
- ("swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235", "Unexpected length"),
- (
- "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa
- "visit qualifier should be a core SWHID with type",
- ),
- (
- "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa
- "anchor qualifier should be a core SWHID with type one of",
- ),
- (
- "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04", # noqa
- "anchor=swh:1:snp",
- ),
- ],
-)
-def test_parse_swh_reference_invalid_swhid(invalid_swhid, error_msg, xml_with_swhid):
- """Unparsable swhid should raise
-
- """
- xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid)
- metadata = parse_xml(xml_invalid_swhid)
-
- with pytest.raises(ValidationError, match=error_msg):
- parse_swh_reference(metadata)
diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py
--- a/swh/deposit/tests/test_utils.py
+++ b/swh/deposit/tests/test_utils.py
@@ -9,10 +9,28 @@
import pytest
from swh.deposit import utils
+from swh.deposit.parsers import parse_xml
+from swh.model.exceptions import ValidationError
from swh.model.identifiers import SWHID, parse_swhid
from swh.model.model import MetadataTargetType
+@pytest.fixture
+def xml_with_origin_reference():
+    """Origin-reference XML template; tests fill it in with ``.format(url=...)``."""
+    return """
+
+
+
+
+
+
+
+    """.strip()
+
+
def test_merge():
"""Calling utils.merge on dicts should merge without losing information
@@ -198,3 +216,96 @@
assert object_type == expected_type
assert metadata_context == expected_metadata_context
+
+
+def test_parse_swh_reference_origin(xml_with_origin_reference):
+    """An origin reference is returned as its plain url."""
+    expected_url = "https://url"
+    raw_xml = xml_with_origin_reference.format(url=expected_url)
+
+    reference = utils.parse_swh_reference(parse_xml(raw_xml))
+    assert reference == expected_url
+
+
+@pytest.fixture
+def xml_with_empty_reference():
+    """XML template whose ``{swh_reference}`` slot is filled in by the test."""
+    return """
+
+
+ {swh_reference}
+
+
+    """.strip()
+
+
+@pytest.mark.parametrize(
+    "xml_ref",
+    [
+        "",
+        "",
+        "",
+        """""",
+    ],
+)
+def test_parse_swh_reference_empty(xml_with_empty_reference, xml_ref):
+    """Missing or empty references parse to None."""
+    document = xml_with_empty_reference.format(swh_reference=xml_ref)
+    reference = utils.parse_swh_reference(utils.parse_xml(document))
+    assert reference is None
+
+
+@pytest.fixture
+def xml_with_swhid(atom_dataset):
+ return atom_dataset["entry-data-with-swhid"]  # tests call .format(swhid=...) on it
+
+
+@pytest.mark.parametrize(
+    "swhid",
+    [
+        "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/",  # noqa
+        "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba",  # noqa
+        "swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba",  # noqa
+        "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba",  # noqa
+        "swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba",  # noqa
+        "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49",
+    ],
+)
+def test_parse_swh_reference_swhid(swhid, xml_with_swhid):
+    """A valid (possibly qualified) SWHID is returned in its parsed form."""
+    expected_reference = parse_swhid(swhid)
+    document = xml_with_swhid.format(swhid=swhid)
+
+    reference = utils.parse_swh_reference(utils.parse_xml(document))
+
+    assert reference is not None
+    assert reference == expected_reference
+
+
+@pytest.mark.parametrize(
+    "invalid_swhid,error_msg",
+    [
+        ("swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235", "Unexpected length"),
+        (
+            "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a",  # noqa
+            "visit qualifier should be a core SWHID with type",
+        ),
+        (
+            "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/",  # noqa
+            "anchor qualifier should be a core SWHID with type one of",
+        ),
+        (
+            "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04",  # noqa
+            "anchor=swh:1:snp",
+        ),
+    ],
+)
+def test_parse_swh_reference_invalid_swhid(invalid_swhid, error_msg, xml_with_swhid):
+    """Malformed SWHIDs and unsupported qualifiers should raise
+
+    """
+    document = xml_with_swhid.format(swhid=invalid_swhid)
+    parsed_metadata = utils.parse_xml(document)
+
+    with pytest.raises(ValidationError, match=error_msg):
+        utils.parse_swh_reference(parsed_metadata)
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -3,15 +3,27 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import logging
from types import GeneratorType
-from typing import Any, Dict, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union
import iso8601
import xmltodict
-from swh.model.identifiers import SWHID, normalize_timestamp, parse_swhid
+from swh.model.exceptions import ValidationError
+from swh.model.identifiers import (
+ DIRECTORY,
+ RELEASE,
+ REVISION,
+ SNAPSHOT,
+ SWHID,
+ normalize_timestamp,
+ parse_swhid,
+)
from swh.model.model import MetadataTargetType
+logger = logging.getLogger(__name__)
+
def parse_xml(stream, encoding="utf-8"):
namespaces = {
@@ -140,3 +152,93 @@
object_type = MetadataTargetType.ORIGIN
return object_type, metadata_context
+
+
+# Object types that an ``anchor`` qualifier is allowed to point at.
+ALLOWED_QUALIFIERS_NODE_TYPE = (SNAPSHOT, REVISION, RELEASE, DIRECTORY)
+
+
+def parse_swh_reference(metadata: Dict) -> Optional[Union[str, SWHID]]:
+    """Return the origin url or SWHID referenced in a deposit's metadata, if any.
+
+    An origin reference takes precedence over an object one:
+
+        <swh:deposit>
+          <swh:reference>
+            <swh:origin url="https://url" />
+          </swh:reference>
+        </swh:deposit>
+
+    or:
+
+        <swh:deposit>
+          <swh:reference>
+            <swh:object swhid="swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49" />
+          </swh:reference>
+        </swh:deposit>
+
+    Raises:
+        ValidationError: if the swhid, or its anchor/visit qualifier, is invalid
+
+    Returns:
+        Either swhid or origin reference if any. None otherwise.
+
+    """  # noqa
+    visit_swhid = None
+    anchor_swhid = None
+
+    swh_deposit = metadata.get("swh:deposit")
+    if not swh_deposit:
+        return None
+
+    swh_reference = swh_deposit.get("swh:reference")
+    if not swh_reference:
+        return None
+
+    swh_origin = swh_reference.get("swh:origin")
+    if swh_origin:
+        url = swh_origin.get("@url")
+        if url:
+            return url
+
+    swh_object = swh_reference.get("swh:object")
+    if not swh_object:
+        return None
+
+    swhid = swh_object.get("@swhid")
+    if not swhid:
+        return None
+    swhid_reference = parse_swhid(swhid)
+
+    if swhid_reference.metadata:
+        anchor = swhid_reference.metadata.get("anchor")
+        if anchor:
+            anchor_swhid = parse_swhid(anchor)
+            if anchor_swhid.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE:
+                error_msg = (
+                    "anchor qualifier should be a core SWHID with type one of "
+                    f"{', '.join(ALLOWED_QUALIFIERS_NODE_TYPE)}"
+                )
+                raise ValidationError(error_msg)
+
+        visit = swhid_reference.metadata.get("visit")
+        if visit:
+            visit_swhid = parse_swhid(visit)
+            if visit_swhid.object_type != SNAPSHOT:
+                raise ValidationError(
+                    f"visit qualifier should be a core SWHID with type {SNAPSHOT}"
+                )
+
+    # visit_swhid, when set, is already known to be a snapshot (enforced above), so
+    # only the anchor type is left to check.
+    if visit_swhid and anchor_swhid and anchor_swhid.object_type == SNAPSHOT:
+        logger.warning(
+            "SWHID use of both anchor and visit targeting a snapshot: %s",
+            swhid_reference,
+        )
+        raise ValidationError(
+            "'anchor=swh:1:snp:' is not supported when 'visit' is also provided."
+        )
+
+    return swhid_reference