def _child_node(node, key: str):
    """Return ``node[key]`` when ``node`` is a dict holding ``key``, None otherwise.

    Guards the lookups below against nodes that are not dicts (None, a text-only
    element given as a ``str``, a repeated element given as a ``list``), on which
    a plain ``.get`` would raise ``AttributeError``.
    """
    return node.get(key) if isinstance(node, dict) else None


def parse_swh_reference(metadata: Dict) -> Optional[Union[str, SWHID]]:
    """Extract the swh reference (origin url or swhid) declared in the metadata.

    The reference is read from ``metadata["swh:deposit"]["swh:reference"]``. Two
    forms are recognised, checked in this order:

    - an origin reference, ``{"swh:origin": {"@url": <url>}}``, e.g. parsed from::

        <swh:deposit>
          <swh:reference>
            <swh:origin url="https://example.org/project"/>
          </swh:reference>
        </swh:deposit>

    - an object reference, ``{"swh:object": {"@swhid": <swhid>}}``, e.g. parsed
      from::

        <swh:deposit>
          <swh:reference>
            <swh:object swhid="swh:1:dir:..."/>
          </swh:reference>
        </swh:deposit>

    Args:
        metadata: deposit metadata as nested dicts keyed by prefixed tag names,
            with attributes under "@"-prefixed keys.

    Returns:
        The origin url (``str``) when a non-empty ``@url`` is present; otherwise
        the value of ``@swhid`` passed through ``clean_swhid`` then
        ``parse_swhid``; None when neither is present or when an intermediate
        node is missing, empty, or not a dict.

    Raises:
        Nothing is caught here: any error raised by ``parse_swhid`` on an invalid
        identifier propagates to the caller.

    """
    reference = _child_node(_child_node(metadata, "swh:deposit"), "swh:reference")

    # An origin url takes precedence over an object swhid.
    url = _child_node(_child_node(reference, "swh:origin"), "@url")
    if url:
        return url

    swhid = _child_node(_child_node(reference, "swh:object"), "@swhid")
    if not swhid:
        return None

    # Strip the blank characters around each ";"-separated part before parsing.
    return parse_swhid(clean_swhid(swhid))
"""""", + ], +) +def test_parse_swh_reference_empty(xml_with_empty_reference, xml_ref): + xml_body = xml_with_empty_reference.format(swh_reference=xml_ref) + metadata = parse_xml(xml_body) + + assert parse_swh_reference(metadata) is None + + +@pytest.fixture +def xml_with_swhid(): + xml_data = """ + + + + + + + + """ + return xml_data.strip() + + +@pytest.mark.parametrize( + "swhid", + [ + """swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49; + origin=https://hal.archives-ouvertes.fr/hal-01243573; + visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321; + anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba; + path=/moranegg-AffectationRO-df7f68b/""", + "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", + ], +) +def test_parse_swh_reference_swhid(swhid, xml_with_swhid): + xml_data = xml_with_swhid.format(swhid=swhid) + metadata = parse_xml(xml_data) + + actual_swhid = parse_swh_reference(metadata) + assert actual_swhid is not None + + expected_swhid = parse_swhid(clean_swhid(swhid)) + assert actual_swhid == expected_swhid diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -139,3 +139,17 @@ expected_date = "2017-01-01 00:00:00+00:00" assert str(actual_date) == expected_date + + +def test_clean_swhid(): + noisy_swhid = """; + origin=; + visit=; + anchor=; path=/ + + """ + + actual_swhid = utils.clean_swhid(noisy_swhid) + assert actual_swhid == ( + ";origin=;visit=;anchor=;path=/" + ) diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py --- a/swh/deposit/utils.py +++ b/swh/deposit/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 
def clean_swhid(swhid: str) -> str:
    """Strip the blank characters surrounding each ";"-separated part of a swhid.

    The string is split on ";", every part is stripped of leading and trailing
    whitespace (spaces, tabs, newlines), and the parts are joined back with ";".
    Whitespace inside a part is left untouched and no validation is performed.

    Args:
        swhid: a potential swhid string, possibly spread over several lines.

    Returns:
        The same string without blanks around its ";"-separated parts.

    >>> clean_swhid("swh:1:dir:abc;\\n   origin=https://example.org ; path=/ ")
    'swh:1:dir:abc;origin=https://example.org;path=/'

    """
    return ";".join(part.strip() for part in swhid.split(";"))