diff --git a/docs/specs/spec-meta-deposit.rst b/docs/specs/spec-meta-deposit.rst index 80e4fe66..b9600abd 100644 --- a/docs/specs/spec-meta-deposit.rst +++ b/docs/specs/spec-meta-deposit.rst @@ -1,141 +1,136 @@ The metadata-deposit ==================== Goal ---- A client wishes to deposit only metadata about an origin or object already present in the Software Heritage archive. The metadata-deposit is a special deposit where no content is provided and the data transferred to Software Heritage is only the metadata about an object in the archive. Requirements ------------ 1. Create a metadata-only deposit through a :ref:`POST request` 2. It is composed of ONLY one xml metadata file 3. It MUST comply with :ref:`the metadata requirements` 4. It MUST reference an **object** or an **origin** in a deposit tag 5. The reference SHOULD exist in the SWH archive 6. The **object** reference MUST be a SWHID on one of the following artifact types: - origin - snapshot - release - revision - directory - content 7. The SWHID MAY be a `core identifier`_ with or without `qualifiers`_ 8. The SWHID MUST NOT reference a fragment of code with the qualifier `lines` .. _core identifier: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html#core-identifiers .. _qualifiers: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html#qualifiers A complete metadata example --------------------------- The reference element is included in the metadata xml atomEntry under the swh namespace: .. code:: xml HAL hal@ccsd.cnrs.fr hal hal-01243573 The assignment problem https://hal.archives-ouvertes.fr/hal-01243573 other identifier, DOI, ARK Domain description author1 Inria UPMC author2 Inria UPMC References ^^^^^^^^^^ The metadata reference can be either on: - an origin - a graph object (core SWHID with or without qualifiers) Origins ======= The metadata may be on an origin, identified by the origin's URL: ..
code:: xml Graph objects ============= It may also reference an object in the `SWH graph `: contents, directories, revisions, releases, and snapshots: .. code:: xml .. code:: xml - + The value of the ``swhid`` attribute must be a `SWHID `, with any context qualifiers in this list: * ``origin`` * ``visit`` * ``anchor`` * ``path`` and they should be provided whenever relevant, especially ``origin``. Other qualifiers are not allowed (for example, ``lines`` isn't because SWH cannot store metadata at a finer level than entire contents). Loading procedure ------------------ In this case, the metadata-deposit will be injected as a metadata entry of the relevant object, with the information about the contributor of the deposit. Contrary to the complete and sparse deposit, there will be no object creation. diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py index e3213b4b..e8d8a0a0 100644 --- a/swh/deposit/parsers.py +++ b/swh/deposit/parsers.py @@ -1,103 +1,155 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining parsers with SWORD 2.0 supported mediatypes. """ +from typing import Dict, Optional, Union from xml.parsers.expat import ExpatError from django.conf import settings from rest_framework.parsers import BaseParser, FileUploadParser, MultiPartParser import xmltodict from swh.deposit.errors import ParserError +from swh.model.identifiers import SWHID, parse_swhid class SWHFileUploadZipParser(FileUploadParser): """File upload parser limited to zip archive. """ media_type = "application/zip" class SWHFileUploadTarParser(FileUploadParser): """File upload parser limited to tarball (tar, tar.gz, tar.*) archives. """ media_type = "application/x-tar" class SWHXMLParser(BaseParser): """ XML parser.
def parse_swh_reference(metadata: Dict) -> Optional[Union[str, "SWHID"]]:
    """Extract the archive reference declared in parsed deposit metadata.

    The reference is looked up under ``swh:deposit`` > ``swh:reference``. It is
    either an origin, identified by the ``url`` attribute of ``swh:origin``:

        <swh:deposit>
          <swh:reference>
            <swh:origin url="https://..." />
          </swh:reference>
        </swh:deposit>

    or a graph object, identified by the ``swhid`` attribute of ``swh:object``:

        <swh:deposit>
          <swh:reference>
            <swh:object swhid="swh:1:dir:..." />
          </swh:reference>
        </swh:deposit>

    When both are declared, the origin url takes precedence.

    Any element on that path that is missing, empty, or not a mapping (an
    element holding only text, or one repeated several times) is treated as
    "no reference" rather than failing with an AttributeError.

    Args:
        metadata: deposit metadata as a dict, keys prefixed with their
          namespace alias (``swh:``) and xml attributes prefixed with ``@``

    Raises:
        ValidationError in case the swhid referenced (if any) is invalid

    Returns:
        The origin url (str) or the parsed SWHID when a reference is declared,
        None otherwise.

    """

    def _child(node, key):
        # Only mappings can carry the child elements/attributes looked up here;
        # text-only elements come through as str and repeated ones as list.
        return node.get(key) if isinstance(node, dict) else None

    swh_reference = _child(_child(metadata, "swh:deposit"), "swh:reference")
    if not swh_reference:
        return None

    # The origin is checked first: a usable url short-circuits the swhid lookup.
    url = _child(_child(swh_reference, "swh:origin"), "@url")
    if url:
        return url

    swhid = _child(_child(swh_reference, "swh:object"), "@swhid")
    if not swhid:
        return None
    return parse_swhid(swhid)
def test_parse_swh_reference_origin(xml_with_origin_reference):
    """An origin reference is returned as its url

    """
    expected_url = "https://url"
    metadata = parse_xml(xml_with_origin_reference.format(url=expected_url))

    assert parse_swh_reference(metadata) == expected_url
def test_parse_swh_reference_invalid_swhid(xml_with_swhid):
    """A reference whose swhid cannot be parsed should raise

    """
    # Object id shorter than in the valid swhids used by the sibling test, so
    # parsing is expected to fail on its length.
    too_short_swhid = "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc235"
    metadata = parse_xml(xml_with_swhid.format(swhid=too_short_swhid))

    with pytest.raises(ValidationError, match="Unexpected length"):
        parse_swh_reference(metadata)