diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py index 7472123..1fc613f 100644 --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -1,168 +1,196 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import collections import json +import logging import re from typing import Any, Dict, List, Optional, Tuple, Union import xml.etree.ElementTree as ET import iso8601 import xmltodict from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping ATOM_URI = "http://www.w3.org/2005/Atom" _TAG_RE = re.compile(r"\{(?P.*?)\}(?P.*)") _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) _DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$") +logger = logging.getLogger(__name__) + class CodemetaMapping(SingleFileIntrinsicMapping): """ dedicated class for CodeMeta (codemeta.json) mapping and translation """ name = "codemeta" filename = b"codemeta.json" string_fields = None @classmethod def supported_terms(cls) -> List[str]: return [term for term in CODEMETA_TERMS if not term.startswith("@")] def translate(self, content: bytes) -> Optional[Dict[str, Any]]: try: return self.normalize_translation(expand(json.loads(content.decode()))) except Exception: return None class SwordCodemetaMapping(BaseExtrinsicMapping): """ dedicated class for mapping and translation from JSON-LD statements embedded in SWORD documents, optionally using Codemeta contexts, as described in the :ref:`deposit-protocol`. """ name = "sword-codemeta" @classmethod def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: return ( "sword-v2-atom-codemeta", "sword-v2-atom-codemeta-v2", ) @classmethod def supported_terms(cls) -> List[str]: return [term for term in CODEMETA_TERMS if not term.startswith("@")] def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]: + # Keys are JSON-LD property names (URIs or terms). + # Values are either a single string (if key is "type") or list of + # other dicts with the same type recursively. + # To simply annotations, we omit the single string case here. doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list) + for child in e: m = _TAG_RE.match(child.tag) assert m, f"Tag with no namespace: {child}" namespace = m.group("namespace") localname = m.group("localname") if namespace == ATOM_URI and localname in ("title", "name"): # Convert Atom to Codemeta name; in case codemeta:name # is not provided or different doc["name"].append(self.xml_to_jsonld(child)) elif namespace == ATOM_URI and localname in ("author", "email"): # ditto for these author properties (note that author email is also # covered by the previous test) doc[localname].append(self.xml_to_jsonld(child)) elif namespace in _IGNORED_NAMESPACES: # SWORD-specific namespace that is not interesting to translate pass elif namespace.lower() == CODEMETA_CONTEXT_URL: # It is a term defined by the context; write is as-is and JSON-LD # expansion will convert it to a full URI based on # "@context": CODEMETA_CONTEXT_URL jsonld_child = self.xml_to_jsonld(child) if ( localname in ( "dateCreated", "dateModified", "datePublished", ) and isinstance(jsonld_child, str) and _DATE_RE.match(jsonld_child) ): # Dates missing a leading zero for their day/month, used # to be allowed by the deposit; so we need to reformat them # to be valid ISO8601. jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat() - doc[localname].append(jsonld_child) + if localname == "id": + # JSON-LD only allows a single id, and they have to be strings. + if localname in doc: + logger.error( + "Duplicate s in SWORD document: %r and %r", + doc[localname], + jsonld_child, + ) + continue + elif not jsonld_child: + logger.error("Empty value in SWORD document") + continue + elif not isinstance(jsonld_child, str): + logger.error( + "Unexpected value in SWORD document: %r", jsonld_child + ) + continue + else: + doc[localname] = jsonld_child # type: ignore[assignment] + else: + doc[localname].append(jsonld_child) else: # Otherwise, we already know the URI doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) # The above needed doc values to be list to work; now we allow any type # of value as key "@value" cannot have a list as value. doc_: Dict[str, Any] = doc text = e.text.strip() if e.text else None if text: # TODO: check doc is empty, and raise mixed-content error otherwise? return text return doc_ def translate(self, content: bytes) -> Optional[Dict[str, Any]]: # Parse XML root = ET.fromstring(content) # Transform to JSON-LD document doc = self.xml_to_jsonld(root) assert isinstance(doc, dict), f"Root object is not a dict: {doc}" # Add @context to JSON-LD expansion replaces the "codemeta:" prefix # hash (which uses the context URL as namespace URI for historical # reasons) into properties in `http://schema.org/` and # `https://codemeta.github.io/terms/` namespaces doc["@context"] = CODEMETA_CONTEXT_URL # Normalize as a Codemeta document return self.normalize_translation(expand(doc)) def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: return compact(metadata, forgefed=False) class JsonSwordCodemetaMapping(SwordCodemetaMapping): """ Variant of :class:`SwordCodemetaMapping` that reads the legacy ``sword-v2-atom-codemeta-v2-in-json`` format and converts it back to ``sword-v2-atom-codemeta-v2`` XML """ name = "json-sword-codemeta" @classmethod def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: return ("sword-v2-atom-codemeta-v2-in-json",) def translate(self, content: bytes) -> Optional[Dict[str, Any]]: # ``content`` was generated by calling ``xmltodict.parse()`` on a XML document, # so ``xmltodict.unparse()`` is guaranteed to return a document that is # semantically equivalent to the original and pass it to SwordCodemetaMapping. json_doc = json.loads(content) if json_doc.get("@xmlns") != ATOM_URI: # Technically, non-default XMLNS were allowed, but it does not seem like # anyone used them, so they do not need to be implemented here. raise NotImplementedError(f"Unexpected XMLNS set: {json_doc}") # Root tag was stripped by swh-deposit json_doc = {"entry": json_doc} return super().translate(xmltodict.unparse(json_doc)) diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py index bc08b25..6c9d6de 100644 --- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -1,422 +1,534 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from hypothesis import HealthCheck, given, settings +import pytest from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from ..utils import json_document_strategy def test_compute_metadata_valid_codemeta(): raw_content = b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X", }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738", }, ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X", }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation", }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": ["metadata", "software"], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD", } result = MAPPINGS["CodemetaMapping"]().translate(raw_content) assert result == expected_result def test_compute_metadata_codemeta_alternate_context(): raw_content = b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = MAPPINGS["CodemetaMapping"]().translate(raw_content) assert result == expected_result @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=CODEMETA_TERMS)) def test_codemeta_adversarial(doc): raw = json.dumps(doc).encode() MAPPINGS["CodemetaMapping"]().translate(raw) def test_detect_metadata_codemeta_json_uppercase(): df = [ { "sha1_git": b"abc", "name": b"index.html", "target": b"abc", "length": 897, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, { "sha1_git": b"aab", "name": b"CODEMETA.json", "target": b"aab", "length": 712, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, ] results = detect_metadata(df) expected_results = {"CodemetaMapping": [b"bcd"]} assert expected_results == results def test_sword_default_xmlns(): content = """ My Software Author 1 foo@example.org Author 2 """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", "author": [ {"name": "Author 1", "email": "foo@example.org"}, {"name": "Author 2"}, ], } def test_sword_basics(): content = """ My Software Author 1 foo@example.org Author 2 2022-10-26 Author 3 bar@example.org """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", "author": [ {"name": "Author 1", "email": "foo@example.org"}, {"name": "Author 2"}, {"name": "Author 3", "email": "bar@example.org"}, ], "dateCreated": "2022-10-26", } def test_sword_mixed(): content = """ My Software blah 1.2.3 blih """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", "version": "1.2.3", } +@pytest.mark.parametrize("id_", ["", " ", "\n"]) +def test_sword_invalid_id(id_): + content = f""" + + My Software + {id_} + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + } + + +@pytest.mark.parametrize( + "id_", + [ + "foo", + "42", + "http://example.org/", + "http://example.org/foo", + "https://example.org/", + "https://example.org/foo", + ], +) +def test_sword_id(id_): + content = f""" + + My Software + {id_} + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": id_, + "name": "My Software", + } + + +def test_sword_multiple_ids(): + """JSON-LD only allows a single id, so we ignore all but the first one.""" + content = """ + + My Software + http://example.org/foo + http://example.org/bar + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/foo", + "name": "My Software", + } + + +def test_sword_type(): + content = """ + + My Software + http://schema.org/WebSite + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "schema:WebSite", + "name": "My Software", + } + + +def test_sword_multiple_type(): + content = """ + + My Software + http://schema.org/WebSite + http://schema.org/SoftwareSourceCode + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result in ( + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["schema:WebSite", "SoftwareSourceCode"], + "name": "My Software", + }, + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["SoftwareSourceCode", "schema:WebSite"], + "name": "My Software", + }, + ) + + def test_sword_schemaorg_in_codemeta(): content = """ My Software 1.2.3 """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", "version": "1.2.3", } def test_sword_schemaorg_in_codemeta_constrained(): """Resulting property has the compact URI 'schema:url' instead of just the term 'url', because term 'url' is defined by the Codemeta schema has having type '@id'. Ditto for dates (with type http://schema.org/Date).""" content = """ My Software http://example.org/my-software foo 2022-10-26 """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", "schema:url": "http://example.org/my-software", "schema:dateCreated": "foo", "schema:dateModified": "2022-10-26", } def test_sword_schemaorg_not_in_codemeta(): content = """ My Software http://example.org/my-software """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", "schema:sameAs": "http://example.org/my-software", } def test_sword_atom_name(): content = """ My Software """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", } def test_sword_multiple_names(): content = """ Atom Name 1 Atom Name 2 Atom Title 1 Atom Title 2 Codemeta Name 1 Codemeta Name 2 """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": [ "Atom Name 1", "Atom Name 2", "Atom Title 1", "Atom Title 2", "Codemeta Name 1", "Codemeta Name 2", ], } def test_sword_propertyvalue(): content = """ Name schema:PropertyValue HAL-ID hal-03780423 """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "Name", "identifier": { "schema:propertyID": "HAL-ID", "schema:value": "hal-03780423", "type": "schema:PropertyValue", }, } def test_sword_fix_date(): content = """ Name 2020-12-1 2020-12-2 2020-12-3 """ result = MAPPINGS["SwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "Name", "dateModified": "2020-12-01", "dateCreated": "2020-12-02", "datePublished": "2020-12-03", } def test_json_sword(): content = """{"id": "hal-01243573", "@xmlns": "http://www.w3.org/2005/Atom", "author": {"name": "Author 1", "email": "foo@example.org"}, "client": "hal", "codemeta:url": "http://example.org/", "codemeta:name": "The assignment problem", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:author": {"codemeta:name": "Author 2"}, "codemeta:license": {"codemeta:name": "GNU General Public License v3.0 or later"}}""" # noqa result = MAPPINGS["JsonSwordCodemetaMapping"]().translate(content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": [ {"name": "Author 1", "email": "foo@example.org"}, {"name": "Author 2"}, ], "license": {"name": "GNU General Public License v3.0 or later"}, "name": "The assignment problem", "url": "http://example.org/", "name": "The assignment problem", }