diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst --- a/docs/metadata-workflow.rst +++ b/docs/metadata-workflow.rst @@ -166,7 +166,7 @@ (other than the `codemeta` mapping, which is the identity function, and therefore supports all properties): -.. program-output:: python3 -m swh.indexer.cli mapping list-terms --exclude-mapping codemeta +.. program-output:: python3 -m swh.indexer.cli mapping list-terms --exclude-mapping codemeta --exclude-mapping sword-codemeta :nostderr: diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -109,7 +109,10 @@ Reads the local codemeta.jsonld file instead of fetching it from the Internet every single time.""" - if url == CODEMETA_CONTEXT_URL or url in CODEMETA_ALTERNATE_CONTEXT_URLS: + if ( + url.lower() == CODEMETA_CONTEXT_URL.lower() + or url in CODEMETA_ALTERNATE_CONTEXT_URLS + ): return { "contextUrl": None, "documentUrl": url, diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py --- a/swh/indexer/metadata_dictionary/__init__.py +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -25,6 +25,7 @@ EXTRINSIC_MAPPINGS: Dict[str, Type[BaseExtrinsicMapping]] = { "GitHubMapping": github.GitHubMapping, + "SwordCodemetaMapping": codemeta.SwordCodemetaMapping, } diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -1,14 +1,22 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import collections import json -from typing import Any, Dict, List, Optional +import re +from typing import Any, Dict, 
ATOM_URI = "http://www.w3.org/2005/Atom"

# ElementTree spells a namespaced tag as "{namespace-uri}localname".  The two
# named groups are read back by SwordCodemetaMapping.xml_to_jsonld().
_TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)")
_IGNORED_NAMESPACES = (ATOM_URI,)


class SwordCodemetaMapping(BaseExtrinsicMapping):
    """Mapping and translation from JSON-LD statements embedded in SWORD
    documents, optionally using Codemeta contexts, as described in the
    :ref:`deposit-protocol`.
    """

    name = "sword-codemeta"

    @classmethod
    def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
        """Return the extrinsic metadata format names handled by this mapping."""
        return (
            "sword-v2-atom-codemeta",
            "sword-v2-atom-codemeta-v2",
        )

    @classmethod
    def supported_terms(cls) -> List[str]:
        """Return every Codemeta term except JSON-LD keywords (``@...``)."""
        return [term for term in CODEMETA_TERMS if not term.startswith("@")]

    def xml_to_jsonld(self, e: ET.Element) -> Dict[str, Any]:
        """Recursively convert the children of ``e`` to a JSON-LD-like dict.

        Each child element becomes one entry in the list stored under a key
        derived from its tag:

        * Atom ``title``/``name`` are stored under ``"name"``;
        * other elements in an ignored namespace are dropped;
        * elements in the Codemeta context namespace use their local name;
        * anything else uses ``namespace + localname`` as key.

        If ``e`` has non-blank text, it is stored under ``"@value"``.

        Raises:
            ValueError: if a child element's tag has no ``{namespace}`` part.
        """
        doc: Dict[str, List[Dict[str, Any]]] = collections.defaultdict(list)
        # Compare namespaces case-insensitively on both sides, like the
        # context loader does for the context URL.
        codemeta_namespace = CODEMETA_CONTEXT_URL.lower()

        for child in e:
            m = _TAG_RE.match(child.tag)
            if m is None:
                # Explicit raise rather than ``assert``: this validates
                # external input and must not vanish under ``python -O``.
                raise ValueError(f"Tag with no namespace: {child}")
            namespace = m.group("namespace")
            localname = m.group("localname")

            if namespace == ATOM_URI and localname in ("title", "name"):
                # Convert Atom name/title to Codemeta name; in case
                # codemeta:name is not provided or different
                key = "name"
            elif namespace in _IGNORED_NAMESPACES:
                # SWORD-specific namespace that is not interesting to translate
                continue
            elif namespace.lower() == codemeta_namespace:
                # It is a term defined by the context; write it as-is and
                # JSON-LD expansion will convert it to a full URI based on
                # "@context": CODEMETA_CONTEXT_URL
                key = localname
            else:
                # Otherwise, we already know the URI
                key = f"{namespace}{localname}"

            doc[key].append(self.xml_to_jsonld(child))

        # The above needed doc values to be lists to work; now we allow any
        # type of value, as key "@value" cannot have a list as value.
        doc_: Dict[str, Any] = doc

        text = e.text.strip() if e.text else None
        if text:
            # TODO: check doc is empty, and raise mixed-content error otherwise?
            doc_["@value"] = text

        return doc_

    def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
        """Translate a SWORD XML document into a compacted Codemeta document.

        ``content`` is parsed as XML, converted with :meth:`xml_to_jsonld`,
        given the Codemeta ``@context``, expanded, then normalized with
        :meth:`normalize_translation`.  XML parsing errors are not caught
        here and propagate to the caller.
        """
        # NOTE(review): xml.etree.ElementTree is documented as not secure
        # against maliciously constructed data; confirm whether ``content``
        # can come from untrusted depositors.
        root = ET.fromstring(content)

        # Transform to JSON-LD document
        doc = self.xml_to_jsonld(root)

        # Add @context so that JSON-LD expansion turns terms written with the
        # "codemeta:" prefix (which uses the context URL as namespace URI for
        # historical reasons) into properties in the `http://schema.org/` and
        # `https://codemeta.github.io/terms/` namespaces
        doc["@context"] = CODEMETA_CONTEXT_URL

        # Normalize as a Codemeta document
        return self.normalize_translation(expand(doc))

    def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Compact ``metadata`` with ``compact(..., forgefed=False)``."""
        return compact(metadata, forgefed=False)
content = """ + + My Software + + Author 1 + foo@example.org + + + Author 2 + + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + "author": [ + {"name": "Author 1", "email": "foo@example.org"}, + {"name": "Author 2"}, + ], + } + + +def test_sword_mixed(): + content = """ + + My Software + blah + 1.2.3 + blih + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + "version": "1.2.3", + } + + +def test_sword_schemaorg_in_codemeta(): + content = """ + + My Software + 1.2.3 + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + "version": "1.2.3", + } + + +def test_sword_schemaorg_in_codemeta_constrained(): + """Resulting property has the compact URI 'schema:url' instead of just + the term 'url', because term 'url' is defined by the Codemeta schema + as having type '@id'.""" + content = """ + + My Software + http://example.org/my-software + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + "schema:url": "http://example.org/my-software", + } + + +def test_sword_schemaorg_not_in_codemeta(): + content = """ + + My Software + http://example.org/my-software + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + "schema:sameAs": "http://example.org/my-software", + } + + +def test_sword_atom_name(): + content = """ + + My Software + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + 
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + } + + +def test_sword_multiple_names(): + content = """ + + Atom Name 1 + Atom Name 2 + Atom Title 1 + Atom Title 2 + Codemeta Name 1 + Codemeta Name 2 + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": [ + "Atom Name 1", + "Atom Name 2", + "Atom Title 1", + "Atom Title 2", + "Codemeta Name 1", + "Codemeta Name 2", + ], + } diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -116,6 +116,7 @@ "nuget", "pkg-info", "pubspec", + "sword-codemeta", "", ] # must be sorted for test to pass ) @@ -141,7 +142,16 @@ def test_cli_mapping_list_terms_exclude(cli_runner, swh_config): result = cli_runner.invoke( indexer_cli_group, - ["-C", swh_config, "mapping", "list-terms", "--exclude-mapping", "codemeta"], + [ + "-C", + swh_config, + "mapping", + "list-terms", + "--exclude-mapping", + "codemeta", + "--exclude-mapping", + "sword-codemeta", + ], catch_exceptions=False, ) assert result.exit_code == 0, result.output