diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -5,6 +5,7 @@ import collections import json +import logging import re from typing import Any, Dict, List, Optional, Tuple, Union import xml.etree.ElementTree as ET @@ -22,6 +23,8 @@ _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) _DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$") +logger = logging.getLogger(__name__) + class CodemetaMapping(SingleFileIntrinsicMapping): """ @@ -64,7 +67,12 @@ return [term for term in CODEMETA_TERMS if not term.startswith("@")] def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]: + # Keys are JSON-LD property names (URIs or terms). + # Values are either a single string (if key is "type") or list of + # other dicts with the same type recursively. + # To simplify annotations, we omit the single string case here. doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list) + for child in e: m = _TAG_RE.match(child.tag) assert m, f"Tag with no namespace: {child}" @@ -100,7 +108,27 @@ # to be allowed by the deposit; so we need to reformat them # to be valid ISO8601. jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat() - doc[localname].append(jsonld_child) + if localname == "id": + # JSON-LD only allows a single id, and they have to be strings. 
+ if localname in doc: + logger.error( + "Duplicate s in SWORD document: %r and %r", + doc[localname], + jsonld_child, + ) + continue + elif not jsonld_child: + logger.error("Empty value in SWORD document") + continue + elif not isinstance(jsonld_child, str): + logger.error( + "Unexpected value in SWORD document: %r", jsonld_child + ) + continue + else: + doc[localname] = jsonld_child # type: ignore[assignment] + else: + doc[localname].append(jsonld_child) else: # Otherwise, we already know the URI doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py --- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -6,6 +6,7 @@ import json from hypothesis import HealthCheck, given, settings +import pytest from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_detector import detect_metadata @@ -254,6 +255,117 @@ } +@pytest.mark.parametrize("id_", ["", " ", "\n"]) +def test_sword_invalid_id(id_): + content = f""" + + My Software + {id_} + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + } + + +@pytest.mark.parametrize( + "id_", + [ + "foo", + "42", + "http://example.org/", + "http://example.org/foo", + "https://example.org/", + "https://example.org/foo", + ], +) +def test_sword_id(id_): + content = f""" + + My Software + {id_} + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": id_, + "name": "My Software", + } + + +def test_sword_multiple_ids(): + """JSON-LD only allows a single id, so we ignore all but the first one.""" + content = """ + + My Software + http://example.org/foo + http://example.org/bar + + """ + + 
result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/foo", + "name": "My Software", + } + + +def test_sword_type(): + content = """ + + My Software + http://schema.org/WebSite + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "schema:WebSite", + "name": "My Software", + } + + +def test_sword_multiple_type(): + content = """ + + My Software + http://schema.org/WebSite + http://schema.org/SoftwareSourceCode + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result in ( + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["schema:WebSite", "SoftwareSourceCode"], + "name": "My Software", + }, + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["SoftwareSourceCode", "schema:WebSite"], + "name": "My Software", + }, + ) + + def test_sword_schemaorg_in_codemeta(): content = """