diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -64,7 +64,12 @@ return [term for term in CODEMETA_TERMS if not term.startswith("@")] def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]: + # Keys are JSON-LD property names (URIs or terms). + # Values are either a single string (if key is "type") or list of + # other dicts with the same type recursively. + # To simplify annotations, we omit the single string case here. doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list) + for child in e: m = _TAG_RE.match(child.tag) assert m, f"Tag with no namespace: {child}" @@ -100,7 +105,14 @@ # to be allowed by the deposit; so we need to reformat them # to be valid ISO8601. jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat() - doc[localname].append(jsonld_child) + if localname == "id": + # JSON-LD only allows a single id, and they have to be strings. 
+ if localname in doc or not isinstance(jsonld_child, str): + continue + else: + doc[localname] = jsonld_child # type: ignore[assignment] + else: + doc[localname].append(jsonld_child) else: # Otherwise, we already know the URI doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py --- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -6,6 +6,7 @@ import json from hypothesis import HealthCheck, given, settings +import pytest from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_detector import detect_metadata @@ -254,6 +255,117 @@ } +@pytest.mark.parametrize("id_", ["", " ", "\n"]) +def test_sword_invalid_id(id_): + content = f""" + + My Software + {id_} + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + } + + +@pytest.mark.parametrize( + "id_", + [ + "foo", + "42", + "http://example.org/", + "http://example.org/foo", + "https://example.org/", + "https://example.org/foo", + ], +) +def test_sword_id(id_): + content = f""" + + My Software + {id_} + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": id_, + "name": "My Software", + } + + +def test_sword_multiple_ids(): + """JSON-LD only allows a single id, so we ignore all but the first one.""" + content = """ + + My Software + http://example.org/foo + http://example.org/bar + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/foo", + "name": "My Software", + } + + +def test_sword_type(): + content = """ + + My Software + 
http://schema.org/WebSite + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "schema:WebSite", + "name": "My Software", + } + + +def test_sword_multiple_type(): + content = """ + + My Software + http://schema.org/WebSite + http://schema.org/SoftwareSourceCode + + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result in ( + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["schema:WebSite", "SoftwareSourceCode"], + "name": "My Software", + }, + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["SoftwareSourceCode", "schema:WebSite"], + "name": "My Software", + }, + ) + + def test_sword_schemaorg_in_codemeta(): content = """