diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py
--- a/swh/indexer/metadata_dictionary/codemeta.py
+++ b/swh/indexer/metadata_dictionary/codemeta.py
@@ -64,7 +64,12 @@
return [term for term in CODEMETA_TERMS if not term.startswith("@")]
def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]:
+ # Keys are JSON-LD property names (URIs or terms).
+ # Values are either a single string (if key is "type" or "id") or a list of
+ # other dicts with the same type recursively.
+ # To simplify annotations, we omit the single string case here.
doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list)
+
for child in e:
m = _TAG_RE.match(child.tag)
assert m, f"Tag with no namespace: {child}"
@@ -100,7 +105,14 @@
# to be allowed by the deposit; so we need to reformat them
# to be valid ISO8601.
jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat()
- doc[localname].append(jsonld_child)
+ if localname == "id":
+ # JSON-LD only allows a single id, and they have to be strings.
+ if localname in doc or not isinstance(jsonld_child, str):
+ continue
+ else:
+ doc[localname] = jsonld_child # type: ignore[assignment]
+ else:
+ doc[localname].append(jsonld_child)
else:
# Otherwise, we already know the URI
doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child))
diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
--- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py
+++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
@@ -6,6 +6,7 @@
import json
from hypothesis import HealthCheck, given, settings
+import pytest
from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata_detector import detect_metadata
@@ -254,6 +255,117 @@
}
+@pytest.mark.parametrize("id_", ["", " ", "\n"])
+def test_sword_invalid_id(id_):
+ content = f"""
+
+ My Software
+ {id_}
+
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "My Software",
+ }
+
+
+@pytest.mark.parametrize(
+ "id_",
+ [
+ "foo",
+ "42",
+ "http://example.org/",
+ "http://example.org/foo",
+ "https://example.org/",
+ "https://example.org/foo",
+ ],
+)
+def test_sword_id(id_):
+ content = f"""
+
+ My Software
+ {id_}
+
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "id": id_,
+ "name": "My Software",
+ }
+
+
+def test_sword_multiple_ids():
+ """JSON-LD only allows a single id, so we ignore all but the first one."""
+ content = """
+
+ My Software
+ http://example.org/foo
+ http://example.org/bar
+
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "id": "http://example.org/foo",
+ "name": "My Software",
+ }
+
+
+def test_sword_type():
+ content = """
+
+ My Software
+ http://schema.org/WebSite
+
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "schema:WebSite",
+ "name": "My Software",
+ }
+
+
+def test_sword_multiple_type():
+ content = """
+
+ My Software
+ http://schema.org/WebSite
+ http://schema.org/SoftwareSourceCode
+
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result in (
+ {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": ["schema:WebSite", "SoftwareSourceCode"],
+ "name": "My Software",
+ },
+ {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": ["SoftwareSourceCode", "schema:WebSite"],
+ "name": "My Software",
+ },
+ )
+
+
def test_sword_schemaorg_in_codemeta():
content = """