diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -6,7 +6,7 @@ import collections import json import re -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import xml.etree.ElementTree as ET import xmltodict @@ -61,8 +61,8 @@ def supported_terms(cls) -> List[str]: return [term for term in CODEMETA_TERMS if not term.startswith("@")] - def xml_to_jsonld(self, e: ET.Element) -> Dict[str, Any]: - doc: Dict[str, List[Dict[str, Any]]] = collections.defaultdict(list) + def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]: + doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list) for child in e: m = _TAG_RE.match(child.tag) assert m, f"Tag with no namespace: {child}" @@ -84,12 +84,6 @@ # expansion will convert it to a full URI based on # "@context": CODEMETA_CONTEXT_URL jsonld_child = self.xml_to_jsonld(child) - if localname == "type" and isinstance(jsonld_child, dict): - # With a codemeta context, this is later translated to a JSON-LD - # @type, which must be either an array of strings or a string. - if set(jsonld_child) != {"@value"}: - raise ValueError(f'Unexpected value for "type": {jsonld_child}') - jsonld_child = jsonld_child["@value"] doc[localname].append(jsonld_child) else: # Otherwise, we already know the URI @@ -102,7 +96,7 @@ text = e.text.strip() if e.text else None if text: # TODO: check doc is empty, and raise mixed-content error otherwise? 
- doc_["@value"] = text + return text return doc_ @@ -113,6 +107,8 @@ # Transform to JSON-LD document doc = self.xml_to_jsonld(root) + assert isinstance(doc, dict), f"Root object is not a dict: {doc}" + # Add @context to JSON-LD expansion replaces the "codemeta:" prefix # hash (which uses the context URL as namespace URI for historical # reasons) into properties in `http://schema.org/` and diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py --- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -213,6 +213,7 @@ Author 2 + 2022-10-26 Author 3 bar@example.org @@ -229,6 +230,7 @@ {"name": "Author 2"}, {"name": "Author 3", "email": "bar@example.org"}, ], + "dateCreated": "2022-10-26", } @@ -273,13 +275,16 @@ def test_sword_schemaorg_in_codemeta_constrained(): """Resulting property has the compact URI 'schema:url' instead of just the term 'url', because term 'url' is defined by the Codemeta schema - has having type '@id'.""" + as having type '@id'. + Ditto for dates (with type http://schema.org/Date).""" content = """ My Software http://example.org/my-software + foo + 2022-10-26 """ @@ -288,6 +293,8 @@ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", "schema:url": "http://example.org/my-software", + "schema:dateCreated": "foo", + "schema:dateModified": "2022-10-26", } @@ -388,6 +395,6 @@ ], "license": {"name": "GNU General Public License v3.0 or later"}, "name": "The assignment problem", - "schema:url": "http://example.org/", + "url": "http://example.org/", "name": "The assignment problem", }