Page MenuHomeSoftware Heritage

D8799.diff
No OneTemporary

D8799.diff

diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py
--- a/swh/indexer/metadata_dictionary/codemeta.py
+++ b/swh/indexer/metadata_dictionary/codemeta.py
@@ -5,6 +5,7 @@
import collections
import json
+import logging
import re
from typing import Any, Dict, List, Optional, Tuple, Union
import xml.etree.ElementTree as ET
@@ -22,6 +23,8 @@
_IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",)
_DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$")
+logger = logging.getLogger(__name__)
+
class CodemetaMapping(SingleFileIntrinsicMapping):
"""
@@ -64,7 +67,12 @@
return [term for term in CODEMETA_TERMS if not term.startswith("@")]
def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]:
+ # Keys are JSON-LD property names (URIs or terms).
+ # Values are either a single string (if key is "type") or list of
+ # other dicts with the same type recursively.
+ # To simplify annotations, we omit the single string case here.
doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list)
+
for child in e:
m = _TAG_RE.match(child.tag)
assert m, f"Tag with no namespace: {child}"
@@ -100,7 +108,27 @@
# to be allowed by the deposit; so we need to reformat them
# to be valid ISO8601.
jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat()
- doc[localname].append(jsonld_child)
+ if localname == "id":
+ # JSON-LD only allows a single id, and it has to be a string.
+ if localname in doc:
+ logger.error(
+ "Duplicate <id>s in SWORD document: %r and %r",
+ doc[localname],
+ jsonld_child,
+ )
+ continue
+ elif not jsonld_child:
+ logger.error("Empty <id> value in SWORD document")
+ continue
+ elif not isinstance(jsonld_child, str):
+ logger.error(
+ "Unexpected <id> value in SWORD document: %r", jsonld_child
+ )
+ continue
+ else:
+ doc[localname] = jsonld_child # type: ignore[assignment]
+ else:
+ doc[localname].append(jsonld_child)
else:
# Otherwise, we already know the URI
doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child))
diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
--- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py
+++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
@@ -6,6 +6,7 @@
import json
from hypothesis import HealthCheck, given, settings
+import pytest
from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata_detector import detect_metadata
@@ -254,6 +255,117 @@
}
+@pytest.mark.parametrize("id_", ["", " ", "\n"])
+def test_sword_invalid_id(id_):
+ content = f"""<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <id>{id_}</id>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "My Software",
+ }
+
+
+@pytest.mark.parametrize(
+ "id_",
+ [
+ "foo",
+ "42",
+ "http://example.org/",
+ "http://example.org/foo",
+ "https://example.org/",
+ "https://example.org/foo",
+ ],
+)
+def test_sword_id(id_):
+ content = f"""<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <id>{id_}</id>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "id": id_,
+ "name": "My Software",
+ }
+
+
+def test_sword_multiple_ids():
+ """JSON-LD only allows a single id, so we ignore all but the first one."""
+ content = """<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <id>http://example.org/foo</id>
+ <id>http://example.org/bar</id>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "id": "http://example.org/foo",
+ "name": "My Software",
+ }
+
+
+def test_sword_type():
+ content = """<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <type>http://schema.org/WebSite</type>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "schema:WebSite",
+ "name": "My Software",
+ }
+
+
+def test_sword_multiple_type():
+ content = """<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <type>http://schema.org/WebSite</type>
+ <type>http://schema.org/SoftwareSourceCode</type>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result in (
+ {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": ["schema:WebSite", "SoftwareSourceCode"],
+ "name": "My Software",
+ },
+ {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": ["SoftwareSourceCode", "schema:WebSite"],
+ "name": "My Software",
+ },
+ )
+
+
def test_sword_schemaorg_in_codemeta():
content = """<?xml version="1.0"?>
<atom:entry xmlns:atom="http://www.w3.org/2005/Atom"

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 17, 8:06 PM (1 d, 2 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214123

Event Timeline