Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9696462
D8799.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
D8799.diff
View Options
diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py
--- a/swh/indexer/metadata_dictionary/codemeta.py
+++ b/swh/indexer/metadata_dictionary/codemeta.py
@@ -5,6 +5,7 @@
import collections
import json
+import logging
import re
from typing import Any, Dict, List, Optional, Tuple, Union
import xml.etree.ElementTree as ET
@@ -22,6 +23,8 @@
_IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",)
_DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$")
+logger = logging.getLogger(__name__)
+
class CodemetaMapping(SingleFileIntrinsicMapping):
"""
@@ -64,7 +67,12 @@
return [term for term in CODEMETA_TERMS if not term.startswith("@")]
def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]:
+ # Keys are JSON-LD property names (URIs or terms).
+ # Values are either a single string (if key is "type") or list of
+ # other dicts with the same type recursively.
+ # To simplify annotations, we omit the single string case here.
doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list)
+
for child in e:
m = _TAG_RE.match(child.tag)
assert m, f"Tag with no namespace: {child}"
@@ -100,7 +108,27 @@
# to be allowed by the deposit; so we need to reformat them
# to be valid ISO8601.
jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat()
- doc[localname].append(jsonld_child)
+ if localname == "id":
+ # JSON-LD only allows a single id, and it has to be a string.
+ if localname in doc:
+ logger.error(
+ "Duplicate <id>s in SWORD document: %r and %r",
+ doc[localname],
+ jsonld_child,
+ )
+ continue
+ elif not jsonld_child:
+ logger.error("Empty <id> value in SWORD document")
+ continue
+ elif not isinstance(jsonld_child, str):
+ logger.error(
+ "Unexpected <id> value in SWORD document: %r", jsonld_child
+ )
+ continue
+ else:
+ doc[localname] = jsonld_child # type: ignore[assignment]
+ else:
+ doc[localname].append(jsonld_child)
else:
# Otherwise, we already know the URI
doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child))
diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
--- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py
+++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
@@ -6,6 +6,7 @@
import json
from hypothesis import HealthCheck, given, settings
+import pytest
from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata_detector import detect_metadata
@@ -254,6 +255,117 @@
}
+@pytest.mark.parametrize("id_", ["", " ", "\n"])
+def test_sword_invalid_id(id_):
+ content = f"""<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <id>{id_}</id>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "My Software",
+ }
+
+
+@pytest.mark.parametrize(
+ "id_",
+ [
+ "foo",
+ "42",
+ "http://example.org/",
+ "http://example.org/foo",
+ "https://example.org/",
+ "https://example.org/foo",
+ ],
+)
+def test_sword_id(id_):
+ content = f"""<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <id>{id_}</id>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "id": id_,
+ "name": "My Software",
+ }
+
+
+def test_sword_multiple_ids():
+ """JSON-LD only allows a single id, so we ignore all but the first one."""
+ content = """<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <id>http://example.org/foo</id>
+ <id>http://example.org/bar</id>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "id": "http://example.org/foo",
+ "name": "My Software",
+ }
+
+
+def test_sword_type():
+ content = """<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <type>http://schema.org/WebSite</type>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "schema:WebSite",
+ "name": "My Software",
+ }
+
+
+def test_sword_multiple_type():
+ content = """<?xml version="1.0"?>
+ <atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns="https://doi.org/10.5063/schema/codemeta-2.0"
+ xmlns:schema="http://schema.org/">
+ <name>My Software</name>
+ <type>http://schema.org/WebSite</type>
+ <type>http://schema.org/SoftwareSourceCode</type>
+ </atom:entry>
+ """
+
+ result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+ assert result in (
+ {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": ["schema:WebSite", "SoftwareSourceCode"],
+ "name": "My Software",
+ },
+ {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": ["SoftwareSourceCode", "schema:WebSite"],
+ "name": "My Software",
+ },
+ )
+
+
def test_sword_schemaorg_in_codemeta():
content = """<?xml version="1.0"?>
<atom:entry xmlns:atom="http://www.w3.org/2005/Atom"
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Aug 17, 8:06 PM (1 d, 2 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214123
Attached To
D8799: codemeta: Fix crash on SWORD documents that specify an id
Event Timeline
Log In to Comment