Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9123330
D8279.id29898.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
68 KB
Subscribers
None
D8279.id29898.diff
View Options
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -26,5 +26,8 @@
[mypy-pytest.*]
ignore_missing_imports = True
+[mypy-rdflib.*]
+ignore_missing_imports = True
+
[mypy-xmltodict.*]
ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,7 @@
# cf https://forge.softwareheritage.org/T3815
frozendict != 2.1.2
pyld
+rdflib
sentry-sdk
typing-extensions
xmltodict
diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -12,6 +12,7 @@
from typing import Any, List
from pyld import jsonld
+import rdflib
import swh.indexer
from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA
@@ -61,7 +62,7 @@
uri = jsonld.JsonLdProcessor.get_context_value(
_PROCESSED_CODEMETA_CONTEXT, local_name, "@id"
)
- assert uri.startswith(("@", CODEMETA._uri, SCHEMA._uri)), (local_name, uri)
+ assert uri.startswith(("@", CODEMETA, SCHEMA)), (local_name, uri)
return uri
@@ -92,7 +93,9 @@
# For each of the data source's properties that maps
# to this canonical name
if local_name.strip():
- codemeta_translation[col][local_name.strip()] = canonical_name
+ codemeta_translation[col][local_name.strip()] = rdflib.URIRef(
+ canonical_name
+ )
return (terms, codemeta_translation)
@@ -112,10 +115,10 @@
"documentUrl": url,
"document": CODEMETA_CONTEXT,
}
- elif url == CODEMETA._uri:
+ elif url == CODEMETA:
raise Exception(
"{} is CodeMeta's URI, use {} as context url".format(
- CODEMETA._uri, CODEMETA_CONTEXT_URL
+ CODEMETA, CODEMETA_CONTEXT_URL
)
)
else:
@@ -132,7 +135,7 @@
"""
contexts: List[Any] = [CODEMETA_CONTEXT_URL]
if forgefed:
- contexts.append({"as": ACTIVITYSTREAMS._uri, "forge": FORGEFED._uri})
+ contexts.append({"as": str(ACTIVITYSTREAMS), "forge": str(FORGEFED)})
return jsonld.compact(doc, contexts, options={"documentLoader": _document_loader})
@@ -141,40 +144,6 @@
return jsonld.expand(doc, options={"documentLoader": _document_loader})
-def merge_values(v1, v2):
- """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`,
- returns `{"@list": l1 + l2}`.
- Otherwise, make them lists (if they are not already) and concatenate
- them.
-
- >>> merge_values('a', 'b')
- ['a', 'b']
- >>> merge_values(['a', 'b'], 'c')
- ['a', 'b', 'c']
- >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']})
- {'@list': ['a', 'b', 'c']}
- """
- if v1 is None:
- return v2
- elif v2 is None:
- return v1
- elif isinstance(v1, dict) and set(v1) == {"@list"}:
- assert isinstance(v1["@list"], list)
- if isinstance(v2, dict) and set(v2) == {"@list"}:
- assert isinstance(v2["@list"], list)
- return {"@list": v1["@list"] + v2["@list"]}
- else:
- raise ValueError("Cannot merge %r and %r" % (v1, v2))
- else:
- if isinstance(v2, dict) and "@list" in v2:
- raise ValueError("Cannot merge %r and %r" % (v1, v2))
- if not isinstance(v1, list):
- v1 = [v1]
- if not isinstance(v2, list):
- v2 = [v2]
- return v1 + v2
-
-
def merge_documents(documents):
"""Takes a list of metadata dicts, each generated from a different
metadata file, and merges them.
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -6,14 +6,17 @@
import json
import logging
from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
+import uuid
import xml.parsers.expat
+from pyld import jsonld
+import rdflib
from typing_extensions import TypedDict
import xmltodict
import yaml
-from swh.indexer.codemeta import compact, merge_values
-from swh.indexer.namespaces import SCHEMA
+from swh.indexer.codemeta import _document_loader, compact
+from swh.indexer.namespaces import RDF, SCHEMA
from swh.indexer.storage.interface import Sha1
@@ -25,7 +28,8 @@
TTranslateCallable = TypeVar(
- "TTranslateCallable", bound=Callable[[Any, Dict[str, Any], Any], None]
+ "TTranslateCallable",
+ bound=Callable[[Any, rdflib.Graph, rdflib.term.BNode, Any], None],
)
@@ -145,7 +149,7 @@
def supported_terms(cls):
# one-to-one mapping from the original key to a CodeMeta term
simple_terms = {
- term
+ str(term)
for (key, term) in cls.mapping.items()
if key in cls.string_fields
or hasattr(cls, "normalize_" + cls._normalize_method_name(key))
@@ -153,7 +157,7 @@
# more complex mapping from the original key to JSON-LD
complex_terms = {
- term
+ str(term)
for meth_name in dir(cls)
if meth_name.startswith("translate_")
for term in getattr(getattr(cls, meth_name), "produced_terms", [])
@@ -174,7 +178,20 @@
the indexer
"""
- translated_metadata = {"@type": SCHEMA.SoftwareSourceCode}
+ graph = rdflib.Graph()
+
+    # The main object being described (the SoftwareSourceCode)
+    # may or may not have an id.
+ # Either way, we temporarily use this URI to identify it. Unfortunately,
+ # we cannot use a blank node as we need to use it for JSON-LD framing later,
+ # and blank nodes cannot be used for framing in JSON-LD >= 1.1
+ root_id = (
+ "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/"
+ + str(uuid.uuid4())
+ )
+ root = rdflib.URIRef(root_id)
+ graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode))
+
for k, v in content_dict.items():
# First, check if there is a specific translation
# method for this key
@@ -182,40 +199,66 @@
self, "translate_" + self._normalize_method_name(k), None
)
if translation_method:
- translation_method(translated_metadata, v)
+ translation_method(graph, root, v)
elif k in self.mapping:
# if there is no method, but the key is known from the
# crosswalk table
codemeta_key = self.mapping[k]
- # if there is a normalization method, use it on the value
+ # if there is a normalization method, use it on the value,
+ # and add its results to the triples
normalization_method = getattr(
self, "normalize_" + self._normalize_method_name(k), None
)
if normalization_method:
v = normalization_method(v)
+ if v is None:
+ pass
+ elif isinstance(v, list):
+ for item in reversed(v):
+ graph.add((root, codemeta_key, item))
+ else:
+ graph.add((root, codemeta_key, v))
elif k in self.string_fields and isinstance(v, str):
- pass
+ graph.add((root, codemeta_key, rdflib.Literal(v)))
elif k in self.string_fields and isinstance(v, list):
- v = [x for x in v if isinstance(x, str)]
+ for item in v:
+ graph.add((root, codemeta_key, rdflib.Literal(item)))
else:
continue
- # set the translation metadata with the normalized value
- if codemeta_key in translated_metadata:
- translated_metadata[codemeta_key] = merge_values(
- translated_metadata[codemeta_key], v
- )
- else:
- translated_metadata[codemeta_key] = v
+ self.extra_translation(graph, root, content_dict)
+
+ # Convert from rdflib's internal graph representation to JSON
+ s = graph.serialize(format="application/ld+json")
+
+ # Load from JSON to a list of Python objects
+ jsonld_graph = json.loads(s)
+
+ # Use JSON-LD framing to turn the graph into a rooted tree
+ # frame = {"@type": str(SCHEMA.SoftwareSourceCode)}
+ translated_metadata = jsonld.frame(
+ jsonld_graph,
+ {"@id": root_id},
+ options={
+ "documentLoader": _document_loader,
+ "processingMode": "json-ld-1.1",
+ },
+ )
- self.extra_translation(translated_metadata, content_dict)
+ # Remove the temporary id we added at the beginning
+ if isinstance(translated_metadata["@id"], list):
+ translated_metadata["@id"].remove(root_id)
+ else:
+ del translated_metadata["@id"]
return self.normalize_translation(translated_metadata)
- def extra_translation(self, translated_metadata: Dict[str, Any], d: Dict[str, Any]):
- """Called at the end of the translation process, and may add arbitrary keys
- to ``translated_metadata`` based on the input dictionary (passed as ``d``).
+ def extra_translation(
+ self, graph: rdflib.Graph, root: rdflib.term.Node, d: Dict[str, Any]
+ ):
+ """Called at the end of the translation process, and may add arbitrary triples
+ to ``graph`` based on the input dictionary (passed as ``d``).
"""
pass
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -1,9 +1,21 @@
-from typing import Dict, List, Optional, Union
+# Copyright (C) 2021-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import List
+
+from rdflib import BNode, Graph, Literal, URIRef
+import rdflib.term
from swh.indexer.codemeta import CROSSWALK_TABLE
-from swh.indexer.namespaces import SCHEMA
+from swh.indexer.namespaces import RDF, SCHEMA
from .base import YamlMapping
+from .utils import add_map
+
+DOI = URIRef("https://doi.org/")
+SPDX = URIRef("https://spdx.org/licenses/")
class CffMapping(YamlMapping):
@@ -14,41 +26,41 @@
mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"]
string_fields = ["keywords", "license", "abstract", "version", "doi"]
- def normalize_authors(self, d: List[dict]) -> Dict[str, list]:
- result = []
- for author in d:
- author_data: Dict[str, Optional[Union[str, Dict]]] = {
- "@type": SCHEMA.Person
- }
- if "orcid" in author and isinstance(author["orcid"], str):
- author_data["@id"] = author["orcid"]
- if "affiliation" in author and isinstance(author["affiliation"], str):
- author_data[SCHEMA.affiliation] = {
- "@type": SCHEMA.Organization,
- SCHEMA.name: author["affiliation"],
- }
- if "family-names" in author and isinstance(author["family-names"], str):
- author_data[SCHEMA.familyName] = author["family-names"]
- if "given-names" in author and isinstance(author["given-names"], str):
- author_data[SCHEMA.givenName] = author["given-names"]
-
- result.append(author_data)
-
- result_final = {"@list": result}
- return result_final
-
- def normalize_doi(self, s: str) -> Dict[str, str]:
+ def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node:
+ node: rdflib.term.Node
+ if "orcid" in author and isinstance(author["orcid"], str):
+ node = URIRef(author["orcid"])
+ else:
+ node = BNode()
+ graph.add((node, RDF.type, SCHEMA.Person))
+ if "affiliation" in author and isinstance(author["affiliation"], str):
+ affiliation = BNode()
+ graph.add((node, SCHEMA.affiliation, affiliation))
+ graph.add((affiliation, RDF.type, SCHEMA.Organization))
+ graph.add((affiliation, SCHEMA.name, Literal(author["affiliation"])))
+ if "family-names" in author and isinstance(author["family-names"], str):
+ graph.add((node, SCHEMA.familyName, Literal(author["family-names"])))
+ if "given-names" in author and isinstance(author["given-names"], str):
+ graph.add((node, SCHEMA.givenName, Literal(author["given-names"])))
+ return node
+
+ def translate_authors(
+ self, graph: Graph, root: URIRef, authors: List[dict]
+ ) -> None:
+ add_map(graph, root, SCHEMA.author, self._translate_author, authors)
+
+ def normalize_doi(self, s: str) -> URIRef:
if isinstance(s, str):
- return {"@id": "https://doi.org/" + s}
+ return DOI + s
- def normalize_license(self, s: str) -> Dict[str, str]:
+ def normalize_license(self, s: str) -> URIRef:
if isinstance(s, str):
- return {"@id": "https://spdx.org/licenses/" + s}
+ return SPDX + s
- def normalize_repository_code(self, s: str) -> Dict[str, str]:
+ def normalize_repository_code(self, s: str) -> URIRef:
if isinstance(s, str):
- return {"@id": s}
+ return URIRef(s)
- def normalize_date_released(self, s: str) -> Dict[str, str]:
+ def normalize_date_released(self, s: str) -> Literal:
if isinstance(s, str):
- return {"@value": s, "@type": SCHEMA.Date}
+ return Literal(s, datatype=SCHEMA.Date)
diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py
--- a/swh/indexer/metadata_dictionary/composer.py
+++ b/swh/indexer/metadata_dictionary/composer.py
@@ -4,11 +4,18 @@
# See top-level LICENSE file for more information
import os.path
+from typing import Optional
+
+from rdflib import BNode, Graph, Literal, URIRef
from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
-from swh.indexer.namespaces import SCHEMA
+from swh.indexer.namespaces import RDF, SCHEMA
from .base import JsonMapping, SingleFileIntrinsicMapping
+from .utils import add_map
+
+SPDX = URIRef("https://spdx.org/licenses/")
+
COMPOSER_TABLE_PATH = os.path.join(_DATA_DIR, "composer.csv")
@@ -35,23 +42,24 @@
def normalize_homepage(self, s):
if isinstance(s, str):
- return {"@id": s}
+ return URIRef(s)
def normalize_license(self, s):
if isinstance(s, str):
- return {"@id": "https://spdx.org/licenses/" + s}
+ return SPDX + s
- def normalize_authors(self, author_list):
- authors = []
- for author in author_list:
- author_obj = {"@type": SCHEMA.Person}
+ def _translate_author(self, graph: Graph, author) -> Optional[BNode]:
+ if not isinstance(author, dict):
+ return None
+ node = BNode()
+ graph.add((node, RDF.type, SCHEMA.Person))
- if isinstance(author, dict):
- if isinstance(author.get("name", None), str):
- author_obj[SCHEMA.name] = author.get("name", None)
- if isinstance(author.get("email", None), str):
- author_obj[SCHEMA.email] = author.get("email", None)
+ if isinstance(author.get("name"), str):
+ graph.add((node, SCHEMA.name, Literal(author["name"])))
+ if isinstance(author.get("email"), str):
+ graph.add((node, SCHEMA.email, Literal(author["email"])))
- authors.append(author_obj)
+ return node
- return {"@list": authors}
+ def translate_authors(self, graph: Graph, root: URIRef, authors) -> None:
+ add_map(graph, root, SCHEMA.author, self._translate_author, authors)
diff --git a/swh/indexer/metadata_dictionary/dart.py b/swh/indexer/metadata_dictionary/dart.py
--- a/swh/indexer/metadata_dictionary/dart.py
+++ b/swh/indexer/metadata_dictionary/dart.py
@@ -6,10 +6,15 @@
import os.path
import re
+from rdflib import RDF, BNode, Graph, Literal, URIRef
+
from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
from swh.indexer.namespaces import SCHEMA
from .base import YamlMapping
+from .utils import add_map
+
+SPDX = URIRef("https://spdx.org/licenses/")
PUB_TABLE_PATH = os.path.join(_DATA_DIR, "pubspec.csv")
@@ -43,33 +48,32 @@
def normalize_license(self, s):
if isinstance(s, str):
- return {"@id": "https://spdx.org/licenses/" + s}
+ return SPDX + s
def normalize_homepage(self, s):
if isinstance(s, str):
- return {"@id": s}
+ return URIRef(s)
- def normalize_author(self, s):
- name_email_regex = "(?P<name>.*?)( <(?P<email>.*)>)"
- author = {"@type": SCHEMA.Person}
+ def _translate_author(self, graph, s):
+ name_email_re = re.compile("(?P<name>.*?)( <(?P<email>.*)>)")
if isinstance(s, str):
- match = re.search(name_email_regex, s)
+ author = BNode()
+ graph.add((author, RDF.type, SCHEMA.Person))
+ match = name_email_re.search(s)
if match:
name = match.group("name")
email = match.group("email")
- author[SCHEMA.email] = email
+ graph.add((author, SCHEMA.email, Literal(email)))
else:
name = s
- author[SCHEMA.name] = name
+ graph.add((author, SCHEMA.name, Literal(name)))
- return {"@list": [author]}
+ return author
- def normalize_authors(self, authors_list):
- authors = {"@list": []}
+ def translate_author(self, graph: Graph, root, s) -> None:
+ add_map(graph, root, SCHEMA.author, self._translate_author, [s])
- if isinstance(authors_list, list):
- for s in authors_list:
- author = self.normalize_author(s)["@list"]
- authors["@list"] += author
- return authors
+ def translate_authors(self, graph: Graph, root, authors) -> None:
+ if isinstance(authors, list):
+ add_map(graph, root, SCHEMA.author, self._translate_author, authors)
diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
--- a/swh/indexer/metadata_dictionary/github.py
+++ b/swh/indexer/metadata_dictionary/github.py
@@ -3,17 +3,17 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
-from typing import Any, Dict, Tuple
+from typing import Any, Tuple
+
+from rdflib import RDF, BNode, Graph, Literal, URIRef
from swh.indexer.codemeta import CROSSWALK_TABLE
-from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED
+from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA
from .base import BaseExtrinsicMapping, JsonMapping, produce_terms
+from .utils import prettyprint_graph # noqa
-
-def _prettyprint(d):
- print(json.dumps(d, indent=4))
+SPDX = URIRef("https://spdx.org/licenses/")
class GitHubMapping(BaseExtrinsicMapping, JsonMapping):
@@ -33,94 +33,81 @@
def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
return ("application/vnd.github.v3+json",)
- def _translate_dict(self, content_dict: Dict[str, Any], **kwargs) -> Dict[str, Any]:
- d = super()._translate_dict(content_dict, **kwargs)
- d["type"] = FORGEFED.Repository
- return d
+ def extra_translation(self, graph, root, content_dict):
+ graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode))
+ graph.add((root, RDF.type, FORGEFED.Repository))
@produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems)
- def translate_forks_count(
- self, translated_metadata: Dict[str, Any], v: Any
- ) -> None:
+ def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None:
"""
- >>> translated_metadata = {}
- >>> GitHubMapping().translate_forks_count(translated_metadata, 42)
- >>> _prettyprint(translated_metadata)
+ >>> graph = Graph()
+ >>> root = URIRef("http://example.org/test-software")
+ >>> GitHubMapping().translate_forks_count(graph, root, 42)
+ >>> prettyprint_graph(graph, root)
{
- "https://forgefed.org/ns#forks": [
- {
- "@type": "https://www.w3.org/ns/activitystreams#OrderedCollection",
- "https://www.w3.org/ns/activitystreams#totalItems": 42
- }
- ]
+ "@id": ...,
+ "https://forgefed.org/ns#forks": {
+ "@type": "https://www.w3.org/ns/activitystreams#OrderedCollection",
+ "https://www.w3.org/ns/activitystreams#totalItems": 42
+ }
}
"""
if isinstance(v, int):
- translated_metadata.setdefault(FORGEFED.forks, []).append(
- {
- "@type": ACTIVITYSTREAMS.OrderedCollection,
- ACTIVITYSTREAMS.totalItems: v,
- }
- )
+ collection = BNode()
+ graph.add((root, FORGEFED.forks, collection))
+ graph.add((collection, RDF.type, ACTIVITYSTREAMS.OrderedCollection))
+ graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))
@produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems)
- def translate_stargazers_count(
- self, translated_metadata: Dict[str, Any], v: Any
- ) -> None:
+ def translate_stargazers_count(self, graph: Graph, root: BNode, v: Any) -> None:
"""
- >>> translated_metadata = {}
- >>> GitHubMapping().translate_stargazers_count(translated_metadata, 42)
- >>> _prettyprint(translated_metadata)
+ >>> graph = Graph()
+ >>> root = URIRef("http://example.org/test-software")
+ >>> GitHubMapping().translate_stargazers_count(graph, root, 42)
+ >>> prettyprint_graph(graph, root)
{
- "https://www.w3.org/ns/activitystreams#likes": [
- {
- "@type": "https://www.w3.org/ns/activitystreams#Collection",
- "https://www.w3.org/ns/activitystreams#totalItems": 42
- }
- ]
+ "@id": ...,
+ "https://www.w3.org/ns/activitystreams#likes": {
+ "@type": "https://www.w3.org/ns/activitystreams#Collection",
+ "https://www.w3.org/ns/activitystreams#totalItems": 42
+ }
}
"""
if isinstance(v, int):
- translated_metadata.setdefault(ACTIVITYSTREAMS.likes, []).append(
- {
- "@type": ACTIVITYSTREAMS.Collection,
- ACTIVITYSTREAMS.totalItems: v,
- }
- )
+ collection = BNode()
+ graph.add((root, ACTIVITYSTREAMS.likes, collection))
+ graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection))
+ graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))
@produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems)
- def translate_watchers_count(
- self, translated_metadata: Dict[str, Any], v: Any
- ) -> None:
+ def translate_watchers_count(self, graph: Graph, root: BNode, v: Any) -> None:
"""
- >>> translated_metadata = {}
- >>> GitHubMapping().translate_watchers_count(translated_metadata, 42)
- >>> _prettyprint(translated_metadata)
+ >>> graph = Graph()
+ >>> root = URIRef("http://example.org/test-software")
+ >>> GitHubMapping().translate_watchers_count(graph, root, 42)
+ >>> prettyprint_graph(graph, root)
{
- "https://www.w3.org/ns/activitystreams#followers": [
- {
- "@type": "https://www.w3.org/ns/activitystreams#Collection",
- "https://www.w3.org/ns/activitystreams#totalItems": 42
- }
- ]
+ "@id": ...,
+ "https://www.w3.org/ns/activitystreams#followers": {
+ "@type": "https://www.w3.org/ns/activitystreams#Collection",
+ "https://www.w3.org/ns/activitystreams#totalItems": 42
+ }
}
"""
if isinstance(v, int):
- translated_metadata.setdefault(ACTIVITYSTREAMS.followers, []).append(
- {
- "@type": ACTIVITYSTREAMS.Collection,
- ACTIVITYSTREAMS.totalItems: v,
- }
- )
+ collection = BNode()
+ graph.add((root, ACTIVITYSTREAMS.followers, collection))
+ graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection))
+ graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))
def normalize_license(self, d):
"""
>>> GitHubMapping().normalize_license({'spdx_id': 'MIT'})
- {'@id': 'https://spdx.org/licenses/MIT'}
+ rdflib.term.URIRef('https://spdx.org/licenses/MIT')
"""
if isinstance(d, dict) and isinstance(d.get("spdx_id"), str):
- return {"@id": "https://spdx.org/licenses/" + d["spdx_id"]}
+ return SPDX + d["spdx_id"]
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -6,10 +6,13 @@
import os
from typing import Any, Dict
+from rdflib import Graph, Literal, URIRef
+
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA
from .base import SingleFileIntrinsicMapping, XmlMapping
+from .utils import prettyprint_graph # noqa
class MavenMapping(XmlMapping, SingleFileIntrinsicMapping):
@@ -27,14 +30,13 @@
def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
return super()._translate_dict(d.get("project") or {})
- def extra_translation(self, translated_metadata, d):
- repositories = self.parse_repositories(d)
- if repositories:
- translated_metadata[SCHEMA.codeRepository] = repositories
+ def extra_translation(self, graph: Graph, root, d):
+ self.parse_repositories(graph, root, d)
- def parse_repositories(self, d):
+ def parse_repositories(self, graph: Graph, root, d):
"""https://maven.apache.org/pom.html#Repositories
+ >>> import rdflib
>>> import xmltodict
>>> from pprint import pprint
>>> d = xmltodict.parse('''
@@ -47,21 +49,19 @@
... </repository>
... </repositories>
... ''')
- >>> MavenMapping().parse_repositories(d)
+ >>> MavenMapping().parse_repositories(rdflib.Graph(), rdflib.BNode(), d)
"""
repositories = d.get("repositories")
if not repositories:
- results = [self.parse_repository(d, self._default_repository)]
+ self.parse_repository(graph, root, d, self._default_repository)
elif isinstance(repositories, dict):
repositories = repositories.get("repository") or []
if not isinstance(repositories, list):
repositories = [repositories]
- results = [self.parse_repository(d, repo) for repo in repositories]
- else:
- results = []
- return [res for res in results if res] or None
+ for repo in repositories:
+ self.parse_repository(graph, root, d, repo)
- def parse_repository(self, d, repo):
+ def parse_repository(self, graph: Graph, root, d, repo):
if not isinstance(repo, dict):
return
if repo.get("layout", "default") != "default":
@@ -75,23 +75,18 @@
and isinstance(artifact_id, str)
):
repo = os.path.join(url, *group_id.split("."), artifact_id)
- return {"@id": repo}
+ graph.add((root, SCHEMA.codeRepository, URIRef(repo)))
def normalize_groupId(self, id_):
"""https://maven.apache.org/pom.html#Maven_Coordinates
>>> MavenMapping().normalize_groupId('org.example')
- {'@id': 'org.example'}
+ rdflib.term.Literal('org.example')
"""
if isinstance(id_, str):
- return {"@id": id_}
-
- def translate_licenses(self, translated_metadata, d):
- licenses = self.parse_licenses(d)
- if licenses:
- translated_metadata[SCHEMA.license] = licenses
+ return Literal(id_)
- def parse_licenses(self, licenses):
+ def translate_licenses(self, graph, root, licenses):
"""https://maven.apache.org/pom.html#Licenses
>>> import xmltodict
@@ -113,8 +108,16 @@
}
}
}
- >>> MavenMapping().parse_licenses(d["licenses"])
- [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}]
+ >>> graph = Graph()
+ >>> root = URIRef("http://example.org/test-software")
+ >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
+ >>> prettyprint_graph(graph, root)
+ {
+ "@id": ...,
+ "http://schema.org/license": {
+ "@id": "https://www.apache.org/licenses/LICENSE-2.0.txt"
+ }
+ }
or, if there are more than one license:
@@ -132,9 +135,16 @@
... </license>
... </licenses>
... ''')
- >>> pprint(MavenMapping().parse_licenses(d["licenses"]))
- [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'},
- {'@id': 'https://opensource.org/licenses/MIT'}]
+ >>> graph = Graph()
+ >>> root = URIRef("http://example.org/test-software")
+ >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
+ >>> pprint(set(graph.triples((root, URIRef("http://schema.org/license"), None))))
+ {(rdflib.term.URIRef('http://example.org/test-software'),
+ rdflib.term.URIRef('http://schema.org/license'),
+ rdflib.term.URIRef('https://opensource.org/licenses/MIT')),
+ (rdflib.term.URIRef('http://example.org/test-software'),
+ rdflib.term.URIRef('http://schema.org/license'),
+ rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}
"""
if not isinstance(licenses, dict):
@@ -144,8 +154,6 @@
licenses = [licenses]
elif not isinstance(licenses, list):
return
- return [
- {"@id": license["url"]}
- for license in licenses
- if isinstance(license, dict) and isinstance(license.get("url"), str)
- ] or None
+ for license in licenses:
+ if isinstance(license, dict) and isinstance(license.get("url"), str):
+ graph.add((root, SCHEMA.license, URIRef(license["url"])))
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -6,10 +6,15 @@
import re
import urllib.parse
+from rdflib import RDF, BNode, Graph, Literal, URIRef
+
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA
from .base import JsonMapping, SingleFileIntrinsicMapping
+from .utils import add_list, prettyprint_graph # noqa
+
+SPDX = URIRef("https://spdx.org/licenses/")
class NpmMapping(JsonMapping, SingleFileIntrinsicMapping):
@@ -38,13 +43,13 @@
... 'type': 'git',
... 'url': 'https://example.org/foo.git'
... })
- {'@id': 'git+https://example.org/foo.git'}
+ rdflib.term.URIRef('git+https://example.org/foo.git')
>>> NpmMapping().normalize_repository(
... 'gitlab:foo/bar')
- {'@id': 'git+https://gitlab.com/foo/bar.git'}
+ rdflib.term.URIRef('git+https://gitlab.com/foo/bar.git')
>>> NpmMapping().normalize_repository(
... 'foo/bar')
- {'@id': 'git+https://github.com/foo/bar.git'}
+ rdflib.term.URIRef('git+https://github.com/foo/bar.git')
"""
if (
isinstance(d, dict)
@@ -67,7 +72,7 @@
else:
return None
- return {"@id": url}
+ return URIRef(url)
def normalize_bugs(self, d):
"""https://docs.npmjs.com/files/package.json#bugs
@@ -76,15 +81,15 @@
... 'url': 'https://example.org/bugs/',
... 'email': 'bugs@example.org'
... })
- {'@id': 'https://example.org/bugs/'}
+ rdflib.term.URIRef('https://example.org/bugs/')
>>> NpmMapping().normalize_bugs(
... 'https://example.org/bugs/')
- {'@id': 'https://example.org/bugs/'}
+ rdflib.term.URIRef('https://example.org/bugs/')
"""
if isinstance(d, dict) and isinstance(d.get("url"), str):
- return {"@id": d["url"]}
+ return URIRef(d["url"])
elif isinstance(d, str):
- return {"@id": d}
+ return URIRef(d)
else:
return None
@@ -92,36 +97,75 @@
r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$"
)
- def normalize_author(self, d):
+ def translate_author(self, graph: Graph, root, d):
r"""https://docs.npmjs.com/files/package.json#people-fields-author-contributors'
>>> from pprint import pprint
- >>> pprint(NpmMapping().normalize_author({
+ >>> root = URIRef("http://example.org/test-software")
+ >>> graph = Graph()
+ >>> NpmMapping().translate_author(graph, root, {
... 'name': 'John Doe',
... 'email': 'john.doe@example.org',
... 'url': 'https://example.org/~john.doe',
- ... }))
- {'@list': [{'@type': 'http://schema.org/Person',
- 'http://schema.org/email': 'john.doe@example.org',
- 'http://schema.org/name': 'John Doe',
- 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]}
- >>> pprint(NpmMapping().normalize_author(
+ ... })
+ >>> prettyprint_graph(graph, root)
+ {
+ "@id": ...,
+ "http://schema.org/author": {
+ "@list": [
+ {
+ "@type": "http://schema.org/Person",
+ "http://schema.org/email": "john.doe@example.org",
+ "http://schema.org/name": "John Doe",
+ "http://schema.org/url": {
+ "@id": "https://example.org/~john.doe"
+ }
+ }
+ ]
+ }
+ }
+ >>> graph = Graph()
+ >>> NpmMapping().translate_author(graph, root,
... 'John Doe <john.doe@example.org> (https://example.org/~john.doe)'
- ... ))
- {'@list': [{'@type': 'http://schema.org/Person',
- 'http://schema.org/email': 'john.doe@example.org',
- 'http://schema.org/name': 'John Doe',
- 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]}
- >>> pprint(NpmMapping().normalize_author({
+ ... )
+ >>> prettyprint_graph(graph, root)
+ {
+ "@id": ...,
+ "http://schema.org/author": {
+ "@list": [
+ {
+ "@type": "http://schema.org/Person",
+ "http://schema.org/email": "john.doe@example.org",
+ "http://schema.org/name": "John Doe",
+ "http://schema.org/url": {
+ "@id": "https://example.org/~john.doe"
+ }
+ }
+ ]
+ }
+ }
+ >>> graph = Graph()
+ >>> NpmMapping().translate_author(graph, root, {
... 'name': 'John Doe',
... 'email': 'john.doe@example.org',
... 'url': 'https:\\\\example.invalid/~john.doe',
- ... }))
- {'@list': [{'@type': 'http://schema.org/Person',
- 'http://schema.org/email': 'john.doe@example.org',
- 'http://schema.org/name': 'John Doe'}]}
+ ... })
+ >>> prettyprint_graph(graph, root)
+ {
+ "@id": ...,
+ "http://schema.org/author": {
+ "@list": [
+ {
+ "@type": "http://schema.org/Person",
+ "http://schema.org/email": "john.doe@example.org",
+ "http://schema.org/name": "John Doe"
+ }
+ ]
+ }
+ }
""" # noqa
- author = {"@type": SCHEMA.Person}
+ author = BNode()
+ graph.add((author, RDF.type, SCHEMA.Person))
if isinstance(d, dict):
name = d.get("name", None)
email = d.get("email", None)
@@ -137,32 +181,32 @@
return None
if name and isinstance(name, str):
- author[SCHEMA.name] = name
+ graph.add((author, SCHEMA.name, Literal(name)))
if email and isinstance(email, str):
- author[SCHEMA.email] = email
+ graph.add((author, SCHEMA.email, Literal(email)))
if url and isinstance(url, str):
# Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
# URLs that are blatantly invalid early, so PyLD does not crash.
parsed_url = urllib.parse.urlparse(url)
if parsed_url.netloc:
- author[SCHEMA.url] = {"@id": url}
+ graph.add((author, SCHEMA.url, URIRef(url)))
- return {"@list": [author]}
+ add_list(graph, root, SCHEMA.author, [author])
def normalize_description(self, description):
r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common
mistake that causes issues in the database because of null bytes in JSON.
>>> NpmMapping().normalize_description("foo bar")
- 'foo bar'
+ rdflib.term.Literal('foo bar')
>>> NpmMapping().normalize_description(
... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
... )
- 'foo bar'
+ rdflib.term.Literal('foo bar')
>>> NpmMapping().normalize_description(
... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
... )
- 'foo bar'
+ rdflib.term.Literal('foo bar')
>>> NpmMapping().normalize_description(
... # invalid UTF-16 and meaningless UTF-8:
... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
@@ -213,32 +257,34 @@
if description:
if description.startswith("# "):
description = description[2:]
- return description.rstrip()
- return description
+ return Literal(description.rstrip())
+ else:
+ return None
+ return Literal(description)
def normalize_license(self, s):
"""https://docs.npmjs.com/files/package.json#license
>>> NpmMapping().normalize_license('MIT')
- {'@id': 'https://spdx.org/licenses/MIT'}
+ rdflib.term.URIRef('https://spdx.org/licenses/MIT')
"""
if isinstance(s, str):
- return {"@id": "https://spdx.org/licenses/" + s}
+ return SPDX + s
def normalize_homepage(self, s):
"""https://docs.npmjs.com/files/package.json#homepage
>>> NpmMapping().normalize_homepage('https://example.org/~john.doe')
- {'@id': 'https://example.org/~john.doe'}
+ rdflib.term.URIRef('https://example.org/~john.doe')
"""
if isinstance(s, str):
- return {"@id": s}
+ return URIRef(s)
def normalize_keywords(self, lst):
"""https://docs.npmjs.com/files/package.json#homepage
>>> NpmMapping().normalize_keywords(['foo', 'bar'])
- ['foo', 'bar']
+ [rdflib.term.Literal('foo'), rdflib.term.Literal('bar')]
"""
if isinstance(lst, list):
- return [x for x in lst if isinstance(x, str)]
+ return [Literal(x) for x in lst if isinstance(x, str)]
diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
--- a/swh/indexer/metadata_dictionary/nuget.py
+++ b/swh/indexer/metadata_dictionary/nuget.py
@@ -7,17 +7,22 @@
import re
from typing import Any, Dict, List
+from rdflib import RDF, BNode, Graph, Literal, URIRef
+
from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
from swh.indexer.namespaces import SCHEMA
from swh.indexer.storage.interface import Sha1
from .base import BaseIntrinsicMapping, DirectoryLsEntry, XmlMapping
+from .utils import add_list
NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")
with open(NUGET_TABLE_PATH) as fd:
(CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd)
+SPDX = URIRef("https://spdx.org/licenses/")
+
class NuGetMapping(XmlMapping, BaseIntrinsicMapping):
"""
@@ -26,8 +31,8 @@
name = "nuget"
mapping = NUGET_TABLE["NuGet"]
- mapping["copyright"] = "http://schema.org/copyrightNotice"
- mapping["language"] = "http://schema.org/inLanguage"
+ mapping["copyright"] = URIRef("http://schema.org/copyrightNotice")
+ mapping["language"] = URIRef("http://schema.org/inLanguage")
string_fields = [
"description",
"version",
@@ -53,12 +58,12 @@
def normalize_projectUrl(self, s):
if isinstance(s, str):
- return {"@id": s}
+ return URIRef(s)
- def translate_repository(self, translated_metadata, v):
+ def translate_repository(self, graph, root, v):
if isinstance(v, dict) and isinstance(v["@url"], str):
- codemeta_key = self.mapping["repository.url"]
- translated_metadata[codemeta_key] = {"@id": v["@url"]}
+ codemeta_key = URIRef(self.mapping["repository.url"])
+ graph.add((root, codemeta_key, URIRef(v["@url"])))
def normalize_license(self, v):
if isinstance(v, dict) and v["@type"] == "expression":
@@ -67,7 +72,7 @@
re.search(r" with |\(|\)| and ", license_string, re.IGNORECASE)
):
return [
- {"@id": "https://spdx.org/licenses/" + license_type.strip()}
+ SPDX + license_type.strip()
for license_type in re.split(
r" or ", license_string, flags=re.IGNORECASE
)
@@ -77,22 +82,23 @@
def normalize_licenseUrl(self, s):
if isinstance(s, str):
- return {"@id": s}
+ return URIRef(s)
- def normalize_authors(self, s):
+ def translate_authors(self, graph: Graph, root, s):
if isinstance(s, str):
- author_names = [a.strip() for a in s.split(",")]
- authors = [
- {"@type": SCHEMA.Person, SCHEMA.name: name} for name in author_names
- ]
- return {"@list": authors}
-
- def translate_releaseNotes(self, translated_metadata, s):
+ authors = []
+ for author_name in s.split(","):
+ author_name = author_name.strip()
+ author = BNode()
+ graph.add((author, RDF.type, SCHEMA.Person))
+ graph.add((author, SCHEMA.name, Literal(author_name)))
+ authors.append(author)
+ add_list(graph, root, SCHEMA.author, authors)
+
+ def translate_releaseNotes(self, graph: Graph, root, s):
if isinstance(s, str):
- translated_metadata.setdefault("http://schema.org/releaseNotes", []).append(
- s
- )
+ graph.add((root, SCHEMA.releaseNotes, Literal(s)))
def normalize_tags(self, s):
if isinstance(s, str):
- return s.split(" ")
+ return [Literal(tag) for tag in s.split(" ")]
diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py
--- a/swh/indexer/metadata_dictionary/python.py
+++ b/swh/indexer/metadata_dictionary/python.py
@@ -1,16 +1,18 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import email.parser
import email.policy
-import itertools
+
+from rdflib import BNode, Literal, URIRef
from swh.indexer.codemeta import CROSSWALK_TABLE
-from swh.indexer.namespaces import SCHEMA
+from swh.indexer.namespaces import RDF, SCHEMA
from .base import DictMapping, SingleFileIntrinsicMapping
+from .utils import add_list
_normalize_pkginfo_key = str.lower
@@ -54,25 +56,25 @@
d.setdefault(key, []).append(value)
return self._translate_dict(d)
- def extra_translation(self, translated_metadata, d):
- author_name = translated_metadata.pop(SCHEMA.author, None)
- author_email = translated_metadata.pop(SCHEMA.email, None)
- if author_name or author_email:
- translated_metadata[SCHEMA.author] = {
- "@list": [
- {
- "@type": SCHEMA.Person,
- SCHEMA.name: author_name,
- SCHEMA.email: author_email,
- }
- ]
- }
+ def extra_translation(self, graph, root, d):
+ author_names = list(graph.triples((root, SCHEMA.author, None)))
+ author_emails = list(graph.triples((root, SCHEMA.email, None)))
+ graph.remove((root, SCHEMA.author, None))
+ graph.remove((root, SCHEMA.email, None))
+ if author_names or author_emails:
+ author = BNode()
+ graph.add((author, RDF.type, SCHEMA.Person))
+ for (_, _, author_name) in author_names:
+ graph.add((author, SCHEMA.name, author_name))
+ for (_, _, author_email) in author_emails:
+ graph.add((author, SCHEMA.email, author_email))
+ add_list(graph, root, SCHEMA.author, [author])
def normalize_home_page(self, urls):
- return [{"@id": url} for url in urls]
+ return [URIRef(url) for url in urls]
def normalize_keywords(self, keywords):
- return list(itertools.chain.from_iterable(s.split(" ") for s in keywords))
+ return [Literal(keyword) for s in keywords for keyword in s.split(" ")]
def normalize_license(self, licenses):
- return [{"@id": license} for license in licenses]
+ return [URIRef("https://spdx.org/licenses/" + license) for license in licenses]
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
--- a/swh/indexer/metadata_dictionary/ruby.py
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -8,19 +8,26 @@
import re
from typing import List
+from rdflib import RDF, BNode, Graph, Literal, URIRef
+
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
from swh.indexer.namespaces import SCHEMA
from swh.indexer.storage.interface import Sha1
from .base import BaseIntrinsicMapping, DictMapping
+from .utils import add_map
+
+SPDX = URIRef("https://spdx.org/licenses/")
-def name_to_person(name):
- return {
- "@type": SCHEMA.Person,
- SCHEMA.name: name,
- }
+def name_to_person(graph: Graph, name):
+ if not isinstance(name, str):
+ return None
+ author = BNode()
+ graph.add((author, RDF.type, SCHEMA.Person))
+ graph.add((author, SCHEMA.name, Literal(name)))
+ return author
class GemspecMapping(BaseIntrinsicMapping, DictMapping):
@@ -107,30 +114,20 @@
def normalize_homepage(self, s):
if isinstance(s, str):
- return {"@id": s}
+ return URIRef(s)
def normalize_license(self, s):
if isinstance(s, str):
- return [{"@id": "https://spdx.org/licenses/" + s}]
+ return SPDX + s
def normalize_licenses(self, licenses):
if isinstance(licenses, list):
- return [
- {"@id": "https://spdx.org/licenses/" + license}
- for license in licenses
- if isinstance(license, str)
- ]
+ return [SPDX + license for license in licenses if isinstance(license, str)]
- def normalize_author(self, author):
+ def translate_author(self, graph: Graph, root, author):
if isinstance(author, str):
- return {"@list": [name_to_person(author)]}
+ add_map(graph, root, SCHEMA.author, name_to_person, [author])
- def normalize_authors(self, authors):
+ def translate_authors(self, graph: Graph, root, authors):
if isinstance(authors, list):
- return {
- "@list": [
- name_to_person(author)
- for author in authors
- if isinstance(author, str)
- ]
- }
+ add_map(graph, root, SCHEMA.author, name_to_person, authors)
diff --git a/swh/indexer/metadata_dictionary/utils.py b/swh/indexer/metadata_dictionary/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata_dictionary/utils.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+import json
+from typing import Callable, Iterable, Optional, Sequence, TypeVar
+
+from pyld import jsonld
+from rdflib import RDF, Graph, URIRef
+import rdflib.term
+
+from swh.indexer.codemeta import _document_loader
+
+
+def prettyprint_graph(graph: Graph, root: URIRef):
+ s = graph.serialize(format="application/ld+json")
+ jsonld_graph = json.loads(s)
+ translated_metadata = jsonld.frame(
+ jsonld_graph,
+ {"@id": str(root)},
+ options={
+ "documentLoader": _document_loader,
+ "processingMode": "json-ld-1.1",
+ },
+ )
+ print(json.dumps(translated_metadata, indent=4))
+
+
+def add_list(
+ graph: Graph,
+ subject: rdflib.term.Node,
+ predicate: rdflib.term.Identifier,
+ objects: Sequence[rdflib.term.Node],
+) -> None:
+ """Adds triples to the ``graph`` so that they are equivalent to this
+ JSON-LD object::
+
+ {
+ "@id": subject,
+ predicate: {"@list": objects}
+ }
+
+ This is a naive implementation of
+ https://json-ld.org/spec/latest/json-ld-api/#list-to-rdf-conversion
+ """
+ # JSON-LD's @list is syntactic sugar for a linked list / chain in the RDF graph,
+ # which is what we are going to construct, starting from the end:
+ last_link: rdflib.term.Node
+ last_link = RDF.nil
+ for item in reversed(objects):
+ link = rdflib.BNode()
+ graph.add((link, RDF.first, item))
+ graph.add((link, RDF.rest, last_link))
+ last_link = link
+ graph.add((subject, predicate, last_link))
+
+
+TValue = TypeVar("TValue")
+
+
+def add_map(
+ graph: Graph,
+ subject: rdflib.term.Node,
+ predicate: rdflib.term.Identifier,
+ f: Callable[[Graph, TValue], Optional[rdflib.term.Node]],
+ values: Iterable[TValue],
+) -> None:
+ """Helper for :func:`add_list` that takes a mapper function ``f``."""
+ nodes = [f(graph, value) for value in values]
+ add_list(graph, subject, predicate, [node for node in nodes if node])
diff --git a/swh/indexer/namespaces.py b/swh/indexer/namespaces.py
--- a/swh/indexer/namespaces.py
+++ b/swh/indexer/namespaces.py
@@ -3,24 +3,8 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
-class _Namespace:
- """Handy class to get terms within a namespace by accessing them as attributes.
-
- This is similar to `rdflib's namespaces
- <https://rdflib.readthedocs.io/en/stable/namespaces_and_bindings.html>`__
- """
-
- def __init__(self, uri: str):
- if not uri.endswith(("#", "/")):
- # Sanity check, to make sure it doesn't end with an alphanumerical
- # character, which is very likely to be invalid.
- raise ValueError(f"Invalid trailing character for namespace URI: {uri}")
- self._uri = uri
-
- def __getattr__(self, term: str) -> str:
- return self._uri + term
-
+from rdflib import Namespace as _Namespace
+from rdflib import RDF # noqa
SCHEMA = _Namespace("http://schema.org/")
CODEMETA = _Namespace("https://codemeta.github.io/terms/")
diff --git a/swh/indexer/tests/metadata_dictionary/test_cff.py b/swh/indexer/tests/metadata_dictionary/test_cff.py
--- a/swh/indexer/tests/metadata_dictionary/test_cff.py
+++ b/swh/indexer/tests/metadata_dictionary/test_cff.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -44,6 +44,13 @@
"utf-8"
)
+ result = MAPPINGS["CffMapping"]().translate(content)
+ assert set(result.pop("keywords")) == {
+ "citation",
+ "bibliography",
+ "cff",
+ "CITATION.cff",
+ }
expected = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
@@ -76,12 +83,10 @@
Citation File Format to various other formats such as BibTeX, EndNote, \
RIS, schema.org, CodeMeta, and .zenodo.json.""",
"identifier": "https://doi.org/10.5281/zenodo.1162057",
- "keywords": ["citation", "bibliography", "cff", "CITATION.cff"],
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "1.4.0-alpha0",
}
- result = MAPPINGS["CffMapping"]().translate(content)
assert expected == result
diff --git a/swh/indexer/tests/metadata_dictionary/test_composer.py b/swh/indexer/tests/metadata_dictionary/test_composer.py
--- a/swh/indexer/tests/metadata_dictionary/test_composer.py
+++ b/swh/indexer/tests/metadata_dictionary/test_composer.py
@@ -60,11 +60,16 @@
result = MAPPINGS["ComposerMapping"]().translate(raw_content)
+ assert set(result.pop("keywords")) == {
+ "polyfill",
+ "shim",
+ "compatibility",
+ "portable",
+ }, result
expected = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "symfony/polyfill-mbstring",
- "keywords": ["polyfill", "shim", "compatibility", "portable"],
"description": "Symfony polyfill for the Mbstring extension",
"url": "https://symfony.com",
"license": "https://spdx.org/licenses/MIT",
diff --git a/swh/indexer/tests/metadata_dictionary/test_dart.py b/swh/indexer/tests/metadata_dictionary/test_dart.py
--- a/swh/indexer/tests/metadata_dictionary/test_dart.py
+++ b/swh/indexer/tests/metadata_dictionary/test_dart.py
@@ -3,6 +3,8 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import pytest
+
from swh.indexer.metadata_dictionary import MAPPINGS
@@ -41,17 +43,17 @@
result = MAPPINGS["PubMapping"]().translate(raw_content)
+ assert set(result.pop("keywords")) == {
+ "polyfill",
+ "shim",
+ "compatibility",
+ "portable",
+ "mbstring",
+ }, result
expected = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "newtify",
- "keywords": [
- "polyfill",
- "shim",
- "compatibility",
- "portable",
- "mbstring",
- ],
"description": """Have you been turned into a newt? Would you like to be? \
This package can help. It has all of the \
newt-transmogrification functionality you have been looking \
@@ -109,6 +111,7 @@
assert result == expected
+@pytest.mark.xfail(reason="https://github.com/w3c/json-ld-api/issues/547")
def test_normalize_author_authors_pubspec():
raw_content = """
authors:
diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py
--- a/swh/indexer/tests/metadata_dictionary/test_github.py
+++ b/swh/indexer/tests/metadata_dictionary/test_github.py
@@ -120,7 +120,7 @@
result = MAPPINGS["GitHubMapping"]().translate(content)
assert result == {
"@context": CONTEXT,
- "type": "https://forgefed.org/ns#Repository",
+ "type": "forge:Repository",
"forge:forks": {
"as:totalItems": 1,
"type": "as:OrderedCollection",
diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py
--- a/swh/indexer/tests/metadata_dictionary/test_maven.py
+++ b/swh/indexer/tests/metadata_dictionary/test_maven.py
@@ -45,7 +45,7 @@
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
- "identifier": "com.mycompany.app",
+ "schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
"codeRepository": ("http://repo1.maven.org/maven2/com/mycompany/app/my-app"),
@@ -167,7 +167,7 @@
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
- "identifier": "com.mycompany.app",
+ "schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
@@ -191,7 +191,7 @@
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
- "identifier": "com.mycompany.app",
+ "schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
@@ -211,7 +211,7 @@
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
- "identifier": "com.mycompany.app",
+ "schema:identifier": "com.mycompany.app",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
@@ -229,7 +229,7 @@
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
- "identifier": "com.mycompany.app",
+ "schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
@@ -251,7 +251,7 @@
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
- "identifier": "com.mycompany.app",
+ "schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
@@ -288,7 +288,7 @@
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
- "identifier": "com.mycompany.app",
+ "schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
@@ -336,20 +336,20 @@
</licenses>
</project>"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert set(result.pop("license")) == {
+ "https://www.apache.org/licenses/LICENSE-2.0.txt",
+ "https://opensource.org/licenses/MIT",
+ }, result
+ assert set(result.pop("codeRepository")) == {
+ "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
+ "http://example.org/maven2/com/mycompany/app/my-app",
+ }, result
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
- "identifier": "com.mycompany.app",
+ "schema:identifier": "com.mycompany.app",
"version": "1.2.3",
- "license": [
- "https://www.apache.org/licenses/LICENSE-2.0.txt",
- "https://opensource.org/licenses/MIT",
- ],
- "codeRepository": [
- "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
- "http://example.org/maven2/com/mycompany/app/my-app",
- ],
}
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -147,12 +147,6 @@
"license": "https://spdx.org/licenses/Artistic-2.0",
"version": "5.0.3",
"name": "npm",
- "keywords": [
- "install",
- "modules",
- "package manager",
- "package.json",
- ],
"url": "https://docs.npmjs.com/",
},
),
@@ -160,6 +154,7 @@
for result in results:
del result.tool["id"]
+ result.metadata.pop("keywords", None)
# The assertion below returns False sometimes because of nested lists
assert expected_results == results
diff --git a/swh/indexer/tests/metadata_dictionary/test_nuget.py b/swh/indexer/tests/metadata_dictionary/test_nuget.py
--- a/swh/indexer/tests/metadata_dictionary/test_nuget.py
+++ b/swh/indexer/tests/metadata_dictionary/test_nuget.py
@@ -35,7 +35,26 @@
<file src="bin\\Debug\\*.dll" target="lib" />
</files>
</package>"""
+
result = MAPPINGS["NuGetMapping"]().translate(raw_content)
+
+ assert set(result.pop("keywords")) == {
+ "python3",
+ "java",
+ "cpp",
+ "search-tag",
+ }, result
+
+ assert set(result.pop("license")) == {
+ "https://spdx.org/licenses/MIT",
+ "https://raw.github.com/timrwood/moment/master/LICENSE",
+ }, result
+
+ assert set(result.pop("description")) == {
+ "Sample exists only to show a sample .nuspec file.",
+ "Summary is being deprecated. Use description instead.",
+ }, result
+
expected = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
@@ -44,25 +63,11 @@
{"type": "Person", "name": "Franck Halmaert"},
],
"codeRepository": "https://github.com/NuGet/NuGet.Client.git",
- "description": [
- "Sample exists only to show a sample .nuspec file.",
- "Summary is being deprecated. Use description instead.",
- ],
- "license": [
- "https://spdx.org/licenses/MIT",
- "https://raw.github.com/timrwood/moment/master/LICENSE",
- ],
"url": "http://example.org/",
"version": "1.2.3",
"schema:releaseNotes": (
"See the [changelog](https://github.com/httpie/httpie/releases/tag/3.2.0)."
),
- "keywords": [
- "python3",
- "java",
- "cpp",
- "search-tag",
- ],
}
assert result == expected
@@ -114,13 +119,13 @@
</files>
</package>"""
result = MAPPINGS["NuGetMapping"]().translate(raw_content)
+ assert set(result.pop("license")) == {
+ "https://spdx.org/licenses/BitTorrent-1.0",
+ "https://spdx.org/licenses/GPL-3.0-with-GCC-exception",
+ }
expected = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
- "license": [
- "https://spdx.org/licenses/BitTorrent-1.0",
- "https://spdx.org/licenses/GPL-3.0-with-GCC-exception",
- ],
}
assert result == expected
diff --git a/swh/indexer/tests/metadata_dictionary/test_python.py b/swh/indexer/tests/metadata_dictionary/test_python.py
--- a/swh/indexer/tests/metadata_dictionary/test_python.py
+++ b/swh/indexer/tests/metadata_dictionary/test_python.py
@@ -38,7 +38,7 @@
Provides-Extra: testing
""" # noqa
result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content)
- assert result["description"] == [
+ assert set(result.pop("description")) == {
"Software Heritage core utilities", # note the comma here
"swh-core\n"
"========\n"
@@ -49,8 +49,7 @@
"- serialization\n"
"- logging mechanism\n"
"",
- ], result
- del result["description"]
+ }, result
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
@@ -91,11 +90,11 @@
Keywords: foo bar baz
""" # noqa
result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content)
+ assert set(result.pop("keywords")) == {"foo", "bar", "baz"}, result
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
- "keywords": ["foo", "bar", "baz"],
}
@@ -110,5 +109,5 @@
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
- "license": "MIT",
+ "license": "https://spdx.org/licenses/MIT",
}
diff --git a/swh/indexer/tests/metadata_dictionary/test_ruby.py b/swh/indexer/tests/metadata_dictionary/test_ruby.py
--- a/swh/indexer/tests/metadata_dictionary/test_ruby.py
+++ b/swh/indexer/tests/metadata_dictionary/test_ruby.py
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
from hypothesis import HealthCheck, given, settings, strategies
+import pytest
from swh.indexer.metadata_dictionary import MAPPINGS
@@ -39,6 +40,7 @@
}
+@pytest.mark.xfail(reason="https://github.com/w3c/json-ld-api/issues/547")
def test_gemspec_two_author_fields():
raw_content = b"""
Gem::Specification.new do |s|
diff --git a/swh/indexer/tests/test_codemeta.py b/swh/indexer/tests/test_codemeta.py
--- a/swh/indexer/tests/test_codemeta.py
+++ b/swh/indexer/tests/test_codemeta.py
@@ -3,13 +3,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import pytest
-
-from swh.indexer.codemeta import CROSSWALK_TABLE, merge_documents, merge_values
+from swh.indexer.codemeta import CROSSWALK_TABLE, merge_documents
def test_crosstable():
- assert CROSSWALK_TABLE["NodeJS"] == {
+ assert {k: str(v) for (k, v) in CROSSWALK_TABLE["NodeJS"].items()} == {
"repository": "http://schema.org/codeRepository",
"os": "http://schema.org/operatingSystem",
"cpu": "http://schema.org/processorRequirements",
@@ -28,32 +26,6 @@
}
-def test_merge_values():
- assert merge_values("a", "b") == ["a", "b"]
- assert merge_values(["a", "b"], "c") == ["a", "b", "c"]
- assert merge_values("a", ["b", "c"]) == ["a", "b", "c"]
-
- assert merge_values({"@list": ["a"]}, {"@list": ["b"]}) == {"@list": ["a", "b"]}
- assert merge_values({"@list": ["a", "b"]}, {"@list": ["c"]}) == {
- "@list": ["a", "b", "c"]
- }
-
- with pytest.raises(ValueError):
- merge_values({"@list": ["a"]}, "b")
- with pytest.raises(ValueError):
- merge_values("a", {"@list": ["b"]})
- with pytest.raises(ValueError):
- merge_values({"@list": ["a"]}, ["b"])
- with pytest.raises(ValueError):
- merge_values(["a"], {"@list": ["b"]})
-
- assert merge_values("a", None) == "a"
- assert merge_values(["a", "b"], None) == ["a", "b"]
- assert merge_values(None, ["b", "c"]) == ["b", "c"]
- assert merge_values({"@list": ["a"]}, None) == {"@list": ["a"]}
- assert merge_values(None, {"@list": ["a"]}) == {"@list": ["a"]}
-
-
def test_merge_documents():
"""
Test the creation of a coherent minimal metadata set
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Jun 20, 5:20 PM (1 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225333
Attached To
D8279: Refactor metadata mappings using rdflib.Graph instead of JSON-LD internally
Event Timeline
Log In to Comment