D8279.id29898.diff
No OneTemporary
Actions

Size

68 KB

Subscribers

None

D8279.id29898.diff
View Options

	diff --git a/mypy.ini b/mypy.ini
	--- a/mypy.ini
	+++ b/mypy.ini
	@@ -26,5 +26,8 @@
	[mypy-pytest.*]
	ignore_missing_imports = True

	+[mypy-rdflib.*]
	+ignore_missing_imports = True
	+
	[mypy-xmltodict.*]
	ignore_missing_imports = True
	diff --git a/requirements.txt b/requirements.txt
	--- a/requirements.txt
	+++ b/requirements.txt
	@@ -5,6 +5,7 @@
	# cf https://forge.softwareheritage.org/T3815
	frozendict != 2.1.2
	pyld
	+rdflib
	sentry-sdk
	typing-extensions
	xmltodict
	diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
	--- a/swh/indexer/codemeta.py
	+++ b/swh/indexer/codemeta.py
	@@ -1,4 +1,4 @@
	-# Copyright (C) 2018 The Software Heritage developers
	+# Copyright (C) 2018-2022 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information
	@@ -12,6 +12,7 @@
	from typing import Any, List

	from pyld import jsonld
	+import rdflib

	import swh.indexer
	from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA
	@@ -61,7 +62,7 @@
	uri = jsonld.JsonLdProcessor.get_context_value(
	_PROCESSED_CODEMETA_CONTEXT, local_name, "@id"
	)
	- assert uri.startswith(("@", CODEMETA._uri, SCHEMA._uri)), (local_name, uri)
	+ assert uri.startswith(("@", CODEMETA, SCHEMA)), (local_name, uri)
	return uri


	@@ -92,7 +93,9 @@
	# For each of the data source's properties that maps
	# to this canonical name
	if local_name.strip():
	- codemeta_translation[col][local_name.strip()] = canonical_name
	+ codemeta_translation[col][local_name.strip()] = rdflib.URIRef(
	+ canonical_name
	+ )

	return (terms, codemeta_translation)

	@@ -112,10 +115,10 @@
	"documentUrl": url,
	"document": CODEMETA_CONTEXT,
	}
	- elif url == CODEMETA._uri:
	+ elif url == CODEMETA:
	raise Exception(
	"{} is CodeMeta's URI, use {} as context url".format(
	- CODEMETA._uri, CODEMETA_CONTEXT_URL
	+ CODEMETA, CODEMETA_CONTEXT_URL
	)
	)
	else:
	@@ -132,7 +135,7 @@
	"""
	contexts: List[Any] = [CODEMETA_CONTEXT_URL]
	if forgefed:
	- contexts.append({"as": ACTIVITYSTREAMS._uri, "forge": FORGEFED._uri})
	+ contexts.append({"as": str(ACTIVITYSTREAMS), "forge": str(FORGEFED)})
	return jsonld.compact(doc, contexts, options={"documentLoader": _document_loader})


	@@ -141,40 +144,6 @@
	return jsonld.expand(doc, options={"documentLoader": _document_loader})


	-def merge_values(v1, v2):
	- """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`,
	- returns `{"@list": l1 + l2}`.
	- Otherwise, make them lists (if they are not already) and concatenate
	- them.
	-
	- >>> merge_values('a', 'b')
	- ['a', 'b']
	- >>> merge_values(['a', 'b'], 'c')
	- ['a', 'b', 'c']
	- >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']})
	- {'@list': ['a', 'b', 'c']}
	- """
	- if v1 is None:
	- return v2
	- elif v2 is None:
	- return v1
	- elif isinstance(v1, dict) and set(v1) == {"@list"}:
	- assert isinstance(v1["@list"], list)
	- if isinstance(v2, dict) and set(v2) == {"@list"}:
	- assert isinstance(v2["@list"], list)
	- return {"@list": v1["@list"] + v2["@list"]}
	- else:
	- raise ValueError("Cannot merge %r and %r" % (v1, v2))
	- else:
	- if isinstance(v2, dict) and "@list" in v2:
	- raise ValueError("Cannot merge %r and %r" % (v1, v2))
	- if not isinstance(v1, list):
	- v1 = [v1]
	- if not isinstance(v2, list):
	- v2 = [v2]
	- return v1 + v2
	-
	-
	def merge_documents(documents):
	"""Takes a list of metadata dicts, each generated from a different
	metadata file, and merges them.
	diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
	--- a/swh/indexer/metadata_dictionary/base.py
	+++ b/swh/indexer/metadata_dictionary/base.py
	@@ -6,14 +6,17 @@
	import json
	import logging
	from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
	+import uuid
	import xml.parsers.expat

	+from pyld import jsonld
	+import rdflib
	from typing_extensions import TypedDict
	import xmltodict
	import yaml

	-from swh.indexer.codemeta import compact, merge_values
	-from swh.indexer.namespaces import SCHEMA
	+from swh.indexer.codemeta import _document_loader, compact
	+from swh.indexer.namespaces import RDF, SCHEMA
	from swh.indexer.storage.interface import Sha1


	@@ -25,7 +28,8 @@


	TTranslateCallable = TypeVar(
	- "TTranslateCallable", bound=Callable[[Any, Dict[str, Any], Any], None]
	+ "TTranslateCallable",
	+ bound=Callable[[Any, rdflib.Graph, rdflib.term.BNode, Any], None],
	)


	@@ -145,7 +149,7 @@
	def supported_terms(cls):
	# one-to-one mapping from the original key to a CodeMeta term
	simple_terms = {
	- term
	+ str(term)
	for (key, term) in cls.mapping.items()
	if key in cls.string_fields
	or hasattr(cls, "normalize_" + cls._normalize_method_name(key))
	@@ -153,7 +157,7 @@

	# more complex mapping from the original key to JSON-LD
	complex_terms = {
	- term
	+ str(term)
	for meth_name in dir(cls)
	if meth_name.startswith("translate_")
	for term in getattr(getattr(cls, meth_name), "produced_terms", [])
	@@ -174,7 +178,20 @@
	the indexer

	"""
	- translated_metadata = {"@type": SCHEMA.SoftwareSourceCode}
	+ graph = rdflib.Graph()
	+
	+ # The main object being described (the SoftwareSourceCode) does not necessarily
	+ # may or may not have an id.
	+ # Either way, we temporarily use this URI to identify it. Unfortunately,
	+ # we cannot use a blank node as we need to use it for JSON-LD framing later,
	+ # and blank nodes cannot be used for framing in JSON-LD >= 1.1
	+ root_id = (
	+ "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/"
	+ + str(uuid.uuid4())
	+ )
	+ root = rdflib.URIRef(root_id)
	+ graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode))
	+
	for k, v in content_dict.items():
	# First, check if there is a specific translation
	# method for this key
	@@ -182,40 +199,66 @@
	self, "translate_" + self._normalize_method_name(k), None
	)
	if translation_method:
	- translation_method(translated_metadata, v)
	+ translation_method(graph, root, v)
	elif k in self.mapping:
	# if there is no method, but the key is known from the
	# crosswalk table
	codemeta_key = self.mapping[k]

	- # if there is a normalization method, use it on the value
	+ # if there is a normalization method, use it on the value,
	+ # and add its results to the triples
	normalization_method = getattr(
	self, "normalize_" + self._normalize_method_name(k), None
	)
	if normalization_method:
	v = normalization_method(v)
	+ if v is None:
	+ pass
	+ elif isinstance(v, list):
	+ for item in reversed(v):
	+ graph.add((root, codemeta_key, item))
	+ else:
	+ graph.add((root, codemeta_key, v))
	elif k in self.string_fields and isinstance(v, str):
	- pass
	+ graph.add((root, codemeta_key, rdflib.Literal(v)))
	elif k in self.string_fields and isinstance(v, list):
	- v = [x for x in v if isinstance(x, str)]
	+ for item in v:
	+ graph.add((root, codemeta_key, rdflib.Literal(item)))
	else:
	continue

	- # set the translation metadata with the normalized value
	- if codemeta_key in translated_metadata:
	- translated_metadata[codemeta_key] = merge_values(
	- translated_metadata[codemeta_key], v
	- )
	- else:
	- translated_metadata[codemeta_key] = v
	+ self.extra_translation(graph, root, content_dict)
	+
	+ # Convert from rdflib's internal graph representation to JSON
	+ s = graph.serialize(format="application/ld+json")
	+
	+ # Load from JSON to a list of Python objects
	+ jsonld_graph = json.loads(s)
	+
	+ # Use JSON-LD framing to turn the graph into a rooted tree
	+ # frame = {"@type": str(SCHEMA.SoftwareSourceCode)}
	+ translated_metadata = jsonld.frame(
	+ jsonld_graph,
	+ {"@id": root_id},
	+ options={
	+ "documentLoader": _document_loader,
	+ "processingMode": "json-ld-1.1",
	+ },
	+ )

	- self.extra_translation(translated_metadata, content_dict)
	+ # Remove the temporary id we added at the beginning
	+ if isinstance(translated_metadata["@id"], list):
	+ translated_metadata["@id"].remove(root_id)
	+ else:
	+ del translated_metadata["@id"]

	return self.normalize_translation(translated_metadata)

	- def extra_translation(self, translated_metadata: Dict[str, Any], d: Dict[str, Any]):
	- """Called at the end of the translation process, and may add arbitrary keys
	- to ``translated_metadata`` based on the input dictionary (passed as ``d``).
	+ def extra_translation(
	+ self, graph: rdflib.Graph, root: rdflib.term.Node, d: Dict[str, Any]
	+ ):
	+ """Called at the end of the translation process, and may add arbitrary triples
	+ to ``graph`` based on the input dictionary (passed as ``d``).
	"""
	pass

	diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
	--- a/swh/indexer/metadata_dictionary/cff.py
	+++ b/swh/indexer/metadata_dictionary/cff.py
	@@ -1,9 +1,21 @@
	-from typing import Dict, List, Optional, Union
	+# Copyright (C) 2021-2022 The Software Heritage developers
	+# See the AUTHORS file at the top-level directory of this distribution
	+# License: GNU General Public License version 3, or any later version
	+# See top-level LICENSE file for more information
	+
	+from typing import List
	+
	+from rdflib import BNode, Graph, Literal, URIRef
	+import rdflib.term

	from swh.indexer.codemeta import CROSSWALK_TABLE
	-from swh.indexer.namespaces import SCHEMA
	+from swh.indexer.namespaces import RDF, SCHEMA

	from .base import YamlMapping
	+from .utils import add_map
	+
	+DOI = URIRef("https://doi.org/")
	+SPDX = URIRef("https://spdx.org/licenses/")


	class CffMapping(YamlMapping):
	@@ -14,41 +26,41 @@
	mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"]
	string_fields = ["keywords", "license", "abstract", "version", "doi"]

	- def normalize_authors(self, d: List[dict]) -> Dict[str, list]:
	- result = []
	- for author in d:
	- author_data: Dict[str, Optional[Union[str, Dict]]] = {
	- "@type": SCHEMA.Person
	- }
	- if "orcid" in author and isinstance(author["orcid"], str):
	- author_data["@id"] = author["orcid"]
	- if "affiliation" in author and isinstance(author["affiliation"], str):
	- author_data[SCHEMA.affiliation] = {
	- "@type": SCHEMA.Organization,
	- SCHEMA.name: author["affiliation"],
	- }
	- if "family-names" in author and isinstance(author["family-names"], str):
	- author_data[SCHEMA.familyName] = author["family-names"]
	- if "given-names" in author and isinstance(author["given-names"], str):
	- author_data[SCHEMA.givenName] = author["given-names"]
	-
	- result.append(author_data)
	-
	- result_final = {"@list": result}
	- return result_final
	-
	- def normalize_doi(self, s: str) -> Dict[str, str]:
	+ def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node:
	+ node: rdflib.term.Node
	+ if "orcid" in author and isinstance(author["orcid"], str):
	+ node = URIRef(author["orcid"])
	+ else:
	+ node = BNode()
	+ graph.add((node, RDF.type, SCHEMA.Person))
	+ if "affiliation" in author and isinstance(author["affiliation"], str):
	+ affiliation = BNode()
	+ graph.add((node, SCHEMA.affiliation, affiliation))
	+ graph.add((affiliation, RDF.type, SCHEMA.Organization))
	+ graph.add((affiliation, SCHEMA.name, Literal(author["affiliation"])))
	+ if "family-names" in author and isinstance(author["family-names"], str):
	+ graph.add((node, SCHEMA.familyName, Literal(author["family-names"])))
	+ if "given-names" in author and isinstance(author["given-names"], str):
	+ graph.add((node, SCHEMA.givenName, Literal(author["given-names"])))
	+ return node
	+
	+ def translate_authors(
	+ self, graph: Graph, root: URIRef, authors: List[dict]
	+ ) -> None:
	+ add_map(graph, root, SCHEMA.author, self._translate_author, authors)
	+
	+ def normalize_doi(self, s: str) -> URIRef:
	if isinstance(s, str):
	- return {"@id": "https://doi.org/" + s}
	+ return DOI + s

	- def normalize_license(self, s: str) -> Dict[str, str]:
	+ def normalize_license(self, s: str) -> URIRef:
	if isinstance(s, str):
	- return {"@id": "https://spdx.org/licenses/" + s}
	+ return SPDX + s

	- def normalize_repository_code(self, s: str) -> Dict[str, str]:
	+ def normalize_repository_code(self, s: str) -> URIRef:
	if isinstance(s, str):
	- return {"@id": s}
	+ return URIRef(s)

	- def normalize_date_released(self, s: str) -> Dict[str, str]:
	+ def normalize_date_released(self, s: str) -> Literal:
	if isinstance(s, str):
	- return {"@value": s, "@type": SCHEMA.Date}
	+ return Literal(s, datatype=SCHEMA.Date)
	diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py
	--- a/swh/indexer/metadata_dictionary/composer.py
	+++ b/swh/indexer/metadata_dictionary/composer.py
	@@ -4,11 +4,18 @@
	# See top-level LICENSE file for more information

	import os.path
	+from typing import Optional
	+
	+from rdflib import BNode, Graph, Literal, URIRef

	from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
	-from swh.indexer.namespaces import SCHEMA
	+from swh.indexer.namespaces import RDF, SCHEMA

	from .base import JsonMapping, SingleFileIntrinsicMapping
	+from .utils import add_map
	+
	+SPDX = URIRef("https://spdx.org/licenses/")
	+

	COMPOSER_TABLE_PATH = os.path.join(_DATA_DIR, "composer.csv")

	@@ -35,23 +42,24 @@

	def normalize_homepage(self, s):
	if isinstance(s, str):
	- return {"@id": s}
	+ return URIRef(s)

	def normalize_license(self, s):
	if isinstance(s, str):
	- return {"@id": "https://spdx.org/licenses/" + s}
	+ return SPDX + s

	- def normalize_authors(self, author_list):
	- authors = []
	- for author in author_list:
	- author_obj = {"@type": SCHEMA.Person}
	+ def _translate_author(self, graph: Graph, author) -> Optional[BNode]:
	+ if not isinstance(author, dict):
	+ return None
	+ node = BNode()
	+ graph.add((node, RDF.type, SCHEMA.Person))

	- if isinstance(author, dict):
	- if isinstance(author.get("name", None), str):
	- author_obj[SCHEMA.name] = author.get("name", None)
	- if isinstance(author.get("email", None), str):
	- author_obj[SCHEMA.email] = author.get("email", None)
	+ if isinstance(author.get("name"), str):
	+ graph.add((node, SCHEMA.name, Literal(author["name"])))
	+ if isinstance(author.get("email"), str):
	+ graph.add((node, SCHEMA.email, Literal(author["email"])))

	- authors.append(author_obj)
	+ return node

	- return {"@list": authors}
	+ def translate_authors(self, graph: Graph, root: URIRef, authors) -> None:
	+ add_map(graph, root, SCHEMA.author, self._translate_author, authors)
	diff --git a/swh/indexer/metadata_dictionary/dart.py b/swh/indexer/metadata_dictionary/dart.py
	--- a/swh/indexer/metadata_dictionary/dart.py
	+++ b/swh/indexer/metadata_dictionary/dart.py
	@@ -6,10 +6,15 @@
	import os.path
	import re

	+from rdflib import RDF, BNode, Graph, Literal, URIRef
	+
	from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
	from swh.indexer.namespaces import SCHEMA

	from .base import YamlMapping
	+from .utils import add_map
	+
	+SPDX = URIRef("https://spdx.org/licenses/")

	PUB_TABLE_PATH = os.path.join(_DATA_DIR, "pubspec.csv")

	@@ -43,33 +48,32 @@

	def normalize_license(self, s):
	if isinstance(s, str):
	- return {"@id": "https://spdx.org/licenses/" + s}
	+ return SPDX + s

	def normalize_homepage(self, s):
	if isinstance(s, str):
	- return {"@id": s}
	+ return URIRef(s)

	- def normalize_author(self, s):
	- name_email_regex = "(?P<name>.?)( <(?P<email>.)>)"
	- author = {"@type": SCHEMA.Person}
	+ def _translate_author(self, graph, s):
	+ name_email_re = re.compile("(?P<name>.?)( <(?P<email>.)>)")
	if isinstance(s, str):
	- match = re.search(name_email_regex, s)
	+ author = BNode()
	+ graph.add((author, RDF.type, SCHEMA.Person))
	+ match = name_email_re.search(s)
	if match:
	name = match.group("name")
	email = match.group("email")
	- author[SCHEMA.email] = email
	+ graph.add((author, SCHEMA.email, Literal(email)))
	else:
	name = s

	- author[SCHEMA.name] = name
	+ graph.add((author, SCHEMA.name, Literal(name)))

	- return {"@list": [author]}
	+ return author

	- def normalize_authors(self, authors_list):
	- authors = {"@list": []}
	+ def translate_author(self, graph: Graph, root, s) -> None:
	+ add_map(graph, root, SCHEMA.author, self._translate_author, [s])

	- if isinstance(authors_list, list):
	- for s in authors_list:
	- author = self.normalize_author(s)["@list"]
	- authors["@list"] += author
	- return authors
	+ def translate_authors(self, graph: Graph, root, authors) -> None:
	+ if isinstance(authors, list):
	+ add_map(graph, root, SCHEMA.author, self._translate_author, authors)
	diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
	--- a/swh/indexer/metadata_dictionary/github.py
	+++ b/swh/indexer/metadata_dictionary/github.py
	@@ -3,17 +3,17 @@
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	-import json
	-from typing import Any, Dict, Tuple
	+from typing import Any, Tuple
	+
	+from rdflib import RDF, BNode, Graph, Literal, URIRef

	from swh.indexer.codemeta import CROSSWALK_TABLE
	-from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED
	+from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA

	from .base import BaseExtrinsicMapping, JsonMapping, produce_terms
	+from .utils import prettyprint_graph # noqa

	-
	-def _prettyprint(d):
	- print(json.dumps(d, indent=4))
	+SPDX = URIRef("https://spdx.org/licenses/")


	class GitHubMapping(BaseExtrinsicMapping, JsonMapping):
	@@ -33,94 +33,81 @@
	def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
	return ("application/vnd.github.v3+json",)

	- def _translate_dict(self, content_dict: Dict[str, Any], **kwargs) -> Dict[str, Any]:
	- d = super()._translate_dict(content_dict, **kwargs)
	- d["type"] = FORGEFED.Repository
	- return d
	+ def extra_translation(self, graph, root, content_dict):
	+ graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode))
	+ graph.add((root, RDF.type, FORGEFED.Repository))

	@produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems)
	- def translate_forks_count(
	- self, translated_metadata: Dict[str, Any], v: Any
	- ) -> None:
	+ def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None:
	"""

	- >>> translated_metadata = {}
	- >>> GitHubMapping().translate_forks_count(translated_metadata, 42)
	- >>> _prettyprint(translated_metadata)
	+ >>> graph = Graph()
	+ >>> root = URIRef("http://example.org/test-software")
	+ >>> GitHubMapping().translate_forks_count(graph, root, 42)
	+ >>> prettyprint_graph(graph, root)
	{
	- "https://forgefed.org/ns#forks": [
	- {
	- "@type": "https://www.w3.org/ns/activitystreams#OrderedCollection",
	- "https://www.w3.org/ns/activitystreams#totalItems": 42
	- }
	- ]
	+ "@id": ...,
	+ "https://forgefed.org/ns#forks": {
	+ "@type": "https://www.w3.org/ns/activitystreams#OrderedCollection",
	+ "https://www.w3.org/ns/activitystreams#totalItems": 42
	+ }
	}
	"""
	if isinstance(v, int):
	- translated_metadata.setdefault(FORGEFED.forks, []).append(
	- {
	- "@type": ACTIVITYSTREAMS.OrderedCollection,
	- ACTIVITYSTREAMS.totalItems: v,
	- }
	- )
	+ collection = BNode()
	+ graph.add((root, FORGEFED.forks, collection))
	+ graph.add((collection, RDF.type, ACTIVITYSTREAMS.OrderedCollection))
	+ graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))

	@produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems)
	- def translate_stargazers_count(
	- self, translated_metadata: Dict[str, Any], v: Any
	- ) -> None:
	+ def translate_stargazers_count(self, graph: Graph, root: BNode, v: Any) -> None:
	"""

	- >>> translated_metadata = {}
	- >>> GitHubMapping().translate_stargazers_count(translated_metadata, 42)
	- >>> _prettyprint(translated_metadata)
	+ >>> graph = Graph()
	+ >>> root = URIRef("http://example.org/test-software")
	+ >>> GitHubMapping().translate_stargazers_count(graph, root, 42)
	+ >>> prettyprint_graph(graph, root)
	{
	- "https://www.w3.org/ns/activitystreams#likes": [
	- {
	- "@type": "https://www.w3.org/ns/activitystreams#Collection",
	- "https://www.w3.org/ns/activitystreams#totalItems": 42
	- }
	- ]
	+ "@id": ...,
	+ "https://www.w3.org/ns/activitystreams#likes": {
	+ "@type": "https://www.w3.org/ns/activitystreams#Collection",
	+ "https://www.w3.org/ns/activitystreams#totalItems": 42
	+ }
	}
	"""
	if isinstance(v, int):
	- translated_metadata.setdefault(ACTIVITYSTREAMS.likes, []).append(
	- {
	- "@type": ACTIVITYSTREAMS.Collection,
	- ACTIVITYSTREAMS.totalItems: v,
	- }
	- )
	+ collection = BNode()
	+ graph.add((root, ACTIVITYSTREAMS.likes, collection))
	+ graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection))
	+ graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))

	@produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems)
	- def translate_watchers_count(
	- self, translated_metadata: Dict[str, Any], v: Any
	- ) -> None:
	+ def translate_watchers_count(self, graph: Graph, root: BNode, v: Any) -> None:
	"""

	- >>> translated_metadata = {}
	- >>> GitHubMapping().translate_watchers_count(translated_metadata, 42)
	- >>> _prettyprint(translated_metadata)
	+ >>> graph = Graph()
	+ >>> root = URIRef("http://example.org/test-software")
	+ >>> GitHubMapping().translate_watchers_count(graph, root, 42)
	+ >>> prettyprint_graph(graph, root)
	{
	- "https://www.w3.org/ns/activitystreams#followers": [
	- {
	- "@type": "https://www.w3.org/ns/activitystreams#Collection",
	- "https://www.w3.org/ns/activitystreams#totalItems": 42
	- }
	- ]
	+ "@id": ...,
	+ "https://www.w3.org/ns/activitystreams#followers": {
	+ "@type": "https://www.w3.org/ns/activitystreams#Collection",
	+ "https://www.w3.org/ns/activitystreams#totalItems": 42
	+ }
	}
	"""
	if isinstance(v, int):
	- translated_metadata.setdefault(ACTIVITYSTREAMS.followers, []).append(
	- {
	- "@type": ACTIVITYSTREAMS.Collection,
	- ACTIVITYSTREAMS.totalItems: v,
	- }
	- )
	+ collection = BNode()
	+ graph.add((root, ACTIVITYSTREAMS.followers, collection))
	+ graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection))
	+ graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))

	def normalize_license(self, d):
	"""

	>>> GitHubMapping().normalize_license({'spdx_id': 'MIT'})
	- {'@id': 'https://spdx.org/licenses/MIT'}
	+ rdflib.term.URIRef('https://spdx.org/licenses/MIT')
	"""
	if isinstance(d, dict) and isinstance(d.get("spdx_id"), str):
	- return {"@id": "https://spdx.org/licenses/" + d["spdx_id"]}
	+ return SPDX + d["spdx_id"]
	diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
	--- a/swh/indexer/metadata_dictionary/maven.py
	+++ b/swh/indexer/metadata_dictionary/maven.py
	@@ -1,4 +1,4 @@
	-# Copyright (C) 2018-2021 The Software Heritage developers
	+# Copyright (C) 2018-2022 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information
	@@ -6,10 +6,13 @@
	import os
	from typing import Any, Dict

	+from rdflib import Graph, Literal, URIRef
	+
	from swh.indexer.codemeta import CROSSWALK_TABLE
	from swh.indexer.namespaces import SCHEMA

	from .base import SingleFileIntrinsicMapping, XmlMapping
	+from .utils import prettyprint_graph # noqa


	class MavenMapping(XmlMapping, SingleFileIntrinsicMapping):
	@@ -27,14 +30,13 @@
	def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
	return super()._translate_dict(d.get("project") or {})

	- def extra_translation(self, translated_metadata, d):
	- repositories = self.parse_repositories(d)
	- if repositories:
	- translated_metadata[SCHEMA.codeRepository] = repositories
	+ def extra_translation(self, graph: Graph, root, d):
	+ self.parse_repositories(graph, root, d)

	- def parse_repositories(self, d):
	+ def parse_repositories(self, graph: Graph, root, d):
	"""https://maven.apache.org/pom.html#Repositories

	+ >>> import rdflib
	>>> import xmltodict
	>>> from pprint import pprint
	>>> d = xmltodict.parse('''
	@@ -47,21 +49,19 @@
	... </repository>
	... </repositories>
	... ''')
	- >>> MavenMapping().parse_repositories(d)
	+ >>> MavenMapping().parse_repositories(rdflib.Graph(), rdflib.BNode(), d)
	"""
	repositories = d.get("repositories")
	if not repositories:
	- results = [self.parse_repository(d, self._default_repository)]
	+ self.parse_repository(graph, root, d, self._default_repository)
	elif isinstance(repositories, dict):
	repositories = repositories.get("repository") or []
	if not isinstance(repositories, list):
	repositories = [repositories]
	- results = [self.parse_repository(d, repo) for repo in repositories]
	- else:
	- results = []
	- return [res for res in results if res] or None
	+ for repo in repositories:
	+ self.parse_repository(graph, root, d, repo)

	- def parse_repository(self, d, repo):
	+ def parse_repository(self, graph: Graph, root, d, repo):
	if not isinstance(repo, dict):
	return
	if repo.get("layout", "default") != "default":
	@@ -75,23 +75,18 @@
	and isinstance(artifact_id, str)
	):
	repo = os.path.join(url, *group_id.split("."), artifact_id)
	- return {"@id": repo}
	+ graph.add((root, SCHEMA.codeRepository, URIRef(repo)))

	def normalize_groupId(self, id_):
	"""https://maven.apache.org/pom.html#Maven_Coordinates

	>>> MavenMapping().normalize_groupId('org.example')
	- {'@id': 'org.example'}
	+ rdflib.term.Literal('org.example')
	"""
	if isinstance(id_, str):
	- return {"@id": id_}
	-
	- def translate_licenses(self, translated_metadata, d):
	- licenses = self.parse_licenses(d)
	- if licenses:
	- translated_metadata[SCHEMA.license] = licenses
	+ return Literal(id_)

	- def parse_licenses(self, licenses):
	+ def translate_licenses(self, graph, root, licenses):
	"""https://maven.apache.org/pom.html#Licenses

	>>> import xmltodict
	@@ -113,8 +108,16 @@
	}
	}
	}
	- >>> MavenMapping().parse_licenses(d["licenses"])
	- [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}]
	+ >>> graph = Graph()
	+ >>> root = URIRef("http://example.org/test-software")
	+ >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
	+ >>> prettyprint_graph(graph, root)
	+ {
	+ "@id": ...,
	+ "http://schema.org/license": {
	+ "@id": "https://www.apache.org/licenses/LICENSE-2.0.txt"
	+ }
	+ }

	or, if there are more than one license:

	@@ -132,9 +135,16 @@
	... </license>
	... </licenses>
	... ''')
	- >>> pprint(MavenMapping().parse_licenses(d["licenses"]))
	- [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'},
	- {'@id': 'https://opensource.org/licenses/MIT'}]
	+ >>> graph = Graph()
	+ >>> root = URIRef("http://example.org/test-software")
	+ >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
	+ >>> pprint(set(graph.triples((root, URIRef("http://schema.org/license"), None))))
	+ {(rdflib.term.URIRef('http://example.org/test-software'),
	+ rdflib.term.URIRef('http://schema.org/license'),
	+ rdflib.term.URIRef('https://opensource.org/licenses/MIT')),
	+ (rdflib.term.URIRef('http://example.org/test-software'),
	+ rdflib.term.URIRef('http://schema.org/license'),
	+ rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}
	"""

	if not isinstance(licenses, dict):
	@@ -144,8 +154,6 @@
	licenses = [licenses]
	elif not isinstance(licenses, list):
	return
	- return [
	- {"@id": license["url"]}
	- for license in licenses
	- if isinstance(license, dict) and isinstance(license.get("url"), str)
	- ] or None
	+ for license in licenses:
	+ if isinstance(license, dict) and isinstance(license.get("url"), str):
	+ graph.add((root, SCHEMA.license, URIRef(license["url"])))
	diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
	--- a/swh/indexer/metadata_dictionary/npm.py
	+++ b/swh/indexer/metadata_dictionary/npm.py
	@@ -6,10 +6,15 @@
	import re
	import urllib.parse

	+from rdflib import RDF, BNode, Graph, Literal, URIRef
	+
	from swh.indexer.codemeta import CROSSWALK_TABLE
	from swh.indexer.namespaces import SCHEMA

	from .base import JsonMapping, SingleFileIntrinsicMapping
	+from .utils import add_list, prettyprint_graph # noqa
	+
	+SPDX = URIRef("https://spdx.org/licenses/")


	class NpmMapping(JsonMapping, SingleFileIntrinsicMapping):
	@@ -38,13 +43,13 @@
	... 'type': 'git',
	... 'url': 'https://example.org/foo.git'
	... })
	- {'@id': 'git+https://example.org/foo.git'}
	+ rdflib.term.URIRef('git+https://example.org/foo.git')
	>>> NpmMapping().normalize_repository(
	... 'gitlab:foo/bar')
	- {'@id': 'git+https://gitlab.com/foo/bar.git'}
	+ rdflib.term.URIRef('git+https://gitlab.com/foo/bar.git')
	>>> NpmMapping().normalize_repository(
	... 'foo/bar')
	- {'@id': 'git+https://github.com/foo/bar.git'}
	+ rdflib.term.URIRef('git+https://github.com/foo/bar.git')
	"""
	if (
	isinstance(d, dict)
	@@ -67,7 +72,7 @@
	else:
	return None

	- return {"@id": url}
	+ return URIRef(url)

	def normalize_bugs(self, d):
	"""https://docs.npmjs.com/files/package.json#bugs
	@@ -76,15 +81,15 @@
	... 'url': 'https://example.org/bugs/',
	... 'email': 'bugs@example.org'
	... })
	- {'@id': 'https://example.org/bugs/'}
	+ rdflib.term.URIRef('https://example.org/bugs/')
	>>> NpmMapping().normalize_bugs(
	... 'https://example.org/bugs/')
	- {'@id': 'https://example.org/bugs/'}
	+ rdflib.term.URIRef('https://example.org/bugs/')
	"""
	if isinstance(d, dict) and isinstance(d.get("url"), str):
	- return {"@id": d["url"]}
	+ return URIRef(d["url"])
	elif isinstance(d, str):
	- return {"@id": d}
	+ return URIRef(d)
	else:
	return None

	@@ -92,36 +97,75 @@
	r"^ " r"(?P<name>.?)" r"( +<(?P<email>.)>)?" r"( +$(?P<url>.)$)?" r" *$"
	)

	- def normalize_author(self, d):
	+ def translate_author(self, graph: Graph, root, d):
	r"""https://docs.npmjs.com/files/package.json#people-fields-author-contributors'

	>>> from pprint import pprint
	- >>> pprint(NpmMapping().normalize_author({
	+ >>> root = URIRef("http://example.org/test-software")
	+ >>> graph = Graph()
	+ >>> NpmMapping().translate_author(graph, root, {
	... 'name': 'John Doe',
	... 'email': 'john.doe@example.org',
	... 'url': 'https://example.org/~john.doe',
	- ... }))
	- {'@list': [{'@type': 'http://schema.org/Person',
	- 'http://schema.org/email': 'john.doe@example.org',
	- 'http://schema.org/name': 'John Doe',
	- 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]}
	- >>> pprint(NpmMapping().normalize_author(
	+ ... })
	+ >>> prettyprint_graph(graph, root)
	+ {
	+ "@id": ...,
	+ "http://schema.org/author": {
	+ "@list": [
	+ {
	+ "@type": "http://schema.org/Person",
	+ "http://schema.org/email": "john.doe@example.org",
	+ "http://schema.org/name": "John Doe",
	+ "http://schema.org/url": {
	+ "@id": "https://example.org/~john.doe"
	+ }
	+ }
	+ ]
	+ }
	+ }
	+ >>> graph = Graph()
	+ >>> NpmMapping().translate_author(graph, root,
	... 'John Doe <john.doe@example.org> (https://example.org/~john.doe)'
	- ... ))
	- {'@list': [{'@type': 'http://schema.org/Person',
	- 'http://schema.org/email': 'john.doe@example.org',
	- 'http://schema.org/name': 'John Doe',
	- 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]}
	- >>> pprint(NpmMapping().normalize_author({
	+ ... )
	+ >>> prettyprint_graph(graph, root)
	+ {
	+ "@id": ...,
	+ "http://schema.org/author": {
	+ "@list": [
	+ {
	+ "@type": "http://schema.org/Person",
	+ "http://schema.org/email": "john.doe@example.org",
	+ "http://schema.org/name": "John Doe",
	+ "http://schema.org/url": {
	+ "@id": "https://example.org/~john.doe"
	+ }
	+ }
	+ ]
	+ }
	+ }
	+ >>> graph = Graph()
	+ >>> NpmMapping().translate_author(graph, root, {
	... 'name': 'John Doe',
	... 'email': 'john.doe@example.org',
	... 'url': 'https:\\\\example.invalid/~john.doe',
	- ... }))
	- {'@list': [{'@type': 'http://schema.org/Person',
	- 'http://schema.org/email': 'john.doe@example.org',
	- 'http://schema.org/name': 'John Doe'}]}
	+ ... })
	+ >>> prettyprint_graph(graph, root)
	+ {
	+ "@id": ...,
	+ "http://schema.org/author": {
	+ "@list": [
	+ {
	+ "@type": "http://schema.org/Person",
	+ "http://schema.org/email": "john.doe@example.org",
	+ "http://schema.org/name": "John Doe"
	+ }
	+ ]
	+ }
	+ }
	""" # noqa
	- author = {"@type": SCHEMA.Person}
	+ author = BNode()
	+ graph.add((author, RDF.type, SCHEMA.Person))
	if isinstance(d, dict):
	name = d.get("name", None)
	email = d.get("email", None)
	@@ -137,32 +181,32 @@
	return None

	if name and isinstance(name, str):
	- author[SCHEMA.name] = name
	+ graph.add((author, SCHEMA.name, Literal(name)))
	if email and isinstance(email, str):
	- author[SCHEMA.email] = email
	+ graph.add((author, SCHEMA.email, Literal(email)))
	if url and isinstance(url, str):
	# Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
	# URLs that are blatantly invalid early, so PyLD does not crash.
	parsed_url = urllib.parse.urlparse(url)
	if parsed_url.netloc:
	- author[SCHEMA.url] = {"@id": url}
	+ graph.add((author, SCHEMA.url, URIRef(url)))

	- return {"@list": [author]}
	+ add_list(graph, root, SCHEMA.author, [author])

	def normalize_description(self, description):
	r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common
	mistake that causes issues in the database because of null bytes in JSON.

	>>> NpmMapping().normalize_description("foo bar")
	- 'foo bar'
	+ rdflib.term.Literal('foo bar')
	>>> NpmMapping().normalize_description(
	... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
	... )
	- 'foo bar'
	+ rdflib.term.Literal('foo bar')
	>>> NpmMapping().normalize_description(
	... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
	... )
	- 'foo bar'
	+ rdflib.term.Literal('foo bar')
	>>> NpmMapping().normalize_description(
	... # invalid UTF-16 and meaningless UTF-8:
	... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
	@@ -213,32 +257,34 @@
	if description:
	if description.startswith("# "):
	description = description[2:]
	- return description.rstrip()
	- return description
	+ return Literal(description.rstrip())
	+ else:
	+ return None
	+ return Literal(description)

	def normalize_license(self, s):
	"""https://docs.npmjs.com/files/package.json#license

	>>> NpmMapping().normalize_license('MIT')
	- {'@id': 'https://spdx.org/licenses/MIT'}
	+ rdflib.term.URIRef('https://spdx.org/licenses/MIT')
	"""
	if isinstance(s, str):
	- return {"@id": "https://spdx.org/licenses/" + s}
	+ return SPDX + s

	def normalize_homepage(self, s):
	"""https://docs.npmjs.com/files/package.json#homepage

	>>> NpmMapping().normalize_homepage('https://example.org/~john.doe')
	- {'@id': 'https://example.org/~john.doe'}
	+ rdflib.term.URIRef('https://example.org/~john.doe')
	"""
	if isinstance(s, str):
	- return {"@id": s}
	+ return URIRef(s)

	def normalize_keywords(self, lst):
	"""https://docs.npmjs.com/files/package.json#homepage

	>>> NpmMapping().normalize_keywords(['foo', 'bar'])
	- ['foo', 'bar']
	+ [rdflib.term.Literal('foo'), rdflib.term.Literal('bar')]
	"""
	if isinstance(lst, list):
	- return [x for x in lst if isinstance(x, str)]
	+ return [Literal(x) for x in lst if isinstance(x, str)]
	diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
	--- a/swh/indexer/metadata_dictionary/nuget.py
	+++ b/swh/indexer/metadata_dictionary/nuget.py
	@@ -7,17 +7,22 @@
	import re
	from typing import Any, Dict, List

	+from rdflib import RDF, BNode, Graph, Literal, URIRef
	+
	from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
	from swh.indexer.namespaces import SCHEMA
	from swh.indexer.storage.interface import Sha1

	from .base import BaseIntrinsicMapping, DirectoryLsEntry, XmlMapping
	+from .utils import add_list

	NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")

	with open(NUGET_TABLE_PATH) as fd:
	(CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd)

	+SPDX = URIRef("https://spdx.org/licenses/")
	+

	class NuGetMapping(XmlMapping, BaseIntrinsicMapping):
	"""
	@@ -26,8 +31,8 @@

	name = "nuget"
	mapping = NUGET_TABLE["NuGet"]
	- mapping["copyright"] = "http://schema.org/copyrightNotice"
	- mapping["language"] = "http://schema.org/inLanguage"
	+ mapping["copyright"] = URIRef("http://schema.org/copyrightNotice")
	+ mapping["language"] = URIRef("http://schema.org/inLanguage")
	string_fields = [
	"description",
	"version",
	@@ -53,12 +58,12 @@

	def normalize_projectUrl(self, s):
	if isinstance(s, str):
	- return {"@id": s}
	+ return URIRef(s)

	- def translate_repository(self, translated_metadata, v):
	+ def translate_repository(self, graph, root, v):
	if isinstance(v, dict) and isinstance(v["@url"], str):
	- codemeta_key = self.mapping["repository.url"]
	- translated_metadata[codemeta_key] = {"@id": v["@url"]}
	+ codemeta_key = URIRef(self.mapping["repository.url"])
	+ graph.add((root, codemeta_key, URIRef(v["@url"])))

	def normalize_license(self, v):
	if isinstance(v, dict) and v["@type"] == "expression":
	@@ -67,7 +72,7 @@
	re.search(r" with \|$\|$\| and ", license_string, re.IGNORECASE)
	):
	return [
	- {"@id": "https://spdx.org/licenses/" + license_type.strip()}
	+ SPDX + license_type.strip()
	for license_type in re.split(
	r" or ", license_string, flags=re.IGNORECASE
	)
	@@ -77,22 +82,23 @@

	def normalize_licenseUrl(self, s):
	if isinstance(s, str):
	- return {"@id": s}
	+ return URIRef(s)

	- def normalize_authors(self, s):
	+ def translate_authors(self, graph: Graph, root, s):
	if isinstance(s, str):
	- author_names = [a.strip() for a in s.split(",")]
	- authors = [
	- {"@type": SCHEMA.Person, SCHEMA.name: name} for name in author_names
	- ]
	- return {"@list": authors}
	-
	- def translate_releaseNotes(self, translated_metadata, s):
	+ authors = []
	+ for author_name in s.split(","):
	+ author_name = author_name.strip()
	+ author = BNode()
	+ graph.add((author, RDF.type, SCHEMA.Person))
	+ graph.add((author, SCHEMA.name, Literal(author_name)))
	+ authors.append(author)
	+ add_list(graph, root, SCHEMA.author, authors)
	+
	+ def translate_releaseNotes(self, graph: Graph, root, s):
	if isinstance(s, str):
	- translated_metadata.setdefault("http://schema.org/releaseNotes", []).append(
	- s
	- )
	+ graph.add((root, SCHEMA.releaseNotes, Literal(s)))

	def normalize_tags(self, s):
	if isinstance(s, str):
	- return s.split(" ")
	+ return [Literal(tag) for tag in s.split(" ")]
	diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py
	--- a/swh/indexer/metadata_dictionary/python.py
	+++ b/swh/indexer/metadata_dictionary/python.py
	@@ -1,16 +1,18 @@
	-# Copyright (C) 2018-2019 The Software Heritage developers
	+# Copyright (C) 2018-2021 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	import email.parser
	import email.policy
	-import itertools
	+
	+from rdflib import BNode, Literal, URIRef

	from swh.indexer.codemeta import CROSSWALK_TABLE
	-from swh.indexer.namespaces import SCHEMA
	+from swh.indexer.namespaces import RDF, SCHEMA

	from .base import DictMapping, SingleFileIntrinsicMapping
	+from .utils import add_list

	_normalize_pkginfo_key = str.lower

	@@ -54,25 +56,25 @@
	d.setdefault(key, []).append(value)
	return self._translate_dict(d)

	- def extra_translation(self, translated_metadata, d):
	- author_name = translated_metadata.pop(SCHEMA.author, None)
	- author_email = translated_metadata.pop(SCHEMA.email, None)
	- if author_name or author_email:
	- translated_metadata[SCHEMA.author] = {
	- "@list": [
	- {
	- "@type": SCHEMA.Person,
	- SCHEMA.name: author_name,
	- SCHEMA.email: author_email,
	- }
	- ]
	- }
	+ def extra_translation(self, graph, root, d):
	+ author_names = list(graph.triples((root, SCHEMA.author, None)))
	+ author_emails = list(graph.triples((root, SCHEMA.email, None)))
	+ graph.remove((root, SCHEMA.author, None))
	+ graph.remove((root, SCHEMA.email, None))
	+ if author_names or author_emails:
	+ author = BNode()
	+ graph.add((author, RDF.type, SCHEMA.Person))
	+ for (_, _, author_name) in author_names:
	+ graph.add((author, SCHEMA.name, author_name))
	+ for (_, _, author_email) in author_emails:
	+ graph.add((author, SCHEMA.email, author_email))
	+ add_list(graph, root, SCHEMA.author, [author])

	def normalize_home_page(self, urls):
	- return [{"@id": url} for url in urls]
	+ return [URIRef(url) for url in urls]

	def normalize_keywords(self, keywords):
	- return list(itertools.chain.from_iterable(s.split(" ") for s in keywords))
	+ return [Literal(keyword) for s in keywords for keyword in s.split(" ")]

	def normalize_license(self, licenses):
	- return [{"@id": license} for license in licenses]
	+ return [URIRef("https://spdx.org/licenses/" + license) for license in licenses]
	diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
	--- a/swh/indexer/metadata_dictionary/ruby.py
	+++ b/swh/indexer/metadata_dictionary/ruby.py
	@@ -8,19 +8,26 @@
	import re
	from typing import List

	+from rdflib import RDF, BNode, Graph, Literal, URIRef
	+
	from swh.indexer.codemeta import CROSSWALK_TABLE
	from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
	from swh.indexer.namespaces import SCHEMA
	from swh.indexer.storage.interface import Sha1

	from .base import BaseIntrinsicMapping, DictMapping
	+from .utils import add_map
	+
	+SPDX = URIRef("https://spdx.org/licenses/")


	-def name_to_person(name):
	- return {
	- "@type": SCHEMA.Person,
	- SCHEMA.name: name,
	- }
	+def name_to_person(graph: Graph, name):
	+ if not isinstance(name, str):
	+ return None
	+ author = BNode()
	+ graph.add((author, RDF.type, SCHEMA.Person))
	+ graph.add((author, SCHEMA.name, Literal(name)))
	+ return author


	class GemspecMapping(BaseIntrinsicMapping, DictMapping):
	@@ -107,30 +114,20 @@

	def normalize_homepage(self, s):
	if isinstance(s, str):
	- return {"@id": s}
	+ return URIRef(s)

	def normalize_license(self, s):
	if isinstance(s, str):
	- return [{"@id": "https://spdx.org/licenses/" + s}]
	+ return SPDX + s

	def normalize_licenses(self, licenses):
	if isinstance(licenses, list):
	- return [
	- {"@id": "https://spdx.org/licenses/" + license}
	- for license in licenses
	- if isinstance(license, str)
	- ]
	+ return [SPDX + license for license in licenses if isinstance(license, str)]

	- def normalize_author(self, author):
	+ def translate_author(self, graph: Graph, root, author):
	if isinstance(author, str):
	- return {"@list": [name_to_person(author)]}
	+ add_map(graph, root, SCHEMA.author, name_to_person, [author])

	- def normalize_authors(self, authors):
	+ def translate_authors(self, graph: Graph, root, authors):
	if isinstance(authors, list):
	- return {
	- "@list": [
	- name_to_person(author)
	- for author in authors
	- if isinstance(author, str)
	- ]
	- }
	+ add_map(graph, root, SCHEMA.author, name_to_person, authors)
	diff --git a/swh/indexer/metadata_dictionary/utils.py b/swh/indexer/metadata_dictionary/utils.py
	new file mode 100644
	--- /dev/null
	+++ b/swh/indexer/metadata_dictionary/utils.py
	@@ -0,0 +1,72 @@
	+# Copyright (C) 2022 The Software Heritage developers
	+# See the AUTHORS file at the top-level directory of this distribution
	+# License: GNU General Public License version 3, or any later version
	+# See top-level LICENSE file for more information
	+
	+
	+import json
	+from typing import Callable, Iterable, Optional, Sequence, TypeVar
	+
	+from pyld import jsonld
	+from rdflib import RDF, Graph, URIRef
	+import rdflib.term
	+
	+from swh.indexer.codemeta import _document_loader
	+
	+
	+def prettyprint_graph(graph: Graph, root: URIRef):
	+ s = graph.serialize(format="application/ld+json")
	+ jsonld_graph = json.loads(s)
	+ translated_metadata = jsonld.frame(
	+ jsonld_graph,
	+ {"@id": str(root)},
	+ options={
	+ "documentLoader": _document_loader,
	+ "processingMode": "json-ld-1.1",
	+ },
	+ )
	+ print(json.dumps(translated_metadata, indent=4))
	+
	+
	+def add_list(
	+ graph: Graph,
	+ subject: rdflib.term.Node,
	+ predicate: rdflib.term.Identifier,
	+ objects: Sequence[rdflib.term.Node],
	+) -> None:
	+ """Adds triples to the ``graph`` so that they are equivalent to this
	+ JSON-LD object::
	+
	+ {
	+ "@id": subject,
	+ predicate: {"@list": objects}
	+ }
	+
	+ This is a naive implementation of
	+ https://json-ld.org/spec/latest/json-ld-api/#list-to-rdf-conversion
	+ """
	+ # JSON-LD's @list is syntactic sugar for a linked list / chain in the RDF graph,
	+ # which is what we are going to construct, starting from the end:
	+ last_link: rdflib.term.Node
	+ last_link = RDF.nil
	+ for item in reversed(objects):
	+ link = rdflib.BNode()
	+ graph.add((link, RDF.first, item))
	+ graph.add((link, RDF.rest, last_link))
	+ last_link = link
	+ graph.add((subject, predicate, last_link))
	+
	+
	+TValue = TypeVar("TValue")
	+
	+
	+def add_map(
	+ graph: Graph,
	+ subject: rdflib.term.Node,
	+ predicate: rdflib.term.Identifier,
	+ f: Callable[[Graph, TValue], Optional[rdflib.term.Node]],
	+ values: Iterable[TValue],
	+) -> None:
	+ """Helper for :func:`add_list` that takes a mapper function ``f``."""
	+ nodes = [f(graph, value) for value in values]
	+ add_list(graph, subject, predicate, [node for node in nodes if node])
	diff --git a/swh/indexer/namespaces.py b/swh/indexer/namespaces.py
	--- a/swh/indexer/namespaces.py
	+++ b/swh/indexer/namespaces.py
	@@ -3,24 +3,8 @@
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	-
	-class _Namespace:
	- """Handy class to get terms within a namespace by accessing them as attributes.
	-
	- This is similar to `rdflib's namespaces
	- <https://rdflib.readthedocs.io/en/stable/namespaces_and_bindings.html>`__
	- """
	-
	- def __init__(self, uri: str):
	- if not uri.endswith(("#", "/")):
	- # Sanity check, to make sure it doesn't end with an alphanumerical
	- # character, which is very likely to be invalid.
	- raise ValueError(f"Invalid trailing character for namespace URI: {uri}")
	- self._uri = uri
	-
	- def __getattr__(self, term: str) -> str:
	- return self._uri + term
	-
	+from rdflib import Namespace as _Namespace
	+from rdflib import RDF # noqa

	SCHEMA = _Namespace("http://schema.org/")
	CODEMETA = _Namespace("https://codemeta.github.io/terms/")
	diff --git a/swh/indexer/tests/metadata_dictionary/test_cff.py b/swh/indexer/tests/metadata_dictionary/test_cff.py
	--- a/swh/indexer/tests/metadata_dictionary/test_cff.py
	+++ b/swh/indexer/tests/metadata_dictionary/test_cff.py
	@@ -1,4 +1,4 @@
	-# Copyright (C) 2017-2022 The Software Heritage developers
	+# Copyright (C) 2021-2022 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information
	@@ -44,6 +44,13 @@
	"utf-8"
	)

	+ result = MAPPINGS["CffMapping"]().translate(content)
	+ assert set(result.pop("keywords")) == {
	+ "citation",
	+ "bibliography",
	+ "cff",
	+ "CITATION.cff",
	+ }
	expected = {
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	@@ -76,12 +83,10 @@
	Citation File Format to various other formats such as BibTeX, EndNote, \
	RIS, schema.org, CodeMeta, and .zenodo.json.""",
	"identifier": "https://doi.org/10.5281/zenodo.1162057",
	- "keywords": ["citation", "bibliography", "cff", "CITATION.cff"],
	"license": "https://spdx.org/licenses/Apache-2.0",
	"version": "1.4.0-alpha0",
	}

	- result = MAPPINGS["CffMapping"]().translate(content)
	assert expected == result


	diff --git a/swh/indexer/tests/metadata_dictionary/test_composer.py b/swh/indexer/tests/metadata_dictionary/test_composer.py
	--- a/swh/indexer/tests/metadata_dictionary/test_composer.py
	+++ b/swh/indexer/tests/metadata_dictionary/test_composer.py
	@@ -60,11 +60,16 @@

	result = MAPPINGS["ComposerMapping"]().translate(raw_content)

	+ assert set(result.pop("keywords")) == {
	+ "polyfill",
	+ "shim",
	+ "compatibility",
	+ "portable",
	+ }, result
	expected = {
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "symfony/polyfill-mbstring",
	- "keywords": ["polyfill", "shim", "compatibility", "portable"],
	"description": "Symfony polyfill for the Mbstring extension",
	"url": "https://symfony.com",
	"license": "https://spdx.org/licenses/MIT",
	diff --git a/swh/indexer/tests/metadata_dictionary/test_dart.py b/swh/indexer/tests/metadata_dictionary/test_dart.py
	--- a/swh/indexer/tests/metadata_dictionary/test_dart.py
	+++ b/swh/indexer/tests/metadata_dictionary/test_dart.py
	@@ -3,6 +3,8 @@
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	+import pytest
	+
	from swh.indexer.metadata_dictionary import MAPPINGS


	@@ -41,17 +43,17 @@

	result = MAPPINGS["PubMapping"]().translate(raw_content)

	+ assert set(result.pop("keywords")) == {
	+ "polyfill",
	+ "shim",
	+ "compatibility",
	+ "portable",
	+ "mbstring",
	+ }, result
	expected = {
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "newtify",
	- "keywords": [
	- "polyfill",
	- "shim",
	- "compatibility",
	- "portable",
	- "mbstring",
	- ],
	"description": """Have you been turned into a newt? Would you like to be? \
	This package can help. It has all of the \
	newt-transmogrification functionality you have been looking \
	@@ -109,6 +111,7 @@
	assert result == expected


	+@pytest.mark.xfail(reason="https://github.com/w3c/json-ld-api/issues/547")
	def test_normalize_author_authors_pubspec():
	raw_content = """
	authors:
	diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py
	--- a/swh/indexer/tests/metadata_dictionary/test_github.py
	+++ b/swh/indexer/tests/metadata_dictionary/test_github.py
	@@ -120,7 +120,7 @@
	result = MAPPINGS["GitHubMapping"]().translate(content)
	assert result == {
	"@context": CONTEXT,
	- "type": "https://forgefed.org/ns#Repository",
	+ "type": "forge:Repository",
	"forge:forks": {
	"as:totalItems": 1,
	"type": "as:OrderedCollection",
	diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py
	--- a/swh/indexer/tests/metadata_dictionary/test_maven.py
	+++ b/swh/indexer/tests/metadata_dictionary/test_maven.py
	@@ -45,7 +45,7 @@
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "Maven Default Project",
	- "identifier": "com.mycompany.app",
	+ "schema:identifier": "com.mycompany.app",
	"version": "1.2.3",
	"license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
	"codeRepository": ("http://repo1.maven.org/maven2/com/mycompany/app/my-app"),
	@@ -167,7 +167,7 @@
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "Maven Default Project",
	- "identifier": "com.mycompany.app",
	+ "schema:identifier": "com.mycompany.app",
	"version": "1.2.3",
	"codeRepository": (
	"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
	@@ -191,7 +191,7 @@
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "Maven Default Project",
	- "identifier": "com.mycompany.app",
	+ "schema:identifier": "com.mycompany.app",
	"version": "1.2.3",
	"codeRepository": (
	"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
	@@ -211,7 +211,7 @@
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "Maven Default Project",
	- "identifier": "com.mycompany.app",
	+ "schema:identifier": "com.mycompany.app",
	"codeRepository": (
	"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
	),
	@@ -229,7 +229,7 @@
	assert result == {
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	- "identifier": "com.mycompany.app",
	+ "schema:identifier": "com.mycompany.app",
	"version": "1.2.3",
	"codeRepository": (
	"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
	@@ -251,7 +251,7 @@
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "Maven Default Project",
	- "identifier": "com.mycompany.app",
	+ "schema:identifier": "com.mycompany.app",
	"version": "1.2.3",
	"codeRepository": (
	"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
	@@ -288,7 +288,7 @@
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "Maven Default Project",
	- "identifier": "com.mycompany.app",
	+ "schema:identifier": "com.mycompany.app",
	"version": "1.2.3",
	"codeRepository": (
	"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
	@@ -336,20 +336,20 @@
	</licenses>
	</project>"""
	result = MAPPINGS["MavenMapping"]().translate(raw_content)
	+ assert set(result.pop("license")) == {
	+ "https://www.apache.org/licenses/LICENSE-2.0.txt",
	+ "https://opensource.org/licenses/MIT",
	+ }, result
	+ assert set(result.pop("codeRepository")) == {
	+ "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
	+ "http://example.org/maven2/com/mycompany/app/my-app",
	+ }, result
	assert result == {
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "Maven Default Project",
	- "identifier": "com.mycompany.app",
	+ "schema:identifier": "com.mycompany.app",
	"version": "1.2.3",
	- "license": [
	- "https://www.apache.org/licenses/LICENSE-2.0.txt",
	- "https://opensource.org/licenses/MIT",
	- ],
	- "codeRepository": [
	- "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
	- "http://example.org/maven2/com/mycompany/app/my-app",
	- ],
	}


	diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
	--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
	+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
	@@ -147,12 +147,6 @@
	"license": "https://spdx.org/licenses/Artistic-2.0",
	"version": "5.0.3",
	"name": "npm",
	- "keywords": [
	- "install",
	- "modules",
	- "package manager",
	- "package.json",
	- ],
	"url": "https://docs.npmjs.com/",
	},
	),
	@@ -160,6 +154,7 @@

	for result in results:
	del result.tool["id"]
	+ result.metadata.pop("keywords", None)

	# The assertion below returns False sometimes because of nested lists
	assert expected_results == results
	diff --git a/swh/indexer/tests/metadata_dictionary/test_nuget.py b/swh/indexer/tests/metadata_dictionary/test_nuget.py
	--- a/swh/indexer/tests/metadata_dictionary/test_nuget.py
	+++ b/swh/indexer/tests/metadata_dictionary/test_nuget.py
	@@ -35,7 +35,26 @@
	<file src="bin\\Debug\\*.dll" target="lib" />
	</files>
	</package>"""
	+
	result = MAPPINGS["NuGetMapping"]().translate(raw_content)
	+
	+ assert set(result.pop("keywords")) == {
	+ "python3",
	+ "java",
	+ "cpp",
	+ "search-tag",
	+ }, result
	+
	+ assert set(result.pop("license")) == {
	+ "https://spdx.org/licenses/MIT",
	+ "https://raw.github.com/timrwood/moment/master/LICENSE",
	+ }, result
	+
	+ assert set(result.pop("description")) == {
	+ "Sample exists only to show a sample .nuspec file.",
	+ "Summary is being deprecated. Use description instead.",
	+ }, result
	+
	expected = {
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	@@ -44,25 +63,11 @@
	{"type": "Person", "name": "Franck Halmaert"},
	],
	"codeRepository": "https://github.com/NuGet/NuGet.Client.git",
	- "description": [
	- "Sample exists only to show a sample .nuspec file.",
	- "Summary is being deprecated. Use description instead.",
	- ],
	- "license": [
	- "https://spdx.org/licenses/MIT",
	- "https://raw.github.com/timrwood/moment/master/LICENSE",
	- ],
	"url": "http://example.org/",
	"version": "1.2.3",
	"schema:releaseNotes": (
	"See the [changelog](https://github.com/httpie/httpie/releases/tag/3.2.0)."
	),
	- "keywords": [
	- "python3",
	- "java",
	- "cpp",
	- "search-tag",
	- ],
	}

	assert result == expected
	@@ -114,13 +119,13 @@
	</files>
	</package>"""
	result = MAPPINGS["NuGetMapping"]().translate(raw_content)
	+ assert set(result.pop("license")) == {
	+ "https://spdx.org/licenses/BitTorrent-1.0",
	+ "https://spdx.org/licenses/GPL-3.0-with-GCC-exception",
	+ }
	expected = {
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	- "license": [
	- "https://spdx.org/licenses/BitTorrent-1.0",
	- "https://spdx.org/licenses/GPL-3.0-with-GCC-exception",
	- ],
	}

	assert result == expected
	diff --git a/swh/indexer/tests/metadata_dictionary/test_python.py b/swh/indexer/tests/metadata_dictionary/test_python.py
	--- a/swh/indexer/tests/metadata_dictionary/test_python.py
	+++ b/swh/indexer/tests/metadata_dictionary/test_python.py
	@@ -38,7 +38,7 @@
	Provides-Extra: testing
	""" # noqa
	result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content)
	- assert result["description"] == [
	+ assert set(result.pop("description")) == {
	"Software Heritage core utilities", # note the comma here
	"swh-core\n"
	"========\n"
	@@ -49,8 +49,7 @@
	"- serialization\n"
	"- logging mechanism\n"
	"",
	- ], result
	- del result["description"]
	+ }, result
	assert result == {
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	@@ -91,11 +90,11 @@
	Keywords: foo bar baz
	""" # noqa
	result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content)
	+ assert set(result.pop("keywords")) == {"foo", "bar", "baz"}, result
	assert result == {
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "foo",
	- "keywords": ["foo", "bar", "baz"],
	}


	@@ -110,5 +109,5 @@
	"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
	"type": "SoftwareSourceCode",
	"name": "foo",
	- "license": "MIT",
	+ "license": "https://spdx.org/licenses/MIT",
	}
	diff --git a/swh/indexer/tests/metadata_dictionary/test_ruby.py b/swh/indexer/tests/metadata_dictionary/test_ruby.py
	--- a/swh/indexer/tests/metadata_dictionary/test_ruby.py
	+++ b/swh/indexer/tests/metadata_dictionary/test_ruby.py
	@@ -4,6 +4,7 @@
	# See top-level LICENSE file for more information

	from hypothesis import HealthCheck, given, settings, strategies
	+import pytest

	from swh.indexer.metadata_dictionary import MAPPINGS

	@@ -39,6 +40,7 @@
	}


	+@pytest.mark.xfail(reason="https://github.com/w3c/json-ld-api/issues/547")
	def test_gemspec_two_author_fields():
	raw_content = b"""
	Gem::Specification.new do \|s\|
	diff --git a/swh/indexer/tests/test_codemeta.py b/swh/indexer/tests/test_codemeta.py
	--- a/swh/indexer/tests/test_codemeta.py
	+++ b/swh/indexer/tests/test_codemeta.py
	@@ -3,13 +3,11 @@
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	-import pytest
	-
	-from swh.indexer.codemeta import CROSSWALK_TABLE, merge_documents, merge_values
	+from swh.indexer.codemeta import CROSSWALK_TABLE, merge_documents


	def test_crosstable():
	- assert CROSSWALK_TABLE["NodeJS"] == {
	+ assert {k: str(v) for (k, v) in CROSSWALK_TABLE["NodeJS"].items()} == {
	"repository": "http://schema.org/codeRepository",
	"os": "http://schema.org/operatingSystem",
	"cpu": "http://schema.org/processorRequirements",
	@@ -28,32 +26,6 @@
	}


	-def test_merge_values():
	- assert merge_values("a", "b") == ["a", "b"]
	- assert merge_values(["a", "b"], "c") == ["a", "b", "c"]
	- assert merge_values("a", ["b", "c"]) == ["a", "b", "c"]
	-
	- assert merge_values({"@list": ["a"]}, {"@list": ["b"]}) == {"@list": ["a", "b"]}
	- assert merge_values({"@list": ["a", "b"]}, {"@list": ["c"]}) == {
	- "@list": ["a", "b", "c"]
	- }
	-
	- with pytest.raises(ValueError):
	- merge_values({"@list": ["a"]}, "b")
	- with pytest.raises(ValueError):
	- merge_values("a", {"@list": ["b"]})
	- with pytest.raises(ValueError):
	- merge_values({"@list": ["a"]}, ["b"])
	- with pytest.raises(ValueError):
	- merge_values(["a"], {"@list": ["b"]})
	-
	- assert merge_values("a", None) == "a"
	- assert merge_values(["a", "b"], None) == ["a", "b"]
	- assert merge_values(None, ["b", "c"]) == ["b", "c"]
	- assert merge_values({"@list": ["a"]}, None) == {"@list": ["a"]}
	- assert merge_values(None, {"@list": ["a"]}) == {"@list": ["a"]}
	-
	-
	def test_merge_documents():
	"""
	Test the creation of a coherent minimal metadata set

File Metadata

Mime Type: text/plain
Expires: Fri, Jun 20, 5:20 PM (1 w, 4 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3225333

D8279.id29898.diffNo OneTemporaryActions

D8279.id29898.diffView Options

File Metadata

Event Timeline

D8279.id29898.diff
No OneTemporary
Actions

D8279.id29898.diff
View Options