Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary/base.py
Show All 14 Lines | |||||
from typing_extensions import TypedDict | from typing_extensions import TypedDict | ||||
import xmltodict | import xmltodict | ||||
import yaml | import yaml | ||||
from swh.indexer.codemeta import _document_loader, compact | from swh.indexer.codemeta import _document_loader, compact | ||||
from swh.indexer.namespaces import RDF, SCHEMA | from swh.indexer.namespaces import RDF, SCHEMA | ||||
from swh.indexer.storage.interface import Sha1 | from swh.indexer.storage.interface import Sha1 | ||||
TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" | |||||
"""Prefix used to generate temporary URIs for root nodes being translated.""" | |||||
class DirectoryLsEntry(TypedDict): | class DirectoryLsEntry(TypedDict): | ||||
target: Sha1 | target: Sha1 | ||||
sha1: Sha1 | sha1: Sha1 | ||||
name: bytes | name: bytes | ||||
type: str | type: str | ||||
▲ Show 20 Lines • Show All 149 Lines • ▼ Show 20 Lines | def supported_terms(cls): | ||||
str(term) | str(term) | ||||
for meth_name in dir(cls) | for meth_name in dir(cls) | ||||
if meth_name.startswith("translate_") | if meth_name.startswith("translate_") | ||||
for term in getattr(getattr(cls, meth_name), "produced_terms", []) | for term in getattr(getattr(cls, meth_name), "produced_terms", []) | ||||
} | } | ||||
return simple_terms | complex_terms | return simple_terms | complex_terms | ||||
def get_root_uri(self, content_dict: Dict) -> rdflib.URIRef: | |||||
"""Returns an URI for the SoftwareSourceCode or Repository being described. | |||||
The default implementation uses a temporary URI that is stripped before | |||||
normalization by :meth:`_translate_dict`. | |||||
""" | |||||
# The main object being described (the SoftwareSourceCode) may or may not | |||||
# have an id. | |||||
# If it does, it will need to be set by a subclass. | |||||
# If it doesn't, we temporarily use this URI to identify it. Unfortunately, | |||||
# we cannot use a blank node as we need to use it for JSON-LD framing later, | |||||
# and blank nodes cannot be used for framing in JSON-LD >= 1.1. | |||||
root_id = TMP_ROOT_URI_PREFIX + str(uuid.uuid4()) | |||||
return rdflib.URIRef(root_id) | |||||
def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]: | def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]: | ||||
""" | """ | ||||
Translates content by parsing content from a dict object | Translates content by parsing content from a dict object | ||||
and translating with the appropriate mapping | and translating with the appropriate mapping | ||||
Args: | Args: | ||||
content_dict (dict): content dict to translate | content_dict (dict): content dict to translate | ||||
Returns: | Returns: | ||||
dict: translated metadata in json-friendly form needed for | dict: translated metadata in json-friendly form needed for | ||||
the indexer | the indexer | ||||
""" | """ | ||||
graph = rdflib.Graph() | graph = rdflib.Graph() | ||||
# The main object being described (the SoftwareSourceCode) does not necessarily | root = self.get_root_uri(content_dict) | ||||
# may or may not have an id. | |||||
# Either way, we temporarily use this URI to identify it. Unfortunately, | self._translate_to_graph(graph, root, content_dict) | ||||
# we cannot use a blank node as we need to use it for JSON-LD framing later, | |||||
# and blank nodes cannot be used for framing in JSON-LD >= 1.1 | self.sanitize(graph) | ||||
root_id = ( | |||||
"https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" | # Convert from rdflib's internal graph representation to JSON | ||||
+ str(uuid.uuid4()) | s = graph.serialize(format="application/ld+json") | ||||
# Load from JSON to a list of Python objects | |||||
jsonld_graph = json.loads(s) | |||||
# Use JSON-LD framing to turn the graph into a rooted tree | |||||
# frame = {"@type": str(SCHEMA.SoftwareSourceCode)} | |||||
translated_metadata = jsonld.frame( | |||||
jsonld_graph, | |||||
{"@id": str(root)}, | |||||
options={ | |||||
"documentLoader": _document_loader, | |||||
"processingMode": "json-ld-1.1", | |||||
}, | |||||
) | ) | ||||
root = rdflib.URIRef(root_id) | |||||
# Remove the temporary id we added at the beginning | |||||
assert isinstance(translated_metadata["@id"], str) | |||||
if translated_metadata["@id"].startswith(TMP_ROOT_URI_PREFIX): | |||||
del translated_metadata["@id"] | |||||
return self.normalize_translation(translated_metadata) | |||||
def _translate_to_graph( | |||||
self, graph: rdflib.Graph, root: rdflib.term.Identifier, content_dict: Dict | |||||
) -> None: | |||||
""" | |||||
Translates content by parsing content from a dict object | |||||
and translating with the appropriate mapping to the graph passed as parameter | |||||
Args: | |||||
content_dict (dict): content dict to translate | |||||
""" | |||||
graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode)) | graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode)) | ||||
for k, v in content_dict.items(): | for k, v in content_dict.items(): | ||||
# First, check if there is a specific translation | # First, check if there is a specific translation | ||||
# method for this key | # method for this key | ||||
translation_method = getattr( | translation_method = getattr( | ||||
self, "translate_" + self._normalize_method_name(k), None | self, "translate_" + self._normalize_method_name(k), None | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines | ) -> None: | ||||
parsed_url = urllib.parse.urlparse(item) | parsed_url = urllib.parse.urlparse(item) | ||||
if parsed_url.netloc: | if parsed_url.netloc: | ||||
graph.add((root, codemeta_key, rdflib.URIRef(item))) | graph.add((root, codemeta_key, rdflib.URIRef(item))) | ||||
else: | else: | ||||
continue | continue | ||||
self.extra_translation(graph, root, content_dict) | self.extra_translation(graph, root, content_dict) | ||||
self.sanitize(graph) | |||||
# Convert from rdflib's internal graph representation to JSON | |||||
s = graph.serialize(format="application/ld+json") | |||||
# Load from JSON to a list of Python objects | |||||
jsonld_graph = json.loads(s) | |||||
# Use JSON-LD framing to turn the graph into a rooted tree | |||||
# frame = {"@type": str(SCHEMA.SoftwareSourceCode)} | |||||
translated_metadata = jsonld.frame( | |||||
jsonld_graph, | |||||
{"@id": root_id}, | |||||
options={ | |||||
"documentLoader": _document_loader, | |||||
"processingMode": "json-ld-1.1", | |||||
}, | |||||
) | |||||
# Remove the temporary id we added at the beginning | |||||
if isinstance(translated_metadata["@id"], list): | |||||
translated_metadata["@id"].remove(root_id) | |||||
else: | |||||
del translated_metadata["@id"] | |||||
return self.normalize_translation(translated_metadata) | |||||
def sanitize(self, graph: rdflib.Graph) -> None: | def sanitize(self, graph: rdflib.Graph) -> None: | ||||
# Remove triples that make PyLD crash | # Remove triples that make PyLD crash | ||||
for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))): | for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))): | ||||
graph.remove((subject, predicate, rdflib.URIRef(""))) | graph.remove((subject, predicate, rdflib.URIRef(""))) | ||||
# Should not happen, but we'd better check as this may lead to incorrect data | # Should not happen, but we'd better check as this may lead to incorrect data | ||||
invalid = False | invalid = False | ||||
for triple in graph.triples((rdflib.URIRef(""), None, None)): | for triple in graph.triples((rdflib.URIRef(""), None, None)): | ||||
▲ Show 20 Lines • Show All 76 Lines • Show Last 20 Lines |