diff --git a/swh/indexer/data/Gitea.csv b/swh/indexer/data/Gitea.csv --- a/swh/indexer/data/Gitea.csv +++ b/swh/indexer/data/Gitea.csv @@ -1,5 +1,5 @@ Property,Gitea -codeRepository,html_url +codeRepository,clone_url programmingLanguage,languages runtimePlatform, targetProduct, diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -20,6 +20,9 @@ from swh.indexer.namespaces import RDF, SCHEMA from swh.indexer.storage.interface import Sha1 +TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" +"""Prefix used to generate temporary URIs for root nodes being translated.""" + class DirectoryLsEntry(TypedDict): target: Sha1 @@ -185,6 +188,21 @@ return simple_terms | complex_terms + def get_root_uri(self, content_dict: Dict) -> rdflib.URIRef: + """Returns a URI for the SoftwareSourceCode or Repository being described. + + The default implementation uses a temporary URI that is stripped before + normalization by :meth:`_translate_dict`. + """ + # The main object being described (the SoftwareSourceCode) may or may not + # have an id. + # If it does, it will need to be set by a subclass. + # If it doesn't, we temporarily use this URI to identify it. Unfortunately, + # we cannot use a blank node as we need to use it for JSON-LD framing later, + # and blank nodes cannot be used for framing in JSON-LD >= 1.1 + root_id = TMP_ROOT_URI_PREFIX + str(uuid.uuid4()) + return rdflib.URIRef(root_id) + def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]: """ Translates content by parsing content from a dict object @@ -200,16 +218,47 @@ """ graph = rdflib.Graph() - # The main object being described (the SoftwareSourceCode) does not necessarily - # may or may not have an id. - # Either way, we temporarily use this URI to identify it. 
Unfortunately, - # we cannot use a blank node as we need to use it for JSON-LD framing later, - # and blank nodes cannot be used for framing in JSON-LD >= 1.1 - root_id = ( - "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" - + str(uuid.uuid4()) + root = self.get_root_uri(content_dict) + + self._translate_to_graph(graph, root, content_dict) + + self.sanitize(graph) + + # Convert from rdflib's internal graph representation to JSON + s = graph.serialize(format="application/ld+json") + + # Load from JSON to a list of Python objects + jsonld_graph = json.loads(s) + + # Use JSON-LD framing to turn the graph into a rooted tree + # frame = {"@type": str(SCHEMA.SoftwareSourceCode)} + translated_metadata = jsonld.frame( + jsonld_graph, + {"@id": str(root)}, + options={ + "documentLoader": _document_loader, + "processingMode": "json-ld-1.1", + }, ) - root = rdflib.URIRef(root_id) + + # Remove the temporary id we added at the beginning + assert isinstance(translated_metadata["@id"], str) + if translated_metadata["@id"].startswith(TMP_ROOT_URI_PREFIX): + del translated_metadata["@id"] + + return self.normalize_translation(translated_metadata) + + def _translate_to_graph( + self, graph: rdflib.Graph, root: rdflib.term.BNode, content_dict: Dict + ) -> None: + """ + Translates content by parsing content from a dict object + and translating with the appropriate mapping to the graph passed as parameter + + Args: + content_dict (dict): content dict to translate + + """ graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode)) for k, v in content_dict.items(): @@ -270,33 +319,6 @@ self.extra_translation(graph, root, content_dict) - self.sanitize(graph) - - # Convert from rdflib's internal graph representation to JSON - s = graph.serialize(format="application/ld+json") - - # Load from JSON to a list of Python objects - jsonld_graph = json.loads(s) - - # Use JSON-LD framing to turn the graph into a rooted tree - # frame = {"@type": str(SCHEMA.SoftwareSourceCode)} - 
translated_metadata = jsonld.frame( - jsonld_graph, - {"@id": root_id}, - options={ - "documentLoader": _document_loader, - "processingMode": "json-ld-1.1", - }, - ) - - # Remove the temporary id we added at the beginning - if isinstance(translated_metadata["@id"], list): - translated_metadata["@id"].remove(root_id) - else: - del translated_metadata["@id"] - - return self.normalize_translation(translated_metadata) - def sanitize(self, graph: rdflib.Graph) -> None: # Remove triples that make PyLD crash for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))): diff --git a/swh/indexer/metadata_dictionary/gitea.py b/swh/indexer/metadata_dictionary/gitea.py --- a/swh/indexer/metadata_dictionary/gitea.py +++ b/swh/indexer/metadata_dictionary/gitea.py @@ -27,8 +27,8 @@ name = "gitea" mapping = GITEA_TABLE["Gitea"] uri_fields = [ - "html_url", "website", + "clone_url", ] date_fields = [ "created_at", @@ -49,6 +49,14 @@ graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode)) graph.add((root, RDF.type, FORGEFED.Repository)) + def get_root_uri(self, content_dict: dict) -> URIRef: + if isinstance(content_dict.get("html_url"), str): + return URIRef(content_dict["html_url"]) + else: + raise ValueError( + f"Unexpected html_url in Gitea/Gogs metadata: {content_dict}" + ) + @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None: """ diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/github.py @@ -21,10 +21,11 @@ mapping = { **CROSSWALK_TABLE["GitHub"], "topics": SCHEMA.keywords, # TODO: submit this to the official crosswalk + "clone_url": SCHEMA.codeRepository, } uri_fields = [ "archive_url", - "html_url", + "clone_url", "issues_url", ] date_fields = [ @@ -45,6 +46,12 @@ graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode)) graph.add((root, 
RDF.type, FORGEFED.Repository)) + def get_root_uri(self, content_dict: dict) -> URIRef: + if isinstance(content_dict.get("html_url"), str): + return URIRef(content_dict["html_url"]) + else: + raise ValueError(f"Unexpected html_url in GitHub metadata: {content_dict}") + @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None: """ diff --git a/swh/indexer/tests/metadata_dictionary/test_gitea.py b/swh/indexer/tests/metadata_dictionary/test_gitea.py --- a/swh/indexer/tests/metadata_dictionary/test_gitea.py +++ b/swh/indexer/tests/metadata_dictionary/test_gitea.py @@ -121,6 +121,7 @@ assert result == { "@context": CONTEXT, "type": "forge:Repository", + "id": "https://codeberg.org/ForgeFed/ForgeFed", "forge:forks": { "as:totalItems": 6, "type": "as:OrderedCollection", @@ -135,7 +136,7 @@ }, "name": "ForgeFed", "description": "ActivityPub-based forge federation protocol specification", - "codeRepository": "https://codeberg.org/ForgeFed/ForgeFed", + "codeRepository": "https://codeberg.org/ForgeFed/ForgeFed.git", "dateCreated": "2022-06-13T18:54:26+02:00", "dateModified": "2022-09-02T03:57:22+02:00", "url": "https://forgefed.org", diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py --- a/swh/indexer/tests/metadata_dictionary/test_github.py +++ b/swh/indexer/tests/metadata_dictionary/test_github.py @@ -119,6 +119,7 @@ assert result == { "@context": CONTEXT, "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", "forge:forks": { "as:totalItems": 1, "type": "as:OrderedCollection", @@ -134,7 +135,7 @@ "license": "https://spdx.org/licenses/GPL-3.0", "name": "SoftwareHeritage/swh-indexer", "description": "GitHub mirror of Metadata indexer", - "codeRepository": "https://github.com/SoftwareHeritage/swh-indexer", + "codeRepository": "https://github.com/SoftwareHeritage/swh-indexer.git", "dateCreated": 
"2017-01-31T13:05:39Z", "dateModified": "2022-06-22T08:02:20Z", } @@ -143,6 +144,7 @@ def test_github_topics(): content = b""" { + "html_url": "https://github.com/SoftwareHeritage/swh-indexer", "topics": [ "foo", "bar" @@ -154,4 +156,5 @@ assert result == { "@context": CONTEXT, "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", }