diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst --- a/docs/metadata-workflow.rst +++ b/docs/metadata-workflow.rst @@ -69,7 +69,11 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Intrinsic metadata is extracted from files provided with a project's source -code, and translated using `CodeMeta`_'s `crosswalk table`_. +code, and translated using `CodeMeta`_'s `crosswalk table`_, which is vendored +in :file:`swh/indexer/data/codemeta/codemeta.csv`. +Ecosystems not yet included in CodeMeta's crosswalk have their own +:file:`swh/indexer/data/*.csv` file, with one row for each CodeMeta property, +even when not supported by the ecosystem. All input formats supported so far are straightforward dictionaries (eg. JSON) or can be accessed as such (eg. XML); and the first part of the translation is diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -9,7 +9,7 @@ import json import os.path import re -from typing import Any, List +from typing import Any, Dict, List, Set, TextIO, Tuple from pyld import jsonld import rdflib @@ -66,7 +66,15 @@ return uri -def _read_crosstable(fd): +def read_crosstable(fd: TextIO) -> Tuple[Set[str], Dict[str, Dict[str, rdflib.URIRef]]]: + """ + Given a file-like object to a `CodeMeta crosswalk table`_ (either the main + cross-table with all columns, or an auxiliary table with just the CodeMeta + column and one ecosystem-specific column); returns a set of all CodeMeta + terms, and a dictionary ``{ecosystem: {ecosystem_term: codemeta_term}}`` + + .. _CodeMeta crosswalk table: rdflib.URIRef: + """Returns a URI for the SoftwareSourceCode or Repository being described. + + The default implementation uses a temporary URI that is stripped before + normalization by :meth:`_translate_dict`. + """ + # The main object being described (the SoftwareSourceCode) + # may or may not have an id. 
+ # If it does, it will need to be set by a subclass. + # If it doesn't we temporarily use this URI to identify it. Unfortunately, + # we cannot use a blank node as we need to use it for JSON-LD framing later, + # and blank nodes cannot be used for framing in JSON-LD >= 1.1 + root_id = TMP_ROOT_URI_PREFIX + str(uuid.uuid4()) + return rdflib.URIRef(root_id) + def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]: """ Translates content by parsing content from a dict object @@ -196,16 +218,47 @@ """ graph = rdflib.Graph() - # The main object being described (the SoftwareSourceCode) does not necessarily - # may or may not have an id. - # Either way, we temporarily use this URI to identify it. Unfortunately, - # we cannot use a blank node as we need to use it for JSON-LD framing later, - # and blank nodes cannot be used for framing in JSON-LD >= 1.1 - root_id = ( - "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" - + str(uuid.uuid4()) + root = self.get_root_uri(content_dict) + + self._translate_to_graph(graph, root, content_dict) + + self.sanitize(graph) + + # Convert from rdflib's internal graph representation to JSON + s = graph.serialize(format="application/ld+json") + + # Load from JSON to a list of Python objects + jsonld_graph = json.loads(s) + + # Use JSON-LD framing to turn the graph into a rooted tree + # frame = {"@type": str(SCHEMA.SoftwareSourceCode)} + translated_metadata = jsonld.frame( + jsonld_graph, + {"@id": str(root)}, + options={ + "documentLoader": _document_loader, + "processingMode": "json-ld-1.1", + }, ) - root = rdflib.URIRef(root_id) + + # Remove the temporary id we added at the beginning + assert isinstance(translated_metadata["@id"], str) + if translated_metadata["@id"].startswith(TMP_ROOT_URI_PREFIX): + del translated_metadata["@id"] + + return self.normalize_translation(translated_metadata) + + def _translate_to_graph( + self, graph: rdflib.Graph, root: rdflib.term.Identifier, content_dict: Dict + ) -> None: + """ 
+ Translates content by parsing content from a dict object + and translating with the appropriate mapping to the graph passed as parameter + + Args: + content_dict (dict): content dict to translate + + """ graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode)) for k, v in content_dict.items(): @@ -240,6 +293,14 @@ elif k in self.string_fields and isinstance(v, list): for item in v: graph.add((root, codemeta_key, rdflib.Literal(item))) + elif k in self.date_fields and isinstance(v, str): + typed_v = rdflib.Literal(v, datatype=SCHEMA.Date) + graph.add((root, codemeta_key, typed_v)) + elif k in self.date_fields and isinstance(v, list): + for item in v: + if isinstance(item, str): + typed_item = rdflib.Literal(item, datatype=SCHEMA.Date) + graph.add((root, codemeta_key, typed_item)) elif k in self.uri_fields and isinstance(v, str): # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop # URLs that are blatantly invalid early, so PyLD does not crash. @@ -258,33 +319,6 @@ self.extra_translation(graph, root, content_dict) - self.sanitize(graph) - - # Convert from rdflib's internal graph representation to JSON - s = graph.serialize(format="application/ld+json") - - # Load from JSON to a list of Python objects - jsonld_graph = json.loads(s) - - # Use JSON-LD framing to turn the graph into a rooted tree - # frame = {"@type": str(SCHEMA.SoftwareSourceCode)} - translated_metadata = jsonld.frame( - jsonld_graph, - {"@id": root_id}, - options={ - "documentLoader": _document_loader, - "processingMode": "json-ld-1.1", - }, - ) - - # Remove the temporary id we added at the beginning - if isinstance(translated_metadata["@id"], list): - translated_metadata["@id"].remove(root_id) - else: - del translated_metadata["@id"] - - return self.normalize_translation(translated_metadata) - def sanitize(self, graph: rdflib.Graph) -> None: # Remove triples that make PyLD crash for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))): diff --git 
a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py --- a/swh/indexer/metadata_dictionary/cff.py +++ b/swh/indexer/metadata_dictionary/cff.py @@ -25,6 +25,7 @@ filename = b"CITATION.cff" mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"] string_fields = ["keywords", "license", "abstract", "version", "doi"] + date_fields = ["date-released"] uri_fields = ["repository-code"] def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node: @@ -57,7 +58,3 @@ def normalize_license(self, s: str) -> URIRef: if isinstance(s, str): return SPDX + s - - def normalize_date_released(self, s: str) -> Literal: - if isinstance(s, str): - return Literal(s, datatype=SCHEMA.Date) diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py --- a/swh/indexer/metadata_dictionary/composer.py +++ b/swh/indexer/metadata_dictionary/composer.py @@ -8,7 +8,7 @@ from rdflib import BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import RDF, SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping @@ -20,7 +20,7 @@ COMPOSER_TABLE_PATH = os.path.join(_DATA_DIR, "composer.csv") with open(COMPOSER_TABLE_PATH) as fd: - (CODEMETA_TERMS, COMPOSER_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, COMPOSER_TABLE) = read_crosstable(fd) class ComposerMapping(JsonMapping, SingleFileIntrinsicMapping): diff --git a/swh/indexer/metadata_dictionary/dart.py b/swh/indexer/metadata_dictionary/dart.py --- a/swh/indexer/metadata_dictionary/dart.py +++ b/swh/indexer/metadata_dictionary/dart.py @@ -8,7 +8,7 @@ from rdflib import RDF, BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import SCHEMA from .base import YamlMapping @@ -19,7 +19,7 @@ 
PUB_TABLE_PATH = os.path.join(_DATA_DIR, "pubspec.csv") with open(PUB_TABLE_PATH) as fd: - (CODEMETA_TERMS, PUB_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, PUB_TABLE) = read_crosstable(fd) def name_to_person(name): diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/gitea.py copy from swh/indexer/metadata_dictionary/github.py copy to swh/indexer/metadata_dictionary/gitea.py --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/gitea.py @@ -3,11 +3,12 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import os from typing import Any, Tuple from rdflib import RDF, BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import CROSSWALK_TABLE +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA from .base import BaseExtrinsicMapping, JsonMapping, produce_terms @@ -16,38 +17,53 @@ SPDX = URIRef("https://spdx.org/licenses/") -class GitHubMapping(BaseExtrinsicMapping, JsonMapping): - name = "github" - mapping = { - **CROSSWALK_TABLE["GitHub"], - "topics": SCHEMA.keywords, # TODO: submit this to the official crosswalk - } - string_fields = [ - "archive_url", +GITEA_TABLE_PATH = os.path.join(_DATA_DIR, "Gitea.csv") + +with open(GITEA_TABLE_PATH) as fd: + (CODEMETA_TERMS, GITEA_TABLE) = read_crosstable(fd) + + +class GiteaMapping(BaseExtrinsicMapping, JsonMapping): + name = "gitea" + mapping = GITEA_TABLE["Gitea"] + uri_fields = [ + "website", + "clone_url", + ] + date_fields = [ "created_at", "updated_at", - "description", + ] + string_fields = [ + "name", "full_name", - "html_url", - "issues_url", - "topics", + "languages", + "description", ] @classmethod def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: - return ("application/vnd.github.v3+json",) + return ("gitea-project-json", "gogs-project-json") def extra_translation(self, graph, root, 
content_dict): graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode)) graph.add((root, RDF.type, FORGEFED.Repository)) + def get_root_uri(self, content_dict: dict) -> URIRef: + if isinstance(content_dict.get("html_url"), str): + return URIRef(content_dict["html_url"]) + else: + raise ValueError( + f"Gitea/Gogs metadata has invalid/missing html_url: {content_dict}" + ) + @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None: """ >>> graph = Graph() >>> root = URIRef("http://example.org/test-software") - >>> GitHubMapping().translate_forks_count(graph, root, 42) + >>> GiteaMapping().translate_forks_count(graph, root, 42) >>> prettyprint_graph(graph, root) { "@id": ..., @@ -64,12 +80,12 @@ graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) @produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems) - def translate_stargazers_count(self, graph: Graph, root: BNode, v: Any) -> None: + def translate_stars_count(self, graph: Graph, root: BNode, v: Any) -> None: """ >>> graph = Graph() >>> root = URIRef("http://example.org/test-software") - >>> GitHubMapping().translate_stargazers_count(graph, root, 42) + >>> GiteaMapping().translate_stars_count(graph, root, 42) >>> prettyprint_graph(graph, root) { "@id": ..., @@ -91,7 +107,7 @@ >>> graph = Graph() >>> root = URIRef("http://example.org/test-software") - >>> GitHubMapping().translate_watchers_count(graph, root, 42) + >>> GiteaMapping().translate_watchers_count(graph, root, 42) >>> prettyprint_graph(graph, root) { "@id": ..., @@ -106,12 +122,3 @@ graph.add((root, ACTIVITYSTREAMS.followers, collection)) graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection)) graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) - - def normalize_license(self, d): - """ - - >>> GitHubMapping().normalize_license({'spdx_id': 'MIT'}) - rdflib.term.URIRef('https://spdx.org/licenses/MIT') - """ - if isinstance(d, dict) and 
isinstance(d.get("spdx_id"), str): - return SPDX + d["spdx_id"] diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/github.py @@ -21,15 +21,20 @@ mapping = { **CROSSWALK_TABLE["GitHub"], "topics": SCHEMA.keywords, # TODO: submit this to the official crosswalk + "clone_url": SCHEMA.codeRepository, } - string_fields = [ + uri_fields = [ "archive_url", + "clone_url", + "issues_url", + ] + date_fields = [ "created_at", "updated_at", + ] + string_fields = [ "description", "full_name", - "html_url", - "issues_url", "topics", ] @@ -41,6 +46,14 @@ graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode)) graph.add((root, RDF.type, FORGEFED.Repository)) + def get_root_uri(self, content_dict: dict) -> URIRef: + if isinstance(content_dict.get("html_url"), str): + return URIRef(content_dict["html_url"]) + else: + raise ValueError( + f"GitHub metadata has missing/invalid html_url: {content_dict}" + ) + @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None: """ diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py --- a/swh/indexer/metadata_dictionary/nuget.py +++ b/swh/indexer/metadata_dictionary/nuget.py @@ -9,7 +9,7 @@ from rdflib import RDF, BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import SCHEMA from swh.indexer.storage.interface import Sha1 @@ -19,7 +19,7 @@ NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv") with open(NUGET_TABLE_PATH) as fd: - (CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, NUGET_TABLE) = read_crosstable(fd) SPDX = URIRef("https://spdx.org/licenses/") diff --git a/swh/indexer/tests/metadata_dictionary/test_gitea.py 
b/swh/indexer/tests/metadata_dictionary/test_gitea.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_gitea.py @@ -0,0 +1,143 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.indexer.metadata_dictionary import MAPPINGS + +CONTEXT = [ + "https://doi.org/10.5063/schema/codemeta-2.0", + { + "as": "https://www.w3.org/ns/activitystreams#", + "forge": "https://forgefed.org/ns#", + }, +] + + +def test_compute_metadata_none(): + """ + testing that translating empty content + should return None + """ + content = b"" + + # None if no metadata was found or an error occurred + declared_metadata = None + result = MAPPINGS["GiteaMapping"]().translate(content) + assert declared_metadata == result + + +def test_supported_terms(): + terms = MAPPINGS["GiteaMapping"].supported_terms() + assert { + "http://schema.org/name", + "http://schema.org/dateCreated", + "https://forgefed.org/ns#forks", + "https://www.w3.org/ns/activitystreams#totalItems", + } <= terms + + +def test_compute_metadata_gitea(): + content = b""" +{ + "id": 48043, + "owner": { + "id": 48018, + "login": "ForgeFed", + "full_name": "", + "email": "", + "avatar_url": "https://codeberg.org/avatars/c20f7a6733a6156304137566ee35ef33", + "language": "", + "is_admin": false, + "last_login": "0001-01-01T00:00:00Z", + "created": "2022-04-30T20:13:17+02:00", + "restricted": false, + "active": false, + "prohibit_login": false, + "location": "", + "website": "https://forgefed.org/", + "description": "", + "visibility": "public", + "followers_count": 0, + "following_count": 0, + "starred_repos_count": 0, + "username": "ForgeFed" + }, + "name": "ForgeFed", + "full_name": "ForgeFed/ForgeFed", + "description": "ActivityPub-based forge federation protocol specification", + "empty": false, 
"private": false, + "fork": false, + "template": false, + "parent": null, + "mirror": false, + "size": 3780, + "language": "CSS", + "languages_url": "https://codeberg.org/api/v1/repos/ForgeFed/ForgeFed/languages", + "html_url": "https://codeberg.org/ForgeFed/ForgeFed", + "ssh_url": "git@codeberg.org:ForgeFed/ForgeFed.git", + "clone_url": "https://codeberg.org/ForgeFed/ForgeFed.git", + "original_url": "https://notabug.org/peers/forgefed", + "website": "https://forgefed.org", + "stars_count": 30, + "forks_count": 6, + "watchers_count": 11, + "open_issues_count": 61, + "open_pr_counter": 10, + "release_counter": 0, + "default_branch": "main", + "archived": false, + "created_at": "2022-06-13T18:54:26+02:00", + "updated_at": "2022-09-02T03:57:22+02:00", + "permissions": { + "admin": false, + "push": false, + "pull": true + }, + "has_issues": true, + "internal_tracker": { + "enable_time_tracker": true, + "allow_only_contributors_to_track_time": true, + "enable_issue_dependencies": true + }, + "has_wiki": false, + "has_pull_requests": true, + "has_projects": true, + "ignore_whitespace_conflicts": false, + "allow_merge_commits": false, + "allow_rebase": false, + "allow_rebase_explicit": false, + "allow_squash_merge": true, + "default_merge_style": "squash", + "avatar_url": "", + "internal": false, + "mirror_interval": "", + "mirror_updated": "0001-01-01T00:00:00Z", + "repo_transfer": null +} + """ + result = MAPPINGS["GiteaMapping"]().translate(content) + assert result == { + "@context": CONTEXT, + "type": "forge:Repository", + "id": "https://codeberg.org/ForgeFed/ForgeFed", + "forge:forks": { + "as:totalItems": 6, + "type": "as:OrderedCollection", + }, + "as:likes": { + "as:totalItems": 30, + "type": "as:Collection", + }, + "as:followers": { + "as:totalItems": 11, + "type": "as:Collection", + }, + "name": "ForgeFed", + "description": "ActivityPub-based forge federation protocol specification", + "codeRepository": "https://codeberg.org/ForgeFed/ForgeFed.git", + 
"dateCreated": "2022-06-13T18:54:26+02:00", + "dateModified": "2022-09-02T03:57:22+02:00", + "url": "https://forgefed.org", + } diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py --- a/swh/indexer/tests/metadata_dictionary/test_github.py +++ b/swh/indexer/tests/metadata_dictionary/test_github.py @@ -32,6 +32,7 @@ assert { "http://schema.org/name", "http://schema.org/license", + "http://schema.org/dateCreated", "https://forgefed.org/ns#forks", "https://www.w3.org/ns/activitystreams#totalItems", } <= terms @@ -118,6 +119,7 @@ assert result == { "@context": CONTEXT, "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", "forge:forks": { "as:totalItems": 1, "type": "as:OrderedCollection", @@ -133,15 +135,16 @@ "license": "https://spdx.org/licenses/GPL-3.0", "name": "SoftwareHeritage/swh-indexer", "description": "GitHub mirror of Metadata indexer", - "schema:codeRepository": "https://github.com/SoftwareHeritage/swh-indexer", - "schema:dateCreated": "2017-01-31T13:05:39Z", - "schema:dateModified": "2022-06-22T08:02:20Z", + "codeRepository": "https://github.com/SoftwareHeritage/swh-indexer.git", + "dateCreated": "2017-01-31T13:05:39Z", + "dateModified": "2022-06-22T08:02:20Z", } def test_github_topics(): content = b""" { + "html_url": "https://github.com/SoftwareHeritage/swh-indexer", "topics": [ "foo", "bar" @@ -153,4 +156,5 @@ assert result == { "@context": CONTEXT, "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", } diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py --- a/swh/indexer/tests/metadata_dictionary/test_npm.py +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -294,6 +294,20 @@ } +def test_npm_author(): + package_json = rb"""{ + "version": "1.0.0", + "author": "Foo Bar (@example)" +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + 
assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "Foo Bar", "type": "Person"}], + "version": "1.0.0", + } + + def test_npm_invalid_uris(): package_json = rb"""{ "version": "1.0.0", diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -110,6 +110,7 @@ "codemeta", "composer", "gemspec", + "gitea", "github", "json-sword-codemeta", "maven", @@ -749,6 +750,7 @@ mappings=["github"], metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/", "type": "https://forgefed.org/ns#Repository", "name": "test software", }, diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -74,7 +74,7 @@ version="1.0.0", ), format="application/vnd.github.v3+json", - metadata=b'{"full_name": "test software"}', + metadata=b'{"full_name": "test software", "html_url": "http://example.org/"}', ) @@ -237,6 +237,7 @@ tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/", "type": "https://forgefed.org/ns#Repository", "name": "test software", },