diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -6,7 +6,6 @@ import json import logging from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar -import urllib.parse import uuid import xml.parsers.expat @@ -20,6 +19,8 @@ from swh.indexer.namespaces import RDF, SCHEMA from swh.indexer.storage.interface import Sha1 +from .utils import add_url_if_valid + TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" """Prefix used to generate temporary URIs for root nodes being translated.""" @@ -285,9 +286,15 @@ pass elif isinstance(v, list): for item in reversed(v): - graph.add((root, codemeta_key, item)) + if isinstance(item, rdflib.URIRef): + add_url_if_valid(graph, root, codemeta_key, str(item)) + else: + graph.add((root, codemeta_key, item)) else: - graph.add((root, codemeta_key, v)) + if isinstance(v, rdflib.URIRef): + add_url_if_valid(graph, root, codemeta_key, str(v)) + else: + graph.add((root, codemeta_key, v)) elif k in self.string_fields and isinstance(v, str): graph.add((root, codemeta_key, rdflib.Literal(v))) elif k in self.string_fields and isinstance(v, list): @@ -302,18 +309,10 @@ typed_item = rdflib.Literal(item, datatype=SCHEMA.Date) graph.add((root, codemeta_key, typed_item)) elif k in self.uri_fields and isinstance(v, str): - # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop - # URLs that are blatantly invalid early, so PyLD does not crash. - parsed_url = urllib.parse.urlparse(v) - if parsed_url.netloc: - graph.add((root, codemeta_key, rdflib.URIRef(v))) + add_url_if_valid(graph, root, codemeta_key, v) elif k in self.uri_fields and isinstance(v, list): for item in v: - if isinstance(item, str): - # ditto - parsed_url = urllib.parse.urlparse(item) - if parsed_url.netloc: - graph.add((root, codemeta_key, rdflib.URIRef(item))) + add_url_if_valid(graph, root, codemeta_key, item) else: continue diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py --- a/swh/indexer/metadata_dictionary/cff.py +++ b/swh/indexer/metadata_dictionary/cff.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from typing import List +import urllib.parse from rdflib import BNode, Graph, Literal, URIRef import rdflib.term @@ -30,7 +31,11 @@ def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node: node: rdflib.term.Node - if "orcid" in author and isinstance(author["orcid"], str): + if ( + "orcid" in author + and isinstance(author["orcid"], str) + and urllib.parse.urlparse(author["orcid"]).netloc + ): node = URIRef(author["orcid"]) else: node = BNode() diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/github.py @@ -11,7 +11,7 @@ from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA from .base import BaseExtrinsicMapping, JsonMapping, produce_terms -from .utils import prettyprint_graph # noqa +from .utils import add_url_if_valid, prettyprint_graph # noqa SPDX = URIRef("https://spdx.org/licenses/") @@ -45,12 +45,11 @@ graph.add((root, RDF.type, FORGEFED.Repository)) if content_dict.get("has_issues"): - graph.add( - ( - root, - CODEMETA.issueTracker, - URIRef(content_dict["html_url"] + "/issues"), - ) + add_url_if_valid( + graph, + root, + CODEMETA.issueTracker, + URIRef(content_dict["html_url"] + "/issues"), ) def get_root_uri(self, content_dict: dict) -> URIRef: diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py --- a/swh/indexer/metadata_dictionary/maven.py +++ b/swh/indexer/metadata_dictionary/maven.py @@ -6,13 +6,13 @@ import os from typing import Any, Dict -from rdflib import Graph, Literal, URIRef +from rdflib import Graph, Literal from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import SCHEMA from .base import SingleFileIntrinsicMapping, XmlMapping -from .utils import prettyprint_graph # noqa +from .utils import add_url_if_valid, prettyprint_graph # noqa class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): @@ -78,7 +78,7 @@ if "${" in repo: # Often use as templating in pom.xml files collected from VCSs return - graph.add((root, SCHEMA.codeRepository, URIRef(repo))) + add_url_if_valid(graph, root, SCHEMA.codeRepository, repo) def normalize_groupId(self, id_): """https://maven.apache.org/pom.html#Maven_Coordinates @@ -94,6 +94,7 @@ >>> import xmltodict >>> import json + >>> from rdflib import URIRef >>> d = xmltodict.parse(''' ... ... @@ -158,5 +159,5 @@ elif not isinstance(licenses, list): return for license in licenses: - if isinstance(license, dict) and isinstance(license.get("url"), str): - graph.add((root, SCHEMA.license, URIRef(license["url"]))) + if isinstance(license, dict): + add_url_if_valid(graph, root, SCHEMA.license, license.get("url")) diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -4,7 +4,6 @@ # See top-level LICENSE file for more information import re -import urllib.parse from rdflib import RDF, BNode, Graph, Literal, URIRef @@ -12,7 +11,7 @@ from swh.indexer.namespaces import SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping -from .utils import add_list, prettyprint_graph # noqa +from .utils import add_list, add_url_if_valid, prettyprint_graph # noqa SPDX = URIRef("https://spdx.org/licenses/") @@ -94,11 +93,7 @@ else: url = "" - parsed_url = urllib.parse.urlparse(url) - if parsed_url.netloc: - return URIRef(url) - else: - return None + return URIRef(url) _parse_author = re.compile( r"^ *" r"(?P.*?)" r"( +<(?P.*)>)?" r"( +\((?P.*)\))?" r" *$" @@ -191,12 +186,7 @@ graph.add((author, SCHEMA.name, Literal(name))) if email and isinstance(email, str): graph.add((author, SCHEMA.email, Literal(email))) - if url and isinstance(url, str): - # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop - # URLs that are blatantly invalid early, so PyLD does not crash. - parsed_url = urllib.parse.urlparse(url) - if parsed_url.netloc: - graph.add((author, SCHEMA.url, URIRef(url))) + add_url_if_valid(graph, author, SCHEMA.url, url) add_list(graph, root, SCHEMA.author, [author]) diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py --- a/swh/indexer/metadata_dictionary/nuget.py +++ b/swh/indexer/metadata_dictionary/nuget.py @@ -14,7 +14,7 @@ from swh.indexer.storage.interface import Sha1 from .base import BaseIntrinsicMapping, DirectoryLsEntry, XmlMapping -from .utils import add_list +from .utils import add_list, add_url_if_valid NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv") @@ -58,7 +58,7 @@ def translate_repository(self, graph, root, v): if isinstance(v, dict) and isinstance(v["@url"], str): codemeta_key = URIRef(self.mapping["repository.url"]) - graph.add((root, codemeta_key, URIRef(v["@url"]))) + add_url_if_valid(graph, root, codemeta_key, v["@url"]) def normalize_license(self, v): if isinstance(v, dict) and v["@type"] == "expression": diff --git a/swh/indexer/metadata_dictionary/utils.py b/swh/indexer/metadata_dictionary/utils.py --- a/swh/indexer/metadata_dictionary/utils.py +++ b/swh/indexer/metadata_dictionary/utils.py @@ -5,7 +5,8 @@ import json -from typing import Callable, Iterable, Optional, Sequence, TypeVar +from typing import Any, Callable, Iterable, Optional, Sequence, TypeVar +import urllib.parse from pyld import jsonld from rdflib import RDF, Graph, URIRef @@ -70,3 +71,42 @@ """Helper for :func:`add_list` that takes a mapper function ``f``.""" nodes = [f(graph, value) for value in values] add_list(graph, subject, predicate, [node for node in nodes if node]) + + +def add_url_if_valid( + graph: Graph, + subject: rdflib.term.Node, + predicate: rdflib.term.Identifier, + url: Any, +) -> None: + """Adds ``(subject, predicate, url)`` to the graph if ``url`` is well-formed. + + This is meant as a workaround for https://github.com/digitalbazaar/pyld/issues/91 + to drop URLs that are blatantly invalid early, so PyLD does not crash. + + >>> from pprint import pprint + >>> graph = Graph() + >>> subject = rdflib.term.URIRef("http://example.org/test-software") + >>> predicate = rdflib.term.URIRef("http://schema.org/license") + >>> add_url_if_valid( + ... graph, subject, predicate, "https//www.apache.org/licenses/LICENSE-2.0.txt" + ... ) + >>> add_url_if_valid( + ... graph, subject, predicate, "http:s//www.apache.org/licenses/LICENSE-2.0.txt" + ... ) + >>> add_url_if_valid( + ... graph, subject, predicate, "https://www.apache.org/licenses/LICENSE-2.0.txt" + ... ) + >>> add_url_if_valid( + ... graph, subject, predicate, 42 + ... ) + >>> pprint(set(graph.triples((subject, predicate, None)))) + {(rdflib.term.URIRef('http://example.org/test-software'), + rdflib.term.URIRef('http://schema.org/license'), + rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))} + """ + if not isinstance(url, str): + return + if " " in url or not urllib.parse.urlparse(url).netloc: + return + graph.add((subject, predicate, rdflib.term.URIRef(url))) diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py --- a/swh/indexer/tests/metadata_dictionary/test_npm.py +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -378,6 +378,17 @@ "version": "1.0.0", } + package_json = rb"""{ + "version": "1.0.0", + "repository": "git+https://g ithub.com/foo/bar.git" +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "version": "1.0.0", + } + def test_npm_invalid_licenses(): package_json = rb"""{