Page MenuHomeSoftware Heritage

D8772.diff
No OneTemporary

D8772.diff

diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -6,7 +6,6 @@
import json
import logging
from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
-import urllib.parse
import uuid
import xml.parsers.expat
@@ -20,6 +19,8 @@
from swh.indexer.namespaces import RDF, SCHEMA
from swh.indexer.storage.interface import Sha1
+from .utils import add_url_if_valid
+
TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/"
"""Prefix used to generate temporary URIs for root nodes being translated."""
@@ -285,9 +286,15 @@
pass
elif isinstance(v, list):
for item in reversed(v):
- graph.add((root, codemeta_key, item))
+ if isinstance(item, rdflib.URIRef):
+ add_url_if_valid(graph, root, codemeta_key, str(item))
+ else:
+ graph.add((root, codemeta_key, item))
else:
- graph.add((root, codemeta_key, v))
+ if isinstance(v, rdflib.URIRef):
+ add_url_if_valid(graph, root, codemeta_key, str(v))
+ else:
+ graph.add((root, codemeta_key, v))
elif k in self.string_fields and isinstance(v, str):
graph.add((root, codemeta_key, rdflib.Literal(v)))
elif k in self.string_fields and isinstance(v, list):
@@ -302,18 +309,10 @@
typed_item = rdflib.Literal(item, datatype=SCHEMA.Date)
graph.add((root, codemeta_key, typed_item))
elif k in self.uri_fields and isinstance(v, str):
- # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
- # URLs that are blatantly invalid early, so PyLD does not crash.
- parsed_url = urllib.parse.urlparse(v)
- if parsed_url.netloc:
- graph.add((root, codemeta_key, rdflib.URIRef(v)))
+ add_url_if_valid(graph, root, codemeta_key, v)
elif k in self.uri_fields and isinstance(v, list):
for item in v:
- if isinstance(item, str):
- # ditto
- parsed_url = urllib.parse.urlparse(item)
- if parsed_url.netloc:
- graph.add((root, codemeta_key, rdflib.URIRef(item)))
+ add_url_if_valid(graph, root, codemeta_key, item)
else:
continue
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
from typing import List
+import urllib.parse
from rdflib import BNode, Graph, Literal, URIRef
import rdflib.term
@@ -30,7 +31,11 @@
def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node:
node: rdflib.term.Node
- if "orcid" in author and isinstance(author["orcid"], str):
+ if (
+ "orcid" in author
+ and isinstance(author["orcid"], str)
+ and urllib.parse.urlparse(author["orcid"]).netloc
+ ):
node = URIRef(author["orcid"])
else:
node = BNode()
diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
--- a/swh/indexer/metadata_dictionary/github.py
+++ b/swh/indexer/metadata_dictionary/github.py
@@ -11,7 +11,7 @@
from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA
from .base import BaseExtrinsicMapping, JsonMapping, produce_terms
-from .utils import prettyprint_graph # noqa
+from .utils import add_url_if_valid, prettyprint_graph # noqa
SPDX = URIRef("https://spdx.org/licenses/")
@@ -45,12 +45,11 @@
graph.add((root, RDF.type, FORGEFED.Repository))
if content_dict.get("has_issues"):
- graph.add(
- (
- root,
- CODEMETA.issueTracker,
- URIRef(content_dict["html_url"] + "/issues"),
- )
+ add_url_if_valid(
+ graph,
+ root,
+ CODEMETA.issueTracker,
+ URIRef(content_dict["html_url"] + "/issues"),
)
def get_root_uri(self, content_dict: dict) -> URIRef:
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -6,13 +6,13 @@
import os
from typing import Any, Dict
-from rdflib import Graph, Literal, URIRef
+from rdflib import Graph, Literal
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA
from .base import SingleFileIntrinsicMapping, XmlMapping
-from .utils import prettyprint_graph # noqa
+from .utils import add_url_if_valid, prettyprint_graph # noqa
class MavenMapping(XmlMapping, SingleFileIntrinsicMapping):
@@ -78,7 +78,7 @@
if "${" in repo:
# Often use as templating in pom.xml files collected from VCSs
return
- graph.add((root, SCHEMA.codeRepository, URIRef(repo)))
+ add_url_if_valid(graph, root, SCHEMA.codeRepository, repo)
def normalize_groupId(self, id_):
"""https://maven.apache.org/pom.html#Maven_Coordinates
@@ -94,6 +94,7 @@
>>> import xmltodict
>>> import json
+ >>> from rdflib import URIRef
>>> d = xmltodict.parse('''
... <licenses>
... <license>
@@ -158,5 +159,5 @@
elif not isinstance(licenses, list):
return
for license in licenses:
- if isinstance(license, dict) and isinstance(license.get("url"), str):
- graph.add((root, SCHEMA.license, URIRef(license["url"])))
+ if isinstance(license, dict):
+ add_url_if_valid(graph, root, SCHEMA.license, license.get("url"))
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import re
-import urllib.parse
from rdflib import RDF, BNode, Graph, Literal, URIRef
@@ -12,7 +11,7 @@
from swh.indexer.namespaces import SCHEMA
from .base import JsonMapping, SingleFileIntrinsicMapping
-from .utils import add_list, prettyprint_graph # noqa
+from .utils import add_list, add_url_if_valid, prettyprint_graph # noqa
SPDX = URIRef("https://spdx.org/licenses/")
@@ -94,11 +93,7 @@
else:
url = ""
- parsed_url = urllib.parse.urlparse(url)
- if parsed_url.netloc:
- return URIRef(url)
- else:
- return None
+ return URIRef(url)
_parse_author = re.compile(
r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$"
@@ -191,12 +186,7 @@
graph.add((author, SCHEMA.name, Literal(name)))
if email and isinstance(email, str):
graph.add((author, SCHEMA.email, Literal(email)))
- if url and isinstance(url, str):
- # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
- # URLs that are blatantly invalid early, so PyLD does not crash.
- parsed_url = urllib.parse.urlparse(url)
- if parsed_url.netloc:
- graph.add((author, SCHEMA.url, URIRef(url)))
+ add_url_if_valid(graph, author, SCHEMA.url, url)
add_list(graph, root, SCHEMA.author, [author])
diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
--- a/swh/indexer/metadata_dictionary/nuget.py
+++ b/swh/indexer/metadata_dictionary/nuget.py
@@ -14,7 +14,7 @@
from swh.indexer.storage.interface import Sha1
from .base import BaseIntrinsicMapping, DirectoryLsEntry, XmlMapping
-from .utils import add_list
+from .utils import add_list, add_url_if_valid
NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")
@@ -58,7 +58,7 @@
def translate_repository(self, graph, root, v):
if isinstance(v, dict) and isinstance(v["@url"], str):
codemeta_key = URIRef(self.mapping["repository.url"])
- graph.add((root, codemeta_key, URIRef(v["@url"])))
+ add_url_if_valid(graph, root, codemeta_key, v["@url"])
def normalize_license(self, v):
if isinstance(v, dict) and v["@type"] == "expression":
diff --git a/swh/indexer/metadata_dictionary/utils.py b/swh/indexer/metadata_dictionary/utils.py
--- a/swh/indexer/metadata_dictionary/utils.py
+++ b/swh/indexer/metadata_dictionary/utils.py
@@ -5,7 +5,8 @@
import json
-from typing import Callable, Iterable, Optional, Sequence, TypeVar
+from typing import Any, Callable, Iterable, Optional, Sequence, TypeVar
+import urllib.parse
from pyld import jsonld
from rdflib import RDF, Graph, URIRef
@@ -70,3 +71,42 @@
"""Helper for :func:`add_list` that takes a mapper function ``f``."""
nodes = [f(graph, value) for value in values]
add_list(graph, subject, predicate, [node for node in nodes if node])
+
+
+def add_url_if_valid(
+ graph: Graph,
+ subject: rdflib.term.Node,
+ predicate: rdflib.term.Identifier,
+ url: Any,
+) -> None:
+ """Adds ``(subject, predicate, url)`` to the graph if ``url`` is well-formed.
+
+ Workaround for https://github.com/digitalbazaar/pyld/issues/91: non-string values
+ and URLs with a space or no netloc are dropped early, so PyLD does not crash.
+
+ >>> from pprint import pprint
+ >>> graph = Graph()
+ >>> subject = rdflib.term.URIRef("http://example.org/test-software")
+ >>> predicate = rdflib.term.URIRef("http://schema.org/license")
+ >>> add_url_if_valid(
+ ... graph, subject, predicate, "https//www.apache.org/licenses/LICENSE-2.0.txt"
+ ... )
+ >>> add_url_if_valid(
+ ... graph, subject, predicate, "http:s//www.apache.org/licenses/LICENSE-2.0.txt"
+ ... )
+ >>> add_url_if_valid(
+ ... graph, subject, predicate, "https://www.apache.org/licenses/LICENSE-2.0.txt"
+ ... )
+ >>> add_url_if_valid(
+ ... graph, subject, predicate, 42
+ ... )
+ >>> pprint(set(graph.triples((subject, predicate, None))))
+ {(rdflib.term.URIRef('http://example.org/test-software'),
+ rdflib.term.URIRef('http://schema.org/license'),
+ rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}
+ """
+ if not isinstance(url, str):
+ return
+ if " " in url or not urllib.parse.urlparse(url).netloc:
+ return
+ graph.add((subject, predicate, rdflib.term.URIRef(url)))
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -378,6 +378,17 @@
"version": "1.0.0",
}
+ package_json = rb"""{
+ "version": "1.0.0",
+ "repository": "git+https://g ithub.com/foo/bar.git"
+}"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "version": "1.0.0",
+ }
+
def test_npm_invalid_licenses():
package_json = rb"""{

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 11:19 AM (3 d, 6 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3229679

Event Timeline