Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7122913
D8772.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Subscribers
None
D8772.diff
View Options
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -6,7 +6,6 @@
import json
import logging
from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
-import urllib.parse
import uuid
import xml.parsers.expat
@@ -20,6 +19,8 @@
from swh.indexer.namespaces import RDF, SCHEMA
from swh.indexer.storage.interface import Sha1
+from .utils import add_url_if_valid
+
TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/"
"""Prefix used to generate temporary URIs for root nodes being translated."""
@@ -285,9 +286,15 @@
pass
elif isinstance(v, list):
for item in reversed(v):
- graph.add((root, codemeta_key, item))
+ if isinstance(item, rdflib.URIRef):
+ add_url_if_valid(graph, root, codemeta_key, str(item))
+ else:
+ graph.add((root, codemeta_key, item))
else:
- graph.add((root, codemeta_key, v))
+ if isinstance(v, rdflib.URIRef):
+ add_url_if_valid(graph, root, codemeta_key, str(v))
+ else:
+ graph.add((root, codemeta_key, v))
elif k in self.string_fields and isinstance(v, str):
graph.add((root, codemeta_key, rdflib.Literal(v)))
elif k in self.string_fields and isinstance(v, list):
@@ -302,18 +309,10 @@
typed_item = rdflib.Literal(item, datatype=SCHEMA.Date)
graph.add((root, codemeta_key, typed_item))
elif k in self.uri_fields and isinstance(v, str):
- # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
- # URLs that are blatantly invalid early, so PyLD does not crash.
- parsed_url = urllib.parse.urlparse(v)
- if parsed_url.netloc:
- graph.add((root, codemeta_key, rdflib.URIRef(v)))
+ add_url_if_valid(graph, root, codemeta_key, v)
elif k in self.uri_fields and isinstance(v, list):
for item in v:
- if isinstance(item, str):
- # ditto
- parsed_url = urllib.parse.urlparse(item)
- if parsed_url.netloc:
- graph.add((root, codemeta_key, rdflib.URIRef(item)))
+ add_url_if_valid(graph, root, codemeta_key, item)
else:
continue
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
from typing import List
+import urllib.parse
from rdflib import BNode, Graph, Literal, URIRef
import rdflib.term
@@ -30,7 +31,11 @@
def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node:
node: rdflib.term.Node
- if "orcid" in author and isinstance(author["orcid"], str):
+ if (
+ "orcid" in author
+ and isinstance(author["orcid"], str)
+ and urllib.parse.urlparse(author["orcid"]).netloc
+ ):
node = URIRef(author["orcid"])
else:
node = BNode()
diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
--- a/swh/indexer/metadata_dictionary/github.py
+++ b/swh/indexer/metadata_dictionary/github.py
@@ -11,7 +11,7 @@
from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA
from .base import BaseExtrinsicMapping, JsonMapping, produce_terms
-from .utils import prettyprint_graph # noqa
+from .utils import add_url_if_valid, prettyprint_graph # noqa
SPDX = URIRef("https://spdx.org/licenses/")
@@ -45,12 +45,11 @@
graph.add((root, RDF.type, FORGEFED.Repository))
if content_dict.get("has_issues"):
- graph.add(
- (
- root,
- CODEMETA.issueTracker,
- URIRef(content_dict["html_url"] + "/issues"),
- )
+ add_url_if_valid(
+ graph,
+ root,
+ CODEMETA.issueTracker,
+ URIRef(content_dict["html_url"] + "/issues"),
)
def get_root_uri(self, content_dict: dict) -> URIRef:
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -6,13 +6,13 @@
import os
from typing import Any, Dict
-from rdflib import Graph, Literal, URIRef
+from rdflib import Graph, Literal
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA
from .base import SingleFileIntrinsicMapping, XmlMapping
-from .utils import prettyprint_graph # noqa
+from .utils import add_url_if_valid, prettyprint_graph # noqa
class MavenMapping(XmlMapping, SingleFileIntrinsicMapping):
@@ -78,7 +78,7 @@
if "${" in repo:
# Often use as templating in pom.xml files collected from VCSs
return
- graph.add((root, SCHEMA.codeRepository, URIRef(repo)))
+ add_url_if_valid(graph, root, SCHEMA.codeRepository, repo)
def normalize_groupId(self, id_):
"""https://maven.apache.org/pom.html#Maven_Coordinates
@@ -94,6 +94,7 @@
>>> import xmltodict
>>> import json
+ >>> from rdflib import URIRef
>>> d = xmltodict.parse('''
... <licenses>
... <license>
@@ -158,5 +159,5 @@
elif not isinstance(licenses, list):
return
for license in licenses:
- if isinstance(license, dict) and isinstance(license.get("url"), str):
- graph.add((root, SCHEMA.license, URIRef(license["url"])))
+ if isinstance(license, dict):
+ add_url_if_valid(graph, root, SCHEMA.license, license.get("url"))
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import re
-import urllib.parse
from rdflib import RDF, BNode, Graph, Literal, URIRef
@@ -12,7 +11,7 @@
from swh.indexer.namespaces import SCHEMA
from .base import JsonMapping, SingleFileIntrinsicMapping
-from .utils import add_list, prettyprint_graph # noqa
+from .utils import add_list, add_url_if_valid, prettyprint_graph # noqa
SPDX = URIRef("https://spdx.org/licenses/")
@@ -94,11 +93,7 @@
else:
url = ""
- parsed_url = urllib.parse.urlparse(url)
- if parsed_url.netloc:
- return URIRef(url)
- else:
- return None
+ return URIRef(url)
_parse_author = re.compile(
r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$"
@@ -191,12 +186,7 @@
graph.add((author, SCHEMA.name, Literal(name)))
if email and isinstance(email, str):
graph.add((author, SCHEMA.email, Literal(email)))
- if url and isinstance(url, str):
- # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
- # URLs that are blatantly invalid early, so PyLD does not crash.
- parsed_url = urllib.parse.urlparse(url)
- if parsed_url.netloc:
- graph.add((author, SCHEMA.url, URIRef(url)))
+ add_url_if_valid(graph, author, SCHEMA.url, url)
add_list(graph, root, SCHEMA.author, [author])
diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
--- a/swh/indexer/metadata_dictionary/nuget.py
+++ b/swh/indexer/metadata_dictionary/nuget.py
@@ -14,7 +14,7 @@
from swh.indexer.storage.interface import Sha1
from .base import BaseIntrinsicMapping, DirectoryLsEntry, XmlMapping
-from .utils import add_list
+from .utils import add_list, add_url_if_valid
NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")
@@ -58,7 +58,7 @@
def translate_repository(self, graph, root, v):
if isinstance(v, dict) and isinstance(v["@url"], str):
codemeta_key = URIRef(self.mapping["repository.url"])
- graph.add((root, codemeta_key, URIRef(v["@url"])))
+ add_url_if_valid(graph, root, codemeta_key, v["@url"])
def normalize_license(self, v):
if isinstance(v, dict) and v["@type"] == "expression":
diff --git a/swh/indexer/metadata_dictionary/utils.py b/swh/indexer/metadata_dictionary/utils.py
--- a/swh/indexer/metadata_dictionary/utils.py
+++ b/swh/indexer/metadata_dictionary/utils.py
@@ -5,7 +5,8 @@
import json
-from typing import Callable, Iterable, Optional, Sequence, TypeVar
+from typing import Any, Callable, Iterable, Optional, Sequence, TypeVar
+import urllib.parse
from pyld import jsonld
from rdflib import RDF, Graph, URIRef
@@ -70,3 +71,42 @@
"""Helper for :func:`add_list` that takes a mapper function ``f``."""
nodes = [f(graph, value) for value in values]
add_list(graph, subject, predicate, [node for node in nodes if node])
+
+
+def add_url_if_valid(
+ graph: Graph,
+ subject: rdflib.term.Node,
+ predicate: rdflib.term.Identifier,
+ url: Any,
+) -> None:
+ """Adds ``(subject, predicate, url)`` to the graph if ``url`` is well-formed.
+
+ This is meant as a workaround for https://github.com/digitalbazaar/pyld/issues/91
+ to drop URLs that are blatantly invalid early, so PyLD does not crash.
+
+ >>> from pprint import pprint
+ >>> graph = Graph()
+ >>> subject = rdflib.term.URIRef("http://example.org/test-software")
+ >>> predicate = rdflib.term.URIRef("http://schema.org/license")
+ >>> add_url_if_valid(
+ ... graph, subject, predicate, "https//www.apache.org/licenses/LICENSE-2.0.txt"
+ ... )
+ >>> add_url_if_valid(
+ ... graph, subject, predicate, "http:s//www.apache.org/licenses/LICENSE-2.0.txt"
+ ... )
+ >>> add_url_if_valid(
+ ... graph, subject, predicate, "https://www.apache.org/licenses/LICENSE-2.0.txt"
+ ... )
+ >>> add_url_if_valid(
+ ... graph, subject, predicate, 42
+ ... )
+ >>> pprint(set(graph.triples((subject, predicate, None))))
+ {(rdflib.term.URIRef('http://example.org/test-software'),
+ rdflib.term.URIRef('http://schema.org/license'),
+ rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}
+ """
+ if not isinstance(url, str):
+ return
+ if " " in url or not urllib.parse.urlparse(url).netloc:
+ return
+ graph.add((subject, predicate, rdflib.term.URIRef(url)))
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -378,6 +378,17 @@
"version": "1.0.0",
}
+ package_json = rb"""{
+ "version": "1.0.0",
+ "repository": "git+https://g ithub.com/foo/bar.git"
+}"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "version": "1.0.0",
+ }
+
def test_npm_invalid_licenses():
package_json = rb"""{
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 11:19 AM (3 d, 6 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3229679
Attached To
D8772: metadata_dictionary: Systematically check input URLs before adding to graph
Event Timeline
Log In to Comment