diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -6,6 +6,7 @@ import json import logging from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar +import urllib.parse import uuid import xml.parsers.expat @@ -240,11 +241,18 @@ for item in v: graph.add((root, codemeta_key, rdflib.Literal(item))) elif k in self.uri_fields and isinstance(v, str): - graph.add((root, codemeta_key, rdflib.URIRef(v))) + # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop + # URLs that are blatantly invalid early, so PyLD does not crash. + parsed_url = urllib.parse.urlparse(v) + if parsed_url.netloc: + graph.add((root, codemeta_key, rdflib.URIRef(v))) elif k in self.uri_fields and isinstance(v, list): for item in v: if isinstance(item, str): - graph.add((root, codemeta_key, rdflib.URIRef(item))) + # ditto + parsed_url = urllib.parse.urlparse(item) + if parsed_url.netloc: + graph.add((root, codemeta_key, rdflib.URIRef(item))) else: continue diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py --- a/swh/indexer/tests/metadata_dictionary/test_npm.py +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -294,7 +294,7 @@ } -def test_npm_empty_uris(): +def test_npm_invalid_uris(): package_json = rb"""{ "version": "1.0.0", "homepage": "", @@ -344,6 +344,22 @@ "version": "1.0.0", } + package_json = rb"""{ + "version": "1.0.0", + "homepage": "http:example.org", + "author": { + "name": "foo", + "url": "http:example.com" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "version": "1.0.0", + } + @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping))) # type: ignore