diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -250,6 +250,8 @@

         self.extra_translation(graph, root, content_dict)

+        self.sanitize(graph)
+
         # Convert from rdflib's internal graph representation to JSON
         s = graph.serialize(format="application/ld+json")

@@ -275,9 +277,29 @@

         return self.normalize_translation(translated_metadata)

+    def sanitize(self, graph: rdflib.Graph) -> None:
+        """Cleans up ``graph`` in place before it is serialized.
+
+        Triples whose object is the empty URI are removed, as they make PyLD crash.
+        Triples whose subject is the empty URI are logged, then :exc:`ValueError`
+        is raised, as they may lead to incorrect data.
+        """
+        empty_uri = rdflib.URIRef("")
+
+        # ``None`` is a wildcard in the pattern, so a single call drops every
+        # matching triple without mutating the graph while iterating over it.
+        graph.remove((None, None, empty_uri))
+
+        # Should not happen, but we'd better check as this may lead to incorrect data
+        invalid_triples = list(graph.triples((empty_uri, None, None)))
+        for triple in invalid_triples:
+            logging.error("Empty triple subject URI: %r", triple)
+        if invalid_triples:
+            raise ValueError("Empty triple subject(s)")
+
     def extra_translation(
         self, graph: rdflib.Graph, root: rdflib.term.Node, d: Dict[str, Any]
-    ):
+    ) -> None:
         """Called at the end of the translation process, and may add arbitrary triples
         to ``graph`` based on the input dictionary (passed as ``d``).
         """
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -294,6 +294,61 @@
     }


+def test_npm_empty_uris():
+    """Empty ``homepage``/author ``url`` values are dropped; valid URLs are kept."""
+    # Empty homepage only: the author's URL is kept
+    package_json = rb"""{
+  "version": "1.0.0",
+  "homepage": "",
+  "author": {
+    "name": "foo",
+    "url": "http://example.org"
+  }
+}"""
+    result = MAPPINGS["NpmMapping"]().translate(package_json)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}],
+        "version": "1.0.0",
+    }
+
+    # Empty author URL only: the homepage is kept as ``url``
+    package_json = rb"""{
+  "version": "1.0.0",
+  "homepage": "http://example.org",
+  "author": {
+    "name": "foo",
+    "url": ""
+  }
+}"""
+    result = MAPPINGS["NpmMapping"]().translate(package_json)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "author": [{"name": "foo", "type": "Person"}],
+        "url": "http://example.org",
+        "version": "1.0.0",
+    }
+
+    # Both empty: neither URL appears in the output
+    package_json = rb"""{
+  "version": "1.0.0",
+  "homepage": "",
+  "author": {
+    "name": "foo",
+    "url": ""
+  }
+}"""
+    result = MAPPINGS["NpmMapping"]().translate(package_json)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "author": [{"name": "foo", "type": "Person"}],
+        "version": "1.0.0",
+    }
+
+
 @settings(suppress_health_check=[HealthCheck.too_slow])
 @given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping)))  # type: ignore
 def test_npm_adversarial(doc):