diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py index f2eaa64..fd627b7 100644 --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -1,292 +1,298 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import urllib.parse from rdflib import RDF, BNode, Graph, Literal, URIRef from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping from .utils import add_list, prettyprint_graph # noqa SPDX = URIRef("https://spdx.org/licenses/") class NpmMapping(JsonMapping, SingleFileIntrinsicMapping): """ dedicated class for NPM (package.json) mapping and translation """ name = "npm" mapping = CROSSWALK_TABLE["NodeJS"] filename = b"package.json" string_fields = ["name", "version", "description", "email"] uri_fields = ["homepage"] _schema_shortcuts = { "github": "git+https://github.com/%s.git", "gist": "git+https://gist.github.com/%s.git", "gitlab": "git+https://gitlab.com/%s.git", # Bitbucket supports both hg and git, and the shortcut does not # tell which one to use. # 'bitbucket': 'https://bitbucket.org/', } def normalize_repository(self, d): """https://docs.npmjs.com/files/package.json#repository >>> NpmMapping().normalize_repository({ ... 'type': 'git', ... 'url': 'https://example.org/foo.git' ... }) rdflib.term.URIRef('git+https://example.org/foo.git') >>> NpmMapping().normalize_repository( ... 'gitlab:foo/bar') rdflib.term.URIRef('git+https://gitlab.com/foo/bar.git') >>> NpmMapping().normalize_repository( ... 'foo/bar') rdflib.term.URIRef('git+https://github.com/foo/bar.git') """ if ( isinstance(d, dict) and isinstance(d.get("type"), str) and isinstance(d.get("url"), str) ): url = "{type}+{url}".format(**d) elif isinstance(d, str): if "://" in d: url = d elif ":" in d: (schema, rest) = d.split(":", 1) if schema in self._schema_shortcuts: url = self._schema_shortcuts[schema] % rest else: return None else: url = self._schema_shortcuts["github"] % d else: return None return URIRef(url) def normalize_bugs(self, d): """https://docs.npmjs.com/files/package.json#bugs >>> NpmMapping().normalize_bugs({ ... 'url': 'https://example.org/bugs/', ... 'email': 'bugs@example.org' ... }) rdflib.term.URIRef('https://example.org/bugs/') >>> NpmMapping().normalize_bugs( ... 'https://example.org/bugs/') rdflib.term.URIRef('https://example.org/bugs/') """ if isinstance(d, dict) and isinstance(d.get("url"), str): - return URIRef(d["url"]) + url = d["url"] elif isinstance(d, str): - return URIRef(d) + url = d + else: + url = "" + + parsed_url = urllib.parse.urlparse(url) + if parsed_url.netloc: + return URIRef(url) else: return None _parse_author = re.compile( r"^ *" r"(?P.*?)" r"( +<(?P.*)>)?" r"( +\((?P.*)\))?" r" *$" ) def translate_author(self, graph: Graph, root, d): r"""https://docs.npmjs.com/files/package.json#people-fields-author-contributors' >>> from pprint import pprint >>> root = URIRef("http://example.org/test-software") >>> graph = Graph() >>> NpmMapping().translate_author(graph, root, { ... 'name': 'John Doe', ... 'email': 'john.doe@example.org', ... 'url': 'https://example.org/~john.doe', ... }) >>> prettyprint_graph(graph, root) { "@id": ..., "http://schema.org/author": { "@list": [ { "@type": "http://schema.org/Person", "http://schema.org/email": "john.doe@example.org", "http://schema.org/name": "John Doe", "http://schema.org/url": { "@id": "https://example.org/~john.doe" } } ] } } >>> graph = Graph() >>> NpmMapping().translate_author(graph, root, ... 'John Doe (https://example.org/~john.doe)' ... ) >>> prettyprint_graph(graph, root) { "@id": ..., "http://schema.org/author": { "@list": [ { "@type": "http://schema.org/Person", "http://schema.org/email": "john.doe@example.org", "http://schema.org/name": "John Doe", "http://schema.org/url": { "@id": "https://example.org/~john.doe" } } ] } } >>> graph = Graph() >>> NpmMapping().translate_author(graph, root, { ... 'name': 'John Doe', ... 'email': 'john.doe@example.org', ... 'url': 'https:\\\\example.invalid/~john.doe', ... }) >>> prettyprint_graph(graph, root) { "@id": ..., "http://schema.org/author": { "@list": [ { "@type": "http://schema.org/Person", "http://schema.org/email": "john.doe@example.org", "http://schema.org/name": "John Doe" } ] } } """ # noqa author = BNode() graph.add((author, RDF.type, SCHEMA.Person)) if isinstance(d, dict): name = d.get("name", None) email = d.get("email", None) url = d.get("url", None) elif isinstance(d, str): match = self._parse_author.match(d) if not match: return None name = match.group("name") email = match.group("email") url = match.group("url") else: return None if name and isinstance(name, str): graph.add((author, SCHEMA.name, Literal(name))) if email and isinstance(email, str): graph.add((author, SCHEMA.email, Literal(email))) if url and isinstance(url, str): # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop # URLs that are blatantly invalid early, so PyLD does not crash. parsed_url = urllib.parse.urlparse(url) if parsed_url.netloc: graph.add((author, SCHEMA.url, URIRef(url))) add_list(graph, root, SCHEMA.author, [author]) def normalize_description(self, description): r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common mistake that causes issues in the database because of null bytes in JSON. >>> NpmMapping().normalize_description("foo bar") rdflib.term.Literal('foo bar') >>> NpmMapping().normalize_description( ... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00" ... ) rdflib.term.Literal('foo bar') >>> NpmMapping().normalize_description( ... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 " ... ) rdflib.term.Literal('foo bar') >>> NpmMapping().normalize_description( ... # invalid UTF-16 and meaningless UTF-8: ... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00" ... ) is None True >>> NpmMapping().normalize_description( ... # ditto (ut looks like little-endian at first) ... "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00" ... ) is None True >>> NpmMapping().normalize_description(None) is None True """ if not isinstance(description, str): return None # XXX: if this function ever need to support more cases, consider # switching to https://pypi.org/project/ftfy/ instead of adding more hacks if description.startswith("\ufffd\ufffd") and "\x00" in description: # 2 unicode replacement characters followed by '# ' encoded as UTF-16 # is a common mistake, which indicates a README.md was saved as UTF-16, # and some NPM tool opened it as UTF-8 and used the first line as # description. description_bytes = description.encode() # Strip the the two unicode replacement characters assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd") description_bytes = description_bytes[6:] # If the following attempts fail to recover the description, discard it # entirely because the current indexer storage backend (postgresql) cannot # store zero bytes in JSON columns. description = None if not description_bytes.startswith(b"\x00"): # try UTF-16 little-endian (the most common) first try: description = description_bytes.decode("utf-16le") except UnicodeDecodeError: pass if description is None: # if it fails, try UTF-16 big-endian try: description = description_bytes.decode("utf-16be") except UnicodeDecodeError: pass if description: if description.startswith("# "): description = description[2:] return Literal(description.rstrip()) else: return None return Literal(description) def normalize_license(self, s): """https://docs.npmjs.com/files/package.json#license >>> NpmMapping().normalize_license('MIT') rdflib.term.URIRef('https://spdx.org/licenses/MIT') """ if isinstance(s, str): if s.startswith("SEE LICENSE IN "): # Very common pattern, because it is an example in the specification. # It is followed by the filename; and the indexer architecture currently # does not allow accessing that from metadata mappings. # (Plus, an hypothetical license mapping would eventually pick it up) return if " " in s: # Either an SPDX expression, or unusable data # TODO: handle it return return SPDX + s def normalize_keywords(self, lst): """https://docs.npmjs.com/files/package.json#homepage >>> NpmMapping().normalize_keywords(['foo', 'bar']) [rdflib.term.Literal('foo'), rdflib.term.Literal('bar')] """ if isinstance(lst, list): return [Literal(x) for x in lst if isinstance(x, str)] diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py index c57537b..804ac64 100644 --- a/swh/indexer/tests/metadata_dictionary/test_npm.py +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -1,434 +1,438 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from hypothesis import HealthCheck, given, settings import pytest from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.storage.model import ContentMetadataRow from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer from ..utils import ( BASE_TEST_CONFIG, MAPPING_DESCRIPTION_CONTENT_SHA1, json_document_strategy, ) def test_compute_metadata_none(): """ testing content empty content is empty should return None """ content = b"" # None if no metadata was found or an error occurred declared_metadata = None result = MAPPINGS["NpmMapping"]().translate(content) assert declared_metadata == result def test_compute_metadata_npm(): """ testing only computation of metadata with hard_mapping_npm """ content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "codeRepository": "git+https://github.com/moranegg/metadata_test", "author": [ { "type": "Person", "name": "Morane G", "email": "moranegg@example.com", } ], } result = MAPPINGS["NpmMapping"]().translate(content) assert declared_metadata == result def test_compute_metadata_invalid_description_npm(): """ testing only computation of metadata with hard_mapping_npm """ content = b""" { "name": "test_metadata", "version": "0.0.2", "description": 1234 } """ declared_metadata = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "test_metadata", "version": "0.0.2", } result = MAPPINGS["NpmMapping"]().translate(content) assert declared_metadata == result def test_index_content_metadata_npm(storage, obj_storage): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ sha1s = [ MAPPING_DESCRIPTION_CONTENT_SHA1["json:test-metadata-package.json"], MAPPING_DESCRIPTION_CONTENT_SHA1["json:npm-package.json"], MAPPING_DESCRIPTION_CONTENT_SHA1["python:code"], ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config["tools"] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) metadata_indexer.run(sha1s, log_suffix="unknown content") results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) expected_results = [ ContentMetadataRow( id=sha1s[0], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "codeRepository": "git+https://github.com/moranegg/metadata_test", "description": "Simple package.json test for indexer", "name": "test_metadata", "version": "0.0.1", }, ), ContentMetadataRow( id=sha1s[1], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "issueTracker": "https://github.com/npm/npm/issues", "author": [ { "type": "Person", "name": "Isaac Z. Schlueter", "email": "i@izs.me", "url": "http://blog.izs.me", } ], "codeRepository": "git+https://github.com/npm/npm", "description": "a package manager for JavaScript", "license": "https://spdx.org/licenses/Artistic-2.0", "version": "5.0.3", "name": "npm", "url": "https://docs.npmjs.com/", }, ), ] for result in results: del result.tool["id"] result.metadata.pop("keywords", None) # The assertion below returns False sometimes because of nested lists assert expected_results == results def test_npm_null_list_item_normalization(): package_json = b"""{ "name": "foo", "keywords": [ "foo", null ], "homepage": [ "http://example.org/", null ] }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", "url": "http://example.org/", "keywords": "foo", } def test_npm_bugs_normalization(): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", } # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", } # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", } def test_npm_repository_normalization(): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", } # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } assert result == expected_result # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == expected_result # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://gitlab.com/user/repo.git", "type": "SoftwareSourceCode", } def test_npm_author(): package_json = rb"""{ "version": "1.0.0", "author": "Foo Bar (@example)" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "Foo Bar", "type": "Person"}], "version": "1.0.0", } def test_npm_invalid_uris(): package_json = rb"""{ "version": "1.0.0", "homepage": "", "author": { "name": "foo", "url": "http://example.org" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], "version": "1.0.0", } package_json = rb"""{ "version": "1.0.0", "homepage": "http://example.org", "author": { "name": "foo", "url": "" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person"}], "url": "http://example.org", "version": "1.0.0", } package_json = rb"""{ "version": "1.0.0", "homepage": "", "author": { "name": "foo", "url": "" - } + }, + "bugs": "" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person"}], "version": "1.0.0", } package_json = rb"""{ "version": "1.0.0", "homepage": "http:example.org", "author": { "name": "foo", "url": "http:example.com" + }, + "bugs": { + "url": "http:example.com" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person"}], "version": "1.0.0", } def test_npm_invalid_licenses(): package_json = rb"""{ "version": "1.0.0", "license": "SEE LICENSE IN LICENSE.md", "author": { "name": "foo", "url": "http://example.org" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], "version": "1.0.0", } @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping))) # type: ignore def test_npm_adversarial(doc): raw = json.dumps(doc).encode() MAPPINGS["NpmMapping"]().translate(raw) @pytest.mark.parametrize( "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"] ) def test_detect_metadata_package_json(filename): df = [ { "sha1_git": b"abc", "name": b"index.js", "target": b"abc", "length": 897, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, { "sha1_git": b"aab", "name": filename, "target": b"aab", "length": 712, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"cde", }, ] results = detect_metadata(df) expected_results = {"NpmMapping": [b"cde"]} assert expected_results == results