diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
index a374a5e..8b3e48d 100644
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -1,159 +1,162 @@
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from typing import Any, Dict
from rdflib import Graph, Literal, URIRef
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA
from .base import SingleFileIntrinsicMapping, XmlMapping
from .utils import prettyprint_graph # noqa
class MavenMapping(XmlMapping, SingleFileIntrinsicMapping):
    """
    dedicated class for Maven (pom.xml) mapping and translation
    """

    name = "maven"
    filename = b"pom.xml"
    mapping = CROSSWALK_TABLE["Java (Maven)"]
    string_fields = ["name", "version", "description", "email"]

    # Repository used by parse_repositories() when the POM declares none.
    _default_repository = {"url": "https://repo.maven.apache.org/maven2/"}

    def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
        """Translate the ``project`` entry of the parsed document.

        A missing or empty ``project`` entry is translated as an empty dict.
        """
        return super()._translate_dict(d.get("project") or {})

    def extra_translation(self, graph: Graph, root, d):
        """Add the code repository triples derived from ``d`` to ``graph``."""
        self.parse_repositories(graph, root, d)

    def parse_repositories(self, graph: Graph, root, d):
        """https://maven.apache.org/pom.html#Repositories

        Adds one ``schema:codeRepository`` candidate per declared repository,
        or one for the default repository when none is declared.

        >>> import rdflib
        >>> import xmltodict
        >>> from pprint import pprint
        >>> d = xmltodict.parse('''
        ... <repositories>
        ...   <repository>
        ...     <id>codehausSnapshots</id>
        ...     <name>Codehaus Snapshots</name>
        ...     <url>http://snapshots.maven.codehaus.org/maven2</url>
        ...     <layout>default</layout>
        ...   </repository>
        ... </repositories>
        ... ''')
        >>> MavenMapping().parse_repositories(rdflib.Graph(), rdflib.BNode(), d)
        """
        repositories = d.get("repositories")
        if not repositories:
            self.parse_repository(graph, root, d, self._default_repository)
        elif isinstance(repositories, dict):
            repositories = repositories.get("repository") or []
            # A single <repository> child is parsed as a dict rather than a list.
            if not isinstance(repositories, list):
                repositories = [repositories]
            for repo in repositories:
                self.parse_repository(graph, root, d, repo)

    def parse_repository(self, graph: Graph, root, d, repo):
        """Add a ``schema:codeRepository`` triple for one repository entry.

        Nothing is added when ``repo`` is not a dict, when its layout is not
        ``default``, when the repository URL, ``groupId`` or ``artifactId`` is
        not a string, or when the resulting URL still contains a ``${``
        placeholder.
        """
        # URLs always use "/" as separator, whatever the host OS is, so use
        # posixpath explicitly instead of the platform-dependent os.path.
        import posixpath

        if not isinstance(repo, dict):
            return
        if repo.get("layout", "default") != "default":
            return  # TODO ?
        url = repo.get("url")
        group_id = d.get("groupId")
        artifact_id = d.get("artifactId")
        if (
            isinstance(url, str)
            and isinstance(group_id, str)
            and isinstance(artifact_id, str)
        ):
            repo_url = posixpath.join(url, *group_id.split("."), artifact_id)
            if "${" in repo_url:
                # Often used as templating in pom.xml files collected from VCSs
                return
            graph.add((root, SCHEMA.codeRepository, URIRef(repo_url)))

    def normalize_groupId(self, id_):
        """https://maven.apache.org/pom.html#Maven_Coordinates

        Returns None (implicitly) when ``id_`` is not a string.

        >>> MavenMapping().normalize_groupId('org.example')
        rdflib.term.Literal('org.example')
        """
        if isinstance(id_, str):
            return Literal(id_)

    def translate_licenses(self, graph, root, licenses):
        """https://maven.apache.org/pom.html#Licenses

        >>> import xmltodict
        >>> import json
        >>> d = xmltodict.parse('''
        ... <licenses>
        ...   <license>
        ...     <name>Apache License, Version 2.0</name>
        ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
        ...   </license>
        ... </licenses>
        ... ''')
        >>> print(json.dumps(d, indent=4))
        {
            "licenses": {
                "license": {
                    "name": "Apache License, Version 2.0",
                    "url": "https://www.apache.org/licenses/LICENSE-2.0.txt"
                }
            }
        }
        >>> graph = Graph()
        >>> root = URIRef("http://example.org/test-software")
        >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/license": {
                "@id": "https://www.apache.org/licenses/LICENSE-2.0.txt"
            }
        }

        or, if there are more than one license:

        >>> import xmltodict
        >>> from pprint import pprint
        >>> d = xmltodict.parse('''
        ... <licenses>
        ...   <license>
        ...     <name>Apache License, Version 2.0</name>
        ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
        ...   </license>
        ...   <license>
        ...     <name>MIT License</name>
        ...     <url>https://opensource.org/licenses/MIT</url>
        ...   </license>
        ... </licenses>
        ... ''')
        >>> graph = Graph()
        >>> root = URIRef("http://example.org/test-software")
        >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
        >>> pprint(set(graph.triples((root, URIRef("http://schema.org/license"), None))))
        {(rdflib.term.URIRef('http://example.org/test-software'),
          rdflib.term.URIRef('http://schema.org/license'),
          rdflib.term.URIRef('https://opensource.org/licenses/MIT')),
         (rdflib.term.URIRef('http://example.org/test-software'),
          rdflib.term.URIRef('http://schema.org/license'),
          rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}
        """
        if not isinstance(licenses, dict):
            return
        licenses = licenses.get("license")
        if isinstance(licenses, dict):
            # A single <license> child is parsed as a dict rather than a list.
            licenses = [licenses]
        elif not isinstance(licenses, list):
            return
        for entry in licenses:
            # Only entries carrying a string URL can be turned into a URIRef.
            if isinstance(entry, dict) and isinstance(entry.get("url"), str):
                graph.add((root, SCHEMA.license, URIRef(entry["url"])))
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
index 1540ef6..5486539 100644
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -1,282 +1,288 @@
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
import urllib.parse
from rdflib import RDF, BNode, Graph, Literal, URIRef
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA
from .base import JsonMapping, SingleFileIntrinsicMapping
from .utils import add_list, prettyprint_graph # noqa
SPDX = URIRef("https://spdx.org/licenses/")
class NpmMapping(JsonMapping, SingleFileIntrinsicMapping):
    """
    dedicated class for NPM (package.json) mapping and translation
    """

    name = "npm"
    mapping = CROSSWALK_TABLE["NodeJS"]
    filename = b"package.json"
    string_fields = ["name", "version", "description", "email"]
    uri_fields = ["homepage"]

    _schema_shortcuts = {
        "github": "git+https://github.com/%s.git",
        "gist": "git+https://gist.github.com/%s.git",
        "gitlab": "git+https://gitlab.com/%s.git",
        # Bitbucket supports both hg and git, and the shortcut does not
        # tell which one to use.
        # 'bitbucket': 'https://bitbucket.org/',
    }

    def normalize_repository(self, d):
        """https://docs.npmjs.com/files/package.json#repository

        Accepts either a ``{"type": ..., "url": ...}`` dict or a string (full
        URL, ``provider:path`` shortcut, or bare ``owner/repo`` which is taken
        as a GitHub shortcut). Returns None for anything else.

        >>> NpmMapping().normalize_repository({
        ...     'type': 'git',
        ...     'url': 'https://example.org/foo.git'
        ... })
        rdflib.term.URIRef('git+https://example.org/foo.git')
        >>> NpmMapping().normalize_repository(
        ...     'gitlab:foo/bar')
        rdflib.term.URIRef('git+https://gitlab.com/foo/bar.git')
        >>> NpmMapping().normalize_repository(
        ...     'foo/bar')
        rdflib.term.URIRef('git+https://github.com/foo/bar.git')
        """
        if (
            isinstance(d, dict)
            and isinstance(d.get("type"), str)
            and isinstance(d.get("url"), str)
        ):
            url = "{type}+{url}".format(**d)
        elif isinstance(d, str):
            if "://" in d:
                url = d
            elif ":" in d:
                (schema, rest) = d.split(":", 1)
                if schema in self._schema_shortcuts:
                    url = self._schema_shortcuts[schema] % rest
                else:
                    return None
            else:
                url = self._schema_shortcuts["github"] % d

        else:
            return None

        return URIRef(url)

    def normalize_bugs(self, d):
        """https://docs.npmjs.com/files/package.json#bugs

        >>> NpmMapping().normalize_bugs({
        ...     'url': 'https://example.org/bugs/',
        ...     'email': 'bugs@example.org'
        ... })
        rdflib.term.URIRef('https://example.org/bugs/')
        >>> NpmMapping().normalize_bugs(
        ...     'https://example.org/bugs/')
        rdflib.term.URIRef('https://example.org/bugs/')
        """
        if isinstance(d, dict) and isinstance(d.get("url"), str):
            return URIRef(d["url"])
        elif isinstance(d, str):
            return URIRef(d)
        else:
            return None

    # "Name <email> (url)", where the <email> and (url) parts are optional.
    # The named groups are the ones read back by translate_author().
    _parse_author = re.compile(
        r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$"
    )

    def translate_author(self, graph: Graph, root, d):
        r"""https://docs.npmjs.com/files/package.json#people-fields-author-contributors

        ``d`` is either a dict with optional ``name``/``email``/``url`` keys, or
        a single ``"Name <email> (url)"`` string. Any other input adds nothing
        to ``graph``.

        >>> from pprint import pprint
        >>> root = URIRef("http://example.org/test-software")
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root, {
        ...     'name': 'John Doe',
        ...     'email': 'john.doe@example.org',
        ...     'url': 'https://example.org/~john.doe',
        ... })
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe",
                        "http://schema.org/url": {
                            "@id": "https://example.org/~john.doe"
                        }
                    }
                ]
            }
        }
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root,
        ...     'John Doe <john.doe@example.org> (https://example.org/~john.doe)'
        ... )
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe",
                        "http://schema.org/url": {
                            "@id": "https://example.org/~john.doe"
                        }
                    }
                ]
            }
        }
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root, {
        ...     'name': 'John Doe',
        ...     'email': 'john.doe@example.org',
        ...     'url': 'https:\\\\example.invalid/~john.doe',
        ... })
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe"
                    }
                ]
            }
        }
        """  # noqa
        if isinstance(d, dict):
            name = d.get("name", None)
            email = d.get("email", None)
            url = d.get("url", None)
        elif isinstance(d, str):
            match = self._parse_author.match(d)
            if not match:
                return None
            name = match.group("name")
            email = match.group("email")
            url = match.group("url")
        else:
            return None

        # Create the Person node only once the input is known to be usable, so
        # that rejected inputs do not leave an orphan node in the graph.
        author = BNode()
        graph.add((author, RDF.type, SCHEMA.Person))

        if name and isinstance(name, str):
            graph.add((author, SCHEMA.name, Literal(name)))
        if email and isinstance(email, str):
            graph.add((author, SCHEMA.email, Literal(email)))
        if url and isinstance(url, str):
            # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
            # URLs that are blatantly invalid early, so PyLD does not crash.
            parsed_url = urllib.parse.urlparse(url)
            if parsed_url.netloc:
                graph.add((author, SCHEMA.url, URIRef(url)))

        add_list(graph, root, SCHEMA.author, [author])

    def normalize_description(self, description):
        r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common
        mistake that causes issues in the database because of null bytes in JSON.

        >>> NpmMapping().normalize_description("foo bar")
        rdflib.term.Literal('foo bar')
        >>> NpmMapping().normalize_description(
        ...     "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
        ... )
        rdflib.term.Literal('foo bar')
        >>> NpmMapping().normalize_description(
        ...     "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
        ... )
        rdflib.term.Literal('foo bar')
        >>> NpmMapping().normalize_description(
        ...     # invalid UTF-16 and meaningless UTF-8:
        ...     "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
        ... ) is None
        True
        >>> NpmMapping().normalize_description(
        ...     # ditto (it looks like little-endian at first)
        ...     "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00"
        ... ) is None
        True
        >>> NpmMapping().normalize_description(None) is None
        True
        """
        if not isinstance(description, str):
            return None
        # XXX: if this function ever need to support more cases, consider
        # switching to https://pypi.org/project/ftfy/ instead of adding more hacks
        if description.startswith("\ufffd\ufffd") and "\x00" in description:
            # 2 unicode replacement characters followed by '# ' encoded as UTF-16
            # is a common mistake, which indicates a README.md was saved as UTF-16,
            # and some NPM tool opened it as UTF-8 and used the first line as
            # description.

            description_bytes = description.encode()

            # Strip the two unicode replacement characters (3 bytes each in UTF-8)
            assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd")
            description_bytes = description_bytes[6:]

            # If the following attempts fail to recover the description, discard it
            # entirely because the current indexer storage backend (postgresql) cannot
            # store zero bytes in JSON columns.
            description = None

            if not description_bytes.startswith(b"\x00"):
                # try UTF-16 little-endian (the most common) first
                try:
                    description = description_bytes.decode("utf-16le")
                except UnicodeDecodeError:
                    pass
            if description is None:
                # if it fails, try UTF-16 big-endian
                try:
                    description = description_bytes.decode("utf-16be")
                except UnicodeDecodeError:
                    pass

            if description:
                if description.startswith("# "):
                    description = description[2:]
                return Literal(description.rstrip())
            else:
                return None
        return Literal(description)

    def normalize_license(self, s):
        """https://docs.npmjs.com/files/package.json#license

        Returns None when ``s`` is not a string or is a
        ``SEE LICENSE IN <filename>`` reference.

        >>> NpmMapping().normalize_license('MIT')
        rdflib.term.URIRef('https://spdx.org/licenses/MIT')
        """
        if isinstance(s, str):
            if s.startswith("SEE LICENSE IN "):
                # Very common pattern, because it is an example in the specification.
                # It is followed by the filename; and the indexer architecture currently
                # does not allow accessing that from metadata mappings.
                # (Plus, a hypothetical license mapping would eventually pick it up)
                return
            return SPDX + s

    def normalize_keywords(self, lst):
        """https://docs.npmjs.com/files/package.json#keywords

        Non-string items are dropped; returns None (implicitly) when ``lst`` is
        not a list.

        >>> NpmMapping().normalize_keywords(['foo', 'bar'])
        [rdflib.term.Literal('foo'), rdflib.term.Literal('bar')]
        """
        if isinstance(lst, list):
            return [Literal(x) for x in lst if isinstance(x, str)]
diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py
index 0267e95..afde286 100644
--- a/swh/indexer/tests/metadata_dictionary/test_maven.py
+++ b/swh/indexer/tests/metadata_dictionary/test_maven.py
@@ -1,365 +1,406 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from hypothesis import HealthCheck, given, settings
from swh.indexer.metadata_dictionary import MAPPINGS
from ..utils import xml_document_strategy
# Translates a POM carrying a name, coordinates, one repository and one
# license, and compares the result with the expected CodeMeta document.
# NOTE(review): the POM literal below contains no XML markup in this view;
# presumably the tags were stripped during extraction -- confirm against the
# original file before relying on this fixture.
def test_compute_metadata_maven():
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
# The expected codeRepository is the repository URL followed by the groupId
# segments and the artifactId.
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
"codeRepository": ("http://repo1.maven.org/maven2/com/mycompany/app/my-app"),
}
# Expects a document with only "@context" and "type" for this input.
# NOTE(review): the literal is blank in this view; presumably it held an empty
# project element whose tags were stripped -- confirm against the original.
def test_compute_metadata_maven_empty():
raw_content = b"""
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
}
# Expects a document with only "@context" and "type" for this input.
# NOTE(review): the literal is blank in this view and indistinguishable from
# the one in the "empty" test; presumably its markup was stripped -- confirm
# against the original.
def test_compute_metadata_maven_almost_empty():
raw_content = b"""
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
}
# For two inputs in turn, expects translate() to return None and to log exactly
# one "Error parsing XML from foo" warning ("foo" is the argument passed to
# the mapping constructor).
# NOTE(review): both literals are blank in this view; presumably their markup
# was stripped -- confirm against the original.
def test_compute_metadata_maven_invalid_xml(caplog):
expected_warning = (
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error parsing XML from foo",
)
# NOTE(review): caplog.at_level() is documented as a context manager; called
# bare like this it presumably has no effect -- consider caplog.set_level().
caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
"""
# Drop records from earlier steps so the assertion only sees this call.
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning], result
assert result is None
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning], result
assert result is None
# For two inputs in turn, expects translate() to return None and to log exactly
# one "Error detecting XML encoding from foo" warning.
# NOTE(review): both literals are blank in this view; presumably they held XML
# declarations with unknown encodings that were stripped -- confirm against
# the original.
def test_compute_metadata_maven_unknown_encoding(caplog):
expected_warning = (
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error detecting XML encoding from foo",
)
# NOTE(review): caplog.at_level() is documented as a context manager; called
# bare like this it presumably has no effect -- consider caplog.set_level().
caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
"""
# Drop records from earlier steps so the assertion only sees this call.
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning], result
assert result is None
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning], result
assert result is None
# Expects translate() to return None and to log one of two accepted warning
# lists; which one is produced depends on the installed libexpat version (see
# the inline comments on each alternative).
# NOTE(review): the literal is blank in this view; presumably its content was
# stripped -- confirm against the original.
def test_compute_metadata_maven_invalid_encoding(caplog):
expected_warning = [
# libexpat1 <= 2.2.10-2+deb11u1
[
(
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error unidecoding XML from foo",
)
],
# libexpat1 >= 2.2.10-2+deb11u2
[
(
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error parsing XML from foo",
)
],
]
# NOTE(review): caplog.at_level() is documented as a context manager; called
# bare like this it presumably has no effect -- consider caplog.set_level().
caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
# Membership test: either of the two alternative record lists is accepted.
assert caplog.record_tuples in expected_warning, result
assert result is None
# Translates a POM with only a name, coordinates and version; the expected
# codeRepository is built on https://repo.maven.apache.org/maven2/.
# NOTE(review): the POM literal below contains no XML markup in this view;
# presumably the tags were stripped -- confirm against the original.
def test_compute_metadata_maven_minimal():
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
# Translates five POM variants in turn and checks each expected document.
# Judging by the expected outputs, each variant drops or empties a different
# field (version, name, identifier/repository, ...).
# NOTE(review): the POM literals contain no XML markup in this view, so which
# node is empty in each variant cannot be read from here -- confirm against
# the original.
def test_compute_metadata_maven_empty_nodes():
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
# Variant without a version value: "version" is absent from the result.
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
# Variant without a name value: "name" is absent from the result.
raw_content = b"""
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
# Variant whose expected result is the same as the first one.
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
# Variant with only a version value: no identifier and no codeRepository.
raw_content = b"""
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"version": "1.2.3",
}
# Translates a POM whose licenses entry is the bare text "foo"; the expected
# document carries no "license" key.
# NOTE(review): the POM literal below contains no XML markup in this view;
# presumably the tags were stripped -- confirm against the original.
def test_compute_metadata_maven_invalid_licenses():
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
foo
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
# NOTE(review): the POM literal below contains no XML markup in this view;
# presumably the tags were stripped -- confirm against the original.
def test_compute_metadata_maven_multiple():
"""Tests when there are multiple code repos and licenses."""
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
example
Example Maven Repo
default
http://example.org/maven2
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
MIT license
https://opensource.org/licenses/MIT
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
# Multi-valued fields are compared as sets, so their order does not matter;
# pop() removes them so the final equality only covers single-valued fields.
assert set(result.pop("license")) == {
"https://www.apache.org/licenses/LICENSE-2.0.txt",
"https://opensource.org/licenses/MIT",
}, result
assert set(result.pop("codeRepository")) == {
"http://repo1.maven.org/maven2/com/mycompany/app/my-app",
"http://example.org/maven2/com/mycompany/app/my-app",
}, result
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
}
def test_compute_metadata_maven_invalid_repository():
    """A repository URL that is still a ``${...}`` placeholder is ignored.

    The expected document therefore has a license but no ``codeRepository``.
    """
    # NOTE(review): the element names below were reconstructed from the line
    # layout of the fixture and the standard POM structure -- confirm against
    # the original file.
    raw_content = b"""
    <project>
      <name>Maven Default Project</name>
      <modelVersion>4.0.0</modelVersion>
      <groupId>com.mycompany.app</groupId>
      <artifactId>my-app</artifactId>
      <version>1.2.3</version>
      <repositories>
        <repository>
          <id>tcc-transaction-internal-releases</id>
          <name>internal repository for released artifacts</name>
          <url>${repo.internal.releases.url}</url>
          <snapshots>
            <enabled>false</enabled>
          </snapshots>
          <releases>
            <enabled>true</enabled>
          </releases>
        </repository>
      </repositories>
      <licenses>
        <license>
          <name>Apache License, Version 2.0</name>
          <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
          <distribution>repo</distribution>
          <comments>A business-friendly OSS license</comments>
        </license>
      </licenses>
    </project>"""
    result = MAPPINGS["MavenMapping"]().translate(raw_content)
    assert result == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "Maven Default Project",
        "schema:identifier": "com.mycompany.app",
        "version": "1.2.3",
        "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
    }
+
+
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
    xml_document_strategy(
        keys=list(MAPPINGS["MavenMapping"].mapping),  # type: ignore
        root="project",
        xmlns="http://maven.apache.org/POM/4.0.0",
    )
)
def test_maven_adversarial(doc):
    """Feed generated XML documents to the mapping; it only has to not raise."""
    mapping = MAPPINGS["MavenMapping"]()
    mapping.translate(doc)
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
index 64f4ed2..cdaf6b7 100644
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -1,402 +1,420 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
from hypothesis import HealthCheck, given, settings
import pytest
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.storage.model import ContentMetadataRow
from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer
from ..utils import (
BASE_TEST_CONFIG,
MAPPING_DESCRIPTION_CONTENT_SHA1,
json_document_strategy,
)
def test_compute_metadata_none():
    """
    Translating an empty content should return None.
    """
    # None if no metadata was found or an error occurred
    expected = None
    translated = MAPPINGS["NpmMapping"]().translate(b"")
    assert expected == translated
def test_compute_metadata_npm():
    """
    Translate a small package.json (name, version, description, repository
    and author) and compare with the expected CodeMeta document.
    """
    package_json = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
    expected_author = {
        "type": "Person",
        "name": "Morane G",
        "email": "moranegg@example.com",
    }
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "test_metadata",
        "version": "0.0.2",
        "description": "Simple package.json test for indexer",
        "codeRepository": "git+https://github.com/moranegg/metadata_test",
        "author": [expected_author],
    }

    translated = MAPPINGS["NpmMapping"]().translate(package_json)
    assert expected == translated
def test_compute_metadata_invalid_description_npm():
    """
    A non-string description (here a number) is left out of the translation.
    """
    package_json = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": 1234
}
"""
    translated = MAPPINGS["NpmMapping"]().translate(package_json)

    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "test_metadata",
        "version": "0.0.2",
    }
    assert expected == translated
def test_index_content_metadata_npm(storage, obj_storage):
    """
    Run the content metadata indexer on NPM package.json contents.

    - one sha1 uses a file that can't be translated to metadata and
      should return None in the translated metadata
    """
    content_keys = (
        "json:test-metadata-package.json",
        "json:npm-package.json",
        "python:code",
    )
    sha1s = [MAPPING_DESCRIPTION_CONTENT_SHA1[key] for key in content_keys]

    # this metadata indexer computes only metadata for package.json
    # in npm context with a hard mapping
    indexer_config = BASE_TEST_CONFIG.copy()
    indexer_config["tools"] = [TRANSLATOR_TOOL]
    indexer = ContentMetadataTestIndexer(config=indexer_config)
    indexer.run(sha1s, log_suffix="unknown content")
    results = list(indexer.idx_storage.content_metadata_get(sha1s))

    test_package_metadata = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "codeRepository": "git+https://github.com/moranegg/metadata_test",
        "description": "Simple package.json test for indexer",
        "name": "test_metadata",
        "version": "0.0.1",
    }
    npm_package_metadata = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "issueTracker": "https://github.com/npm/npm/issues",
        "author": [
            {
                "type": "Person",
                "name": "Isaac Z. Schlueter",
                "email": "i@izs.me",
                "url": "http://blog.izs.me",
            }
        ],
        "codeRepository": "git+https://github.com/npm/npm",
        "description": "a package manager for JavaScript",
        "license": "https://spdx.org/licenses/Artistic-2.0",
        "version": "5.0.3",
        "name": "npm",
        "url": "https://docs.npmjs.com/",
    }
    expected_results = [
        ContentMetadataRow(
            id=sha1s[0],
            tool=TRANSLATOR_TOOL,
            metadata=test_package_metadata,
        ),
        ContentMetadataRow(
            id=sha1s[1],
            tool=TRANSLATOR_TOOL,
            metadata=npm_package_metadata,
        ),
    ]

    for row in results:
        del row.tool["id"]
        row.metadata.pop("keywords", None)

    # The assertion below returns False sometimes because of nested lists
    assert expected_results == results
def test_npm_null_list_item_normalization():
    """Null items inside list-valued fields are dropped from the translation."""
    raw = b"""{
"name": "foo",
"keywords": [
"foo",
null
],
"homepage": [
"http://example.org/",
null
]
}"""
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "name": "foo",
        "type": "SoftwareSourceCode",
        "url": "http://example.org/",
        "keywords": "foo",
    }
    translated = MAPPINGS["NpmMapping"]().translate(raw)
    assert translated == expected
def test_npm_bugs_normalization():
    """Check the "bugs" field given as a dict with URL, a dict without, or a string."""

    def translate(raw):
        # A fresh mapping instance is used for every document.
        return MAPPINGS["NpmMapping"]().translate(raw)

    # valid dictionary
    with_url = b"""{
"name": "foo",
"bugs": {
"url": "https://github.com/owner/project/issues",
"email": "foo@example.com"
}
}"""
    assert translate(with_url) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "name": "foo",
        "issueTracker": "https://github.com/owner/project/issues",
        "type": "SoftwareSourceCode",
    }

    # "invalid" dictionary
    without_url = b"""{
"name": "foo",
"bugs": {
"email": "foo@example.com"
}
}"""
    assert translate(without_url) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "name": "foo",
        "type": "SoftwareSourceCode",
    }

    # string
    as_string = b"""{
"name": "foo",
"bugs": "https://github.com/owner/project/issues"
}"""
    assert translate(as_string) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "name": "foo",
        "issueTracker": "https://github.com/owner/project/issues",
        "type": "SoftwareSourceCode",
    }
def test_npm_repository_normalization():
    """Check the "repository" field in dict form and in its shortcut forms."""

    def translate(raw):
        # A fresh mapping instance is used for every document.
        return MAPPINGS["NpmMapping"]().translate(raw)

    npm_cli_expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "name": "foo",
        "codeRepository": "git+https://github.com/npm/cli.git",
        "type": "SoftwareSourceCode",
    }

    # normal
    normal = b"""{
"name": "foo",
"repository": {
"type" : "git",
"url" : "https://github.com/npm/cli.git"
}
}"""
    assert translate(normal) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "name": "foo",
        "codeRepository": "git+https://github.com/npm/cli.git",
        "type": "SoftwareSourceCode",
    }

    # missing url
    missing_url = b"""{
"name": "foo",
"repository": {
"type" : "git"
}
}"""
    assert translate(missing_url) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "name": "foo",
        "type": "SoftwareSourceCode",
    }

    # github shortcut
    github_shortcut = b"""{
"name": "foo",
"repository": "github:npm/cli"
}"""
    assert translate(github_shortcut) == npm_cli_expected

    # github shortshortcut
    github_shortshortcut = b"""{
"name": "foo",
"repository": "npm/cli"
}"""
    assert translate(github_shortshortcut) == npm_cli_expected

    # gitlab shortcut
    gitlab_shortcut = b"""{
"name": "foo",
"repository": "gitlab:user/repo"
}"""
    assert translate(gitlab_shortcut) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "name": "foo",
        "codeRepository": "git+https://gitlab.com/user/repo.git",
        "type": "SoftwareSourceCode",
    }
def test_npm_invalid_uris():
    """Empty or host-less URLs in "homepage" and "author.url" are left out."""

    def translate(raw):
        # A fresh mapping instance is used for every document.
        return MAPPINGS["NpmMapping"]().translate(raw)

    # empty homepage, valid author URL
    empty_homepage = rb"""{
"version": "1.0.0",
"homepage": "",
"author": {
"name": "foo",
"url": "http://example.org"
}
}"""
    assert translate(empty_homepage) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}],
        "version": "1.0.0",
    }

    # valid homepage, empty author URL
    empty_author_url = rb"""{
"version": "1.0.0",
"homepage": "http://example.org",
"author": {
"name": "foo",
"url": ""
}
}"""
    assert translate(empty_author_url) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [{"name": "foo", "type": "Person"}],
        "url": "http://example.org",
        "version": "1.0.0",
    }

    # both empty
    both_empty = rb"""{
"version": "1.0.0",
"homepage": "",
"author": {
"name": "foo",
"url": ""
}
}"""
    assert translate(both_empty) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [{"name": "foo", "type": "Person"}],
        "version": "1.0.0",
    }

    # both without a network location
    both_hostless = rb"""{
"version": "1.0.0",
"homepage": "http:example.org",
"author": {
"name": "foo",
"url": "http:example.com"
}
}"""
    assert translate(both_hostless) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [{"name": "foo", "type": "Person"}],
        "version": "1.0.0",
    }
def test_npm_invalid_licenses():
    """A "SEE LICENSE IN <file>" license value yields no "license" key."""
    raw = rb"""{
"version": "1.0.0",
"license": "SEE LICENSE IN LICENSE.md",
"author": {
"name": "foo",
"url": "http://example.org"
}
}"""
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}],
        "version": "1.0.0",
    }
    translated = MAPPINGS["NpmMapping"]().translate(raw)
    assert translated == expected
+
+
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping)))  # type: ignore
def test_npm_adversarial(doc):
    """Feed generated JSON documents to the mapping; it only has to not raise."""
    encoded = json.dumps(doc).encode()
    mapping = MAPPINGS["NpmMapping"]()
    mapping.translate(encoded)
@pytest.mark.parametrize(
    "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
)
def test_detect_metadata_package_json(filename):
    """Every listed case variant of ``package.json`` is detected as NPM metadata."""
    unrelated_entry = {
        "sha1_git": b"abc",
        "name": b"index.js",
        "target": b"abc",
        "length": 897,
        "status": "visible",
        "type": "file",
        "perms": 33188,
        "dir_id": b"dir_a",
        "sha1": b"bcd",
    }
    package_entry = {
        "sha1_git": b"aab",
        "name": filename,
        "target": b"aab",
        "length": 712,
        "status": "visible",
        "type": "file",
        "perms": 33188,
        "dir_id": b"dir_a",
        "sha1": b"cde",
    }

    detected = detect_metadata([unrelated_entry, package_entry])

    # Only the sha1 of the package.json-like entry is reported.
    assert {"NpmMapping": [b"cde"]} == detected