diff --git a/swh/indexer/tests/metadata_dictionary/__init__.py b/swh/indexer/tests/metadata_dictionary/__init__.py
new file mode 100644
diff --git a/swh/indexer/tests/metadata_dictionary/test_cff.py b/swh/indexer/tests/metadata_dictionary/test_cff.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/metadata_dictionary/test_cff.py
@@ -0,0 +1,220 @@
+# Copyright (C) 2017-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.indexer.metadata_dictionary import MAPPINGS
+
+
+def test_compute_metadata_cff():
+ """
+ Translate a complete CITATION.cff document into the expected CodeMeta dict.
+ """
+ content = """# YAML 1.2
+---
+abstract: "Command line program to convert from Citation File \
+Format to various other formats such as BibTeX, EndNote, RIS, \
+schema.org, CodeMeta, and .zenodo.json."
+authors:
+ -
+ affiliation: "Netherlands eScience Center"
+ family-names: Klaver
+ given-names: Tom
+ -
+ affiliation: "Humboldt-Universität zu Berlin"
+ family-names: Druskat
+ given-names: Stephan
+ orcid: https://orcid.org/0000-0003-4925-7248
+cff-version: "1.0.3"
+date-released: 2019-11-12
+doi: 10.5281/zenodo.1162057
+keywords:
+ - "citation"
+ - "bibliography"
+ - "cff"
+ - "CITATION.cff"
+license: Apache-2.0
+message: "If you use this software, please cite it using these metadata."
+license: Apache-2.0
+message: "If you use this software, please cite it using these metadata."
+repository-code: "https://github.com/citation-file-format/cff-converter-python"
+title: cffconvert
+version: "1.4.0-alpha0"
+ """.encode(  # NOTE(review): the fixture repeats its license/message keys
+ "utf-8"
+ )
+
+ expected = {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [
+ {
+ "type": "Person",
+ "affiliation": {
+ "type": "Organization",
+ "name": "Netherlands eScience Center",
+ },
+ "familyName": "Klaver",
+ "givenName": "Tom",
+ },
+ {
+ "id": "https://orcid.org/0000-0003-4925-7248",
+ "type": "Person",
+ "affiliation": {
+ "type": "Organization",
+ "name": "Humboldt-Universität zu Berlin",
+ },
+ "familyName": "Druskat",
+ "givenName": "Stephan",
+ },
+ ],
+ "codeRepository": (
+ "https://github.com/citation-file-format/cff-converter-python"
+ ),
+ "datePublished": "2019-11-12",
+ "description": """Command line program to convert from \
+Citation File Format to various other formats such as BibTeX, EndNote, \
+RIS, schema.org, CodeMeta, and .zenodo.json.""",
+ "identifier": "https://doi.org/10.5281/zenodo.1162057",
+ "keywords": ["citation", "bibliography", "cff", "CITATION.cff"],
+ "license": "https://spdx.org/licenses/Apache-2.0",
+ "version": "1.4.0-alpha0",
+ }
+
+ result = MAPPINGS["CffMapping"]().translate(content)
+ assert expected == result
+
+
+def test_compute_metadata_cff_invalid_yaml():
+ """
+ The translator returns None for a CITATION.cff that is not valid YAML.
+ """
+ content = """cff-version: 1.0.3
+message: To cite the SigMF specification, please include the following:
+authors:
+ - name: The GNU Radio Foundation, Inc.
+ """.encode(
+ "utf-8"
+ )
+
+ expected = None
+
+ result = MAPPINGS["CffMapping"]().translate(content)
+ assert expected == result
+
+
+def test_compute_metadata_cff_empty():
+ """
+ The translator returns None for a CITATION.cff holding only whitespace.
+ """
+ content = """
+ """.encode(
+ "utf-8"
+ )
+
+ expected = None
+
+ result = MAPPINGS["CffMapping"]().translate(content)
+ assert expected == result
+
+
+def test_compute_metadata_cff_list():
+ """
+ The translator returns None when the YAML document is a list, not a mapping.
+ """
+ content = """
+- Foo
+- Bar
+ """.encode(
+ "utf-8"
+ )
+
+ expected = None
+
+ result = MAPPINGS["CffMapping"]().translate(content)
+ assert expected == result
+
+
+def test_cff_empty_fields():
+ """
+ Author fields left empty (family-names, orcid) are omitted from the output.
+ """
+ content = """# YAML 1.2
+ authors:
+ -
+ affiliation: "Hogwarts"
+ family-names:
+ given-names: Harry
+ -
+ affiliation: "Ministry of Magic"
+ family-names: Weasley
+ orcid:
+ given-names: Arthur
+
+
+ """.encode(
+ "utf-8"
+ )
+
+ expected = {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [
+ {
+ "type": "Person",
+ "affiliation": {
+ "type": "Organization",
+ "name": "Hogwarts",
+ },
+ "givenName": "Harry",
+ },
+ {
+ "type": "Person",
+ "affiliation": {
+ "type": "Organization",
+ "name": "Ministry of Magic",
+ },
+ "familyName": "Weasley",
+ "givenName": "Arthur",
+ },
+ ],
+ }
+
+ result = MAPPINGS["CffMapping"]().translate(content)
+ assert expected == result
+
+
+def test_cff_invalid_fields():
+ """
+ A list-valued family-names is dropped; the author's other fields are kept.
+ """
+ content = """# YAML 1.2
+ authors:
+ -
+ affiliation: "Hogwarts"
+ family-names:
+ - Potter
+ - James
+ given-names: Harry
+
+ """.encode(
+ "utf-8"
+ )
+
+ expected = {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [
+ {
+ "type": "Person",
+ "affiliation": {
+ "type": "Organization",
+ "name": "Hogwarts",
+ },
+ "givenName": "Harry",
+ },
+ ],
+ }
+
+ result = MAPPINGS["CffMapping"]().translate(content)
+ assert expected == result
diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
@@ -0,0 +1,175 @@
+# Copyright (C) 2017-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+
+from hypothesis import HealthCheck, given, settings
+
+from swh.indexer.codemeta import CODEMETA_TERMS
+from swh.indexer.metadata_detector import detect_metadata
+from swh.indexer.metadata_dictionary import MAPPINGS
+
+from ..utils import json_document_strategy
+
+
+def test_compute_metadata_valid_codemeta():  # translate a full codemeta.json
+ raw_content = b"""{
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "@type": "SoftwareSourceCode",
+ "identifier": "CodeMeta",
+ "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
+ "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
+ "codeRepository": "https://github.com/codemeta/codemeta",
+ "issueTracker": "https://github.com/codemeta/codemeta/issues",
+ "license": "https://spdx.org/licenses/Apache-2.0",
+ "version": "2.0",
+ "author": [
+ {
+ "@type": "Person",
+ "givenName": "Carl",
+ "familyName": "Boettiger",
+ "email": "cboettig@gmail.com",
+ "@id": "http://orcid.org/0000-0002-1642-628X"
+ },
+ {
+ "@type": "Person",
+ "givenName": "Matthew B.",
+ "familyName": "Jones",
+ "email": "jones@nceas.ucsb.edu",
+ "@id": "http://orcid.org/0000-0003-0077-4738"
+ }
+ ],
+ "maintainer": {
+ "@type": "Person",
+ "givenName": "Carl",
+ "familyName": "Boettiger",
+ "email": "cboettig@gmail.com",
+ "@id": "http://orcid.org/0000-0002-1642-628X"
+ },
+ "contIntegration": "https://travis-ci.org/codemeta/codemeta",
+ "developmentStatus": "active",
+ "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
+ "funder": {
+ "@id": "https://doi.org/10.13039/100000001",
+ "@type": "Organization",
+ "name": "National Science Foundation"
+ },
+ "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
+ "keywords": [
+ "metadata",
+ "software"
+ ],
+ "version":"2.0",
+ "dateCreated":"2017-06-05",
+ "datePublished":"2017-06-05",
+ "programmingLanguage": "JSON-LD"
+ }""" # noqa
+ expected_result = {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "identifier": "CodeMeta",
+ "description": "CodeMeta is a concept vocabulary that can "
+ "be used to standardize the exchange of software metadata "
+ "across repositories and organizations.",
+ "name": "CodeMeta: Minimal metadata schemas for science "
+ "software and code, in JSON-LD",
+ "codeRepository": "https://github.com/codemeta/codemeta",
+ "issueTracker": "https://github.com/codemeta/codemeta/issues",
+ "license": "https://spdx.org/licenses/Apache-2.0",
+ "version": "2.0",
+ "author": [
+ {
+ "type": "Person",
+ "givenName": "Carl",
+ "familyName": "Boettiger",
+ "email": "cboettig@gmail.com",
+ "id": "http://orcid.org/0000-0002-1642-628X",
+ },
+ {
+ "type": "Person",
+ "givenName": "Matthew B.",
+ "familyName": "Jones",
+ "email": "jones@nceas.ucsb.edu",
+ "id": "http://orcid.org/0000-0003-0077-4738",
+ },
+ ],
+ "maintainer": {
+ "type": "Person",
+ "givenName": "Carl",
+ "familyName": "Boettiger",
+ "email": "cboettig@gmail.com",
+ "id": "http://orcid.org/0000-0002-1642-628X",
+ },
+ "contIntegration": "https://travis-ci.org/codemeta/codemeta",
+ "developmentStatus": "active",
+ "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
+ "funder": {
+ "id": "https://doi.org/10.13039/100000001",
+ "type": "Organization",
+ "name": "National Science Foundation",
+ },
+ "funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
+ "in Scientific Software",
+ "keywords": ["metadata", "software"],
+ "version": "2.0",  # NOTE(review): repeats the "version" key above
+ "dateCreated": "2017-06-05",
+ "datePublished": "2017-06-05",
+ "programmingLanguage": "JSON-LD",
+ }
+ result = MAPPINGS["CodemetaMapping"]().translate(raw_content)
+ assert result == expected_result  # "@type"/"@id" come back as "type"/"id"
+
+
+def test_compute_metadata_codemeta_alternate_context():
+ raw_content = b"""{
+ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
+ "@type": "SoftwareSourceCode",
+ "identifier": "CodeMeta"
+ }""" # noqa
+ expected_result = {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "identifier": "CodeMeta",
+ }
+ result = MAPPINGS["CodemetaMapping"]().translate(raw_content)
+ assert result == expected_result  # output uses the doi.org context URL
+
+
+@settings(suppress_health_check=[HealthCheck.too_slow])
+@given(json_document_strategy(keys=CODEMETA_TERMS))
+def test_codemeta_adversarial(doc):  # only checks translate() does not raise
+ raw = json.dumps(doc).encode()
+ MAPPINGS["CodemetaMapping"]().translate(raw)
+
+
+def test_detect_metadata_codemeta_json_uppercase():
+ df = [
+ {
+ "sha1_git": b"abc",
+ "name": b"index.html",
+ "target": b"abc",
+ "length": 897,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"bcd",
+ },
+ {
+ "sha1_git": b"aab",
+ "name": b"CODEMETA.json",
+ "target": b"aab",
+ "length": 712,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"bcd",
+ },
+ ]
+ results = detect_metadata(df)
+
+ expected_results = {"CodemetaMapping": [b"bcd"]}
+ assert expected_results == results
diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/metadata_dictionary/test_maven.py
@@ -0,0 +1,365 @@
+# Copyright (C) 2017-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+
+from hypothesis import HealthCheck, given, settings
+
+from swh.indexer.metadata_dictionary import MAPPINGS
+
+from ..utils import xml_document_strategy
+
+
+def test_compute_metadata_maven():  # one repository, one license
+ raw_content = b"""
+
+ Maven Default Project
+ 4.0.0
+ com.mycompany.app
+ my-app
+ 1.2.3
+
+
+ central
+ Maven Repository Switchboard
+ default
+ http://repo1.maven.org/maven2
+
+ false
+
+
+
+
+
+ Apache License, Version 2.0
+ https://www.apache.org/licenses/LICENSE-2.0.txt
+ repo
+ A business-friendly OSS license
+
+
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {  # single values stay scalars, not lists
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
+ "codeRepository": ("http://repo1.maven.org/maven2/com/mycompany/app/my-app"),
+ }
+
+
+def test_compute_metadata_maven_empty():  # no fields -> only @context and type
+ raw_content = b"""
+
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
+
+
+def test_compute_metadata_maven_almost_empty():  # still only @context and type
+ raw_content = b"""
+
+
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
+
+
+def test_compute_metadata_maven_invalid_xml(caplog):
+ expected_warning = (
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error parsing XML from foo",
+ )
+ caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
+
+ raw_content = b"""
+ """
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning], result
+ assert result is None
+
+ raw_content = b"""
+ """
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning], result
+ assert result is None
+
+
+def test_compute_metadata_maven_unknown_encoding(caplog):
+ expected_warning = (
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error detecting XML encoding from foo",
+ )
+ caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
+
+ raw_content = b"""
+
+ """
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning], result
+ assert result is None
+
+ raw_content = b"""
+
+ """
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning], result
+ assert result is None
+
+
+def test_compute_metadata_maven_invalid_encoding(caplog):
+ expected_warning = [
+ # libexpat1 <= 2.2.10-2+deb11u1
+ [
+ (
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error unidecoding XML from foo",
+ )
+ ],
+ # libexpat1 >= 2.2.10-2+deb11u2
+ [
+ (
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error parsing XML from foo",
+ )
+ ],
+ ]
+ caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
+
+ raw_content = b"""
+
+ """
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples in expected_warning, result
+ assert result is None
+
+
+def test_compute_metadata_maven_minimal():  # name, ids and version only
+ raw_content = b"""
+
+ Maven Default Project
+ 4.0.0
+ com.mycompany.app
+ my-app
+ 1.2.3
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {  # codeRepository is built on repo.maven.apache.org
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
+
+
+def test_compute_metadata_maven_empty_nodes():  # five fixture variants
+ raw_content = b"""
+
+ Maven Default Project
+ 4.0.0
+ com.mycompany.app
+ my-app
+ 1.2.3
+
+
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {  # every populated field is present
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
+
+ raw_content = b"""
+
+ Maven Default Project
+ 4.0.0
+ com.mycompany.app
+ my-app
+
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {  # no "version" in the output
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
+
+ raw_content = b"""
+
+
+ 4.0.0
+ com.mycompany.app
+ my-app
+ 1.2.3
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {  # no "name" in the output
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
+
+ raw_content = b"""
+
+ Maven Default Project
+ 4.0.0
+ com.mycompany.app
+ my-app
+ 1.2.3
+
+
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {  # same expectation as the first variant
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
+
+ raw_content = b"""
+
+
+ 1.2.3
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {  # only "version" survives, no codeRepository
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "version": "1.2.3",
+ }
+
+
+def test_compute_metadata_maven_invalid_licenses():  # bad license is dropped
+ raw_content = b"""
+
+ Maven Default Project
+ 4.0.0
+ com.mycompany.app
+ my-app
+ 1.2.3
+
+ foo
+
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {  # no "license" key expected
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
+
+
+def test_compute_metadata_maven_multiple():
+ """Tests when there are multiple code repos and licenses."""
+ raw_content = b"""
+
+ Maven Default Project
+ 4.0.0
+ com.mycompany.app
+ my-app
+ 1.2.3
+
+
+ central
+ Maven Repository Switchboard
+ default
+ http://repo1.maven.org/maven2
+
+ false
+
+
+
+ example
+ Example Maven Repo
+ default
+ http://example.org/maven2
+
+
+
+
+ Apache License, Version 2.0
+ https://www.apache.org/licenses/LICENSE-2.0.txt
+ repo
+ A business-friendly OSS license
+
+
+ MIT license
+ https://opensource.org/licenses/MIT
+
+
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {  # "license" and "codeRepository" become lists
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "license": [
+ "https://www.apache.org/licenses/LICENSE-2.0.txt",
+ "https://opensource.org/licenses/MIT",
+ ],
+ "codeRepository": [
+ "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
+ "http://example.org/maven2/com/mycompany/app/my-app",
+ ],
+ }
+
+
+@settings(suppress_health_check=[HealthCheck.too_slow])
+@given(
+ xml_document_strategy(
+ keys=list(MAPPINGS["MavenMapping"].mapping), # type: ignore
+ root="project",
+ xmlns="http://maven.apache.org/POM/4.0.0",
+ )
+)
+def test_maven_adversarial(doc):  # only checks translate() does not raise
+ MAPPINGS["MavenMapping"]().translate(doc)
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -0,0 +1,322 @@
+# Copyright (C) 2017-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+
+from hypothesis import HealthCheck, given, settings
+import pytest
+
+from swh.indexer.metadata_detector import detect_metadata
+from swh.indexer.metadata_dictionary import MAPPINGS
+from swh.indexer.storage.model import ContentMetadataRow
+from swh.model.hashutil import hash_to_bytes
+
+from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer
+from ..utils import (
+ BASE_TEST_CONFIG,
+ fill_obj_storage,
+ fill_storage,
+ json_document_strategy,
+)
+
+
+def test_compute_metadata_none():
+ """
+ Translating empty content with the npm mapping
+ should return None
+ """
+ content = b""
+
+ # None if no metadata was found or an error occurred
+ declared_metadata = None
+ result = MAPPINGS["NpmMapping"]().translate(content)
+ assert declared_metadata == result
+
+
+def test_compute_metadata_npm():
+ """
+ Translate a package.json with repository and author fields into CodeMeta.
+ """
+ content = b"""
+ {
+ "name": "test_metadata",
+ "version": "0.0.2",
+ "description": "Simple package.json test for indexer",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/moranegg/metadata_test"
+ },
+ "author": {
+ "email": "moranegg@example.com",
+ "name": "Morane G"
+ }
+ }
+ """
+ declared_metadata = {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "test_metadata",
+ "version": "0.0.2",
+ "description": "Simple package.json test for indexer",
+ "codeRepository": "git+https://github.com/moranegg/metadata_test",
+ "author": [
+ {
+ "type": "Person",
+ "name": "Morane G",
+ "email": "moranegg@example.com",
+ }
+ ],
+ }
+
+ result = MAPPINGS["NpmMapping"]().translate(content)
+ assert declared_metadata == result
+
+
+def test_compute_metadata_invalid_description_npm():
+ """
+ A numeric (non-string) description is left out of the translated metadata.
+ """
+ content = b"""
+ {
+ "name": "test_metadata",
+ "version": "0.0.2",
+ "description": 1234
+ }
+ """
+ declared_metadata = {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "test_metadata",
+ "version": "0.0.2",
+ }
+
+ result = MAPPINGS["NpmMapping"]().translate(content)
+ assert declared_metadata == result
+
+
+def test_index_content_metadata_npm():
+ """
+ Index three contents and compare the stored rows with the expected metadata.
+ - one sha1 uses a file that can't be translated to metadata, so only
+ two rows are expected back from the indexer storage
+ """
+ sha1s = [
+ hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
+ hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
+ hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"),
+ ]
+ # this metadata indexer computes only metadata for package.json
+ # in npm context with a hard mapping
+ config = BASE_TEST_CONFIG.copy()
+ config["tools"] = [TRANSLATOR_TOOL]
+ metadata_indexer = ContentMetadataTestIndexer(config=config)
+ fill_obj_storage(metadata_indexer.objstorage)
+ fill_storage(metadata_indexer.storage)
+
+ metadata_indexer.run(sha1s)
+ results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
+
+ expected_results = [
+ ContentMetadataRow(
+ id=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
+ tool=TRANSLATOR_TOOL,
+ metadata={
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "codeRepository": "git+https://github.com/moranegg/metadata_test",
+ "description": "Simple package.json test for indexer",
+ "name": "test_metadata",
+ "version": "0.0.1",
+ },
+ ),
+ ContentMetadataRow(
+ id=hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
+ tool=TRANSLATOR_TOOL,
+ metadata={
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "issueTracker": "https://github.com/npm/npm/issues",
+ "author": [
+ {
+ "type": "Person",
+ "name": "Isaac Z. Schlueter",
+ "email": "i@izs.me",
+ "url": "http://blog.izs.me",
+ }
+ ],
+ "codeRepository": "git+https://github.com/npm/npm",
+ "description": "a package manager for JavaScript",
+ "license": "https://spdx.org/licenses/Artistic-2.0",
+ "version": "5.0.3",
+ "name": "npm",
+ "keywords": [
+ "install",
+ "modules",
+ "package manager",
+ "package.json",
+ ],
+ "url": "https://docs.npmjs.com/",
+ },
+ ),
+ ]
+
+ for result in results:  # presumably the id is storage-assigned; TODO confirm
+ del result.tool["id"]
+
+ # The assertion below returns False sometimes because of nested lists
+ assert expected_results == results
+
+
+def test_npm_bugs_normalization():  # "bugs" -> "issueTracker"
+ # valid dictionary
+ package_json = b"""{
+ "name": "foo",
+ "bugs": {
+ "url": "https://github.com/owner/project/issues",
+ "email": "foo@example.com"
+ }
+ }"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {  # only the url is kept, the email is not
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "issueTracker": "https://github.com/owner/project/issues",
+ "type": "SoftwareSourceCode",
+ }
+
+ # "invalid" dictionary
+ package_json = b"""{
+ "name": "foo",
+ "bugs": {
+ "email": "foo@example.com"
+ }
+ }"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {  # no url -> no issueTracker
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "type": "SoftwareSourceCode",
+ }
+
+ # string
+ package_json = b"""{
+ "name": "foo",
+ "bugs": "https://github.com/owner/project/issues"
+ }"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "issueTracker": "https://github.com/owner/project/issues",
+ "type": "SoftwareSourceCode",
+ }
+
+
+def test_npm_repository_normalization():  # "repository" -> "codeRepository"
+ # normal
+ package_json = b"""{
+ "name": "foo",
+ "repository": {
+ "type" : "git",
+ "url" : "https://github.com/npm/cli.git"
+ }
+ }"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {  # type and url are joined as "git+<url>"
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "codeRepository": "git+https://github.com/npm/cli.git",
+ "type": "SoftwareSourceCode",
+ }
+
+ # missing url
+ package_json = b"""{
+ "name": "foo",
+ "repository": {
+ "type" : "git"
+ }
+ }"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {  # no codeRepository without a url
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "type": "SoftwareSourceCode",
+ }
+
+ # github shortcut
+ package_json = b"""{
+ "name": "foo",
+ "repository": "github:npm/cli"
+ }"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ expected_result = {  # reused by the "shortshortcut" case below
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "codeRepository": "git+https://github.com/npm/cli.git",
+ "type": "SoftwareSourceCode",
+ }
+ assert result == expected_result
+
+ # github shortshortcut
+ package_json = b"""{
+ "name": "foo",
+ "repository": "npm/cli"
+ }"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == expected_result
+
+ # gitlab shortcut
+ package_json = b"""{
+ "name": "foo",
+ "repository": "gitlab:user/repo"
+ }"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "codeRepository": "git+https://gitlab.com/user/repo.git",
+ "type": "SoftwareSourceCode",
+ }
+
+
+@settings(suppress_health_check=[HealthCheck.too_slow])
+@given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping))) # type: ignore
+def test_npm_adversarial(doc):  # only checks translate() does not raise
+ raw = json.dumps(doc).encode()
+ MAPPINGS["NpmMapping"]().translate(raw)
+
+
+@pytest.mark.parametrize(
+ "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
+)
+def test_detect_metadata_package_json(filename):  # every casing must match
+ df = [
+ {
+ "sha1_git": b"abc",
+ "name": b"index.js",
+ "target": b"abc",
+ "length": 897,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"bcd",
+ },
+ {
+ "sha1_git": b"aab",
+ "name": filename,
+ "target": b"aab",
+ "length": 712,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"cde",
+ },
+ ]
+ results = detect_metadata(df)
+
+ expected_results = {"NpmMapping": [b"cde"]}  # sha1 of the package.json entry
+ assert expected_results == results
diff --git a/swh/indexer/tests/metadata_dictionary/test_python.py b/swh/indexer/tests/metadata_dictionary/test_python.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/metadata_dictionary/test_python.py
@@ -0,0 +1,114 @@
+# Copyright (C) 2017-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.indexer.metadata_dictionary import MAPPINGS
+
+
+def test_compute_metadata_pkginfo():  # full PKG-INFO style document
+ raw_content = b"""\
+Metadata-Version: 2.1
+Name: swh.core
+Version: 0.0.49
+Summary: Software Heritage core utilities
+Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
+Author: Software Heritage developers
+Author-email: swh-devel@inria.fr
+License: UNKNOWN
+Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
+Project-URL: Funding, https://www.softwareheritage.org/donate
+Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
+Description: swh-core
+ ========
+ \x20
+ core library for swh's modules:
+ - config parser
+ - hash computations
+ - serialization
+ - logging mechanism
+ \x20
+Platform: UNKNOWN
+Classifier: Programming Language :: Python :: 3
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 5 - Production/Stable
+Description-Content-Type: text/markdown
+Provides-Extra: testing
+""" # noqa
+ result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content)
+ assert result["description"] == [  # Summary first, then Description
+ "Software Heritage core utilities", # note the comma here
+ "swh-core\n"
+ "========\n"
+ "\n"
+ "core library for swh's modules:\n"
+ "- config parser\n"
+ "- hash computations\n"
+ "- serialization\n"
+ "- logging mechanism\n"
+ "",
+ ], result
+ del result["description"]  # already checked above
+ assert result == {  # the "License: UNKNOWN" header yields no "license" key
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "url": "https://forge.softwareheritage.org/diffusion/DCORE/",
+ "name": "swh.core",
+ "author": [
+ {
+ "type": "Person",
+ "name": "Software Heritage developers",
+ "email": "swh-devel@inria.fr",
+ }
+ ],
+ "version": "0.0.49",
+ }
+
+
+def test_compute_metadata_pkginfo_utf8():  # UTF-8 bytes in Description
+ raw_content = b"""\
+Metadata-Version: 1.1
+Name: snowpyt
+Description-Content-Type: UNKNOWN
+Description: foo
+ Hydrology N\xc2\xb083
+""" # noqa
+ result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content)
+ assert result == {  # b"\xc2\xb0" must come back as the degree sign
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "snowpyt",
+ "description": "foo\nHydrology N°83",
+ }
+
+
+def test_compute_metadata_pkginfo_keywords():  # Keywords header -> list
+ raw_content = b"""\
+Metadata-Version: 2.1
+Name: foo
+Keywords: foo bar baz
+""" # noqa
+ result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content)
+ assert result == {  # the space-separated value is split into three items
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "foo",
+ "keywords": ["foo", "bar", "baz"],
+ }
+
+
+def test_compute_metadata_pkginfo_license():  # License header -> "license"
+ raw_content = b"""\
+Metadata-Version: 2.1
+Name: foo
+License: MIT
+""" # noqa
+ result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content)
+ assert result == {  # the value is expected verbatim, not as an SPDX URL
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "foo",
+ "license": "MIT",
+ }
diff --git a/swh/indexer/tests/metadata_dictionary/test_ruby.py b/swh/indexer/tests/metadata_dictionary/test_ruby.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/metadata_dictionary/test_ruby.py
@@ -0,0 +1,134 @@
+# Copyright (C) 2017-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from hypothesis import HealthCheck, given, settings, strategies
+
+from swh.indexer.metadata_dictionary import MAPPINGS
+
+
+def test_gemspec_base():
+ raw_content = b"""
+Gem::Specification.new do |s|
+s.name = 'example'
+s.version = '0.1.0'
+s.licenses = ['MIT']
+s.summary = "This is an example!"
+s.description = "Much longer explanation of the example!"
+s.authors = ["Ruby Coder"]
+s.email = 'rubycoder@example.com'
+s.files = ["lib/example.rb"]
+s.homepage = 'https://rubygems.org/gems/example'
+s.metadata = { "source_code_uri" => "https://github.com/example/example" }
+end"""
+ result = MAPPINGS["GemspecMapping"]().translate(raw_content)
+ assert set(result.pop("description")) == {
+ "This is an example!",
+ "Much longer explanation of the example!",
+ }
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"type": "Person", "name": "Ruby Coder"}],
+ "name": "example",
+ "license": "https://spdx.org/licenses/MIT",
+ "codeRepository": "https://rubygems.org/gems/example",
+ "email": "rubycoder@example.com",
+ "version": "0.1.0",
+ }
+
+
+def test_gemspec_two_author_fields():
+ raw_content = b"""
+Gem::Specification.new do |s|
+s.authors = ["Ruby Coder1"]
+s.author = "Ruby Coder2"
+end"""
+ result = MAPPINGS["GemspecMapping"]().translate(raw_content)
+ assert result.pop("author") in (
+ [
+ {"type": "Person", "name": "Ruby Coder1"},
+ {"type": "Person", "name": "Ruby Coder2"},
+ ],
+ [
+ {"type": "Person", "name": "Ruby Coder2"},
+ {"type": "Person", "name": "Ruby Coder1"},
+ ],
+ )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
+
+
+def test_gemspec_invalid_author():
+ raw_content = b"""
+Gem::Specification.new do |s|
+s.author = ["Ruby Coder"]
+end"""
+ result = MAPPINGS["GemspecMapping"]().translate(raw_content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
+ raw_content = b"""
+Gem::Specification.new do |s|
+s.author = "Ruby Coder1",
+end"""
+ result = MAPPINGS["GemspecMapping"]().translate(raw_content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
+ raw_content = b"""
+Gem::Specification.new do |s|
+s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
+end"""
+ result = MAPPINGS["GemspecMapping"]().translate(raw_content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"type": "Person", "name": "Ruby Coder1"}],
+ }
+
+
+def test_gemspec_alternative_header():
+ raw_content = b"""
+require './lib/version'
+
+Gem::Specification.new { |s|
+s.name = 'rb-system-with-aliases'
+s.summary = 'execute system commands with aliases'
+}
+"""
+ result = MAPPINGS["GemspecMapping"]().translate(raw_content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "rb-system-with-aliases",
+ "description": "execute system commands with aliases",
+ }
+
+
+@settings(suppress_health_check=[HealthCheck.too_slow])
+@given(
+ strategies.dictionaries(
+ # keys: arbitrary text, or any entry of GemspecMapping.mapping
+ strategies.one_of(
+ strategies.text(),
+ *map(strategies.just, MAPPINGS["GemspecMapping"].mapping), # type: ignore
+ ),
+ # values: single characters, or nested non-empty lists of them
+ strategies.recursive(
+ strategies.characters(),
+ lambda children: strategies.lists(children, min_size=1),
+ ),
+ )
+)
+def test_gemspec_adversarial(doc):
+ parts = [b"Gem::Specification.new do |s|\n"]
+ for (k, v) in doc.items():
+ parts.append(" s.{} = {}\n".format(k, repr(v)).encode())
+ parts.append(b"end\n")
+ MAPPINGS["GemspecMapping"]().translate(b"".join(parts))
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -3,22 +3,9 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
-import logging
-
-from hypothesis import HealthCheck, given, settings, strategies
-import pytest
-
-from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer
-from swh.indexer.metadata_detector import detect_metadata
-from swh.indexer.metadata_dictionary import MAPPINGS
-from swh.indexer.metadata_dictionary.maven import MavenMapping
-from swh.indexer.metadata_dictionary.npm import NpmMapping
-from swh.indexer.metadata_dictionary.ruby import GemspecMapping
from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow
from swh.indexer.tests.utils import DIRECTORY2
-from swh.model.hashutil import hash_to_bytes
from swh.model.model import Directory, DirectoryEntry
from .utils import (
@@ -26,8 +13,6 @@
YARN_PARSER_METADATA,
fill_obj_storage,
fill_storage,
- json_document_strategy,
- xml_document_strategy,
)
TRANSLATOR_TOOL = {
@@ -57,1234 +42,6 @@
Tests metadata_mock_tool tool for Metadata detection
"""
- def setup_method(self):
- self.npm_mapping = MAPPINGS["NpmMapping"]()
- self.codemeta_mapping = MAPPINGS["CodemetaMapping"]()
- self.maven_mapping = MAPPINGS["MavenMapping"]()
- self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]()
- self.gemspec_mapping = MAPPINGS["GemspecMapping"]()
- self.cff_mapping = MAPPINGS["CffMapping"]()
-
- def test_compute_metadata_none(self):
- """
- testing content empty content is empty
- should return None
- """
- content = b""
-
- # None if no metadata was found or an error occurred
- declared_metadata = None
- result = self.npm_mapping.translate(content)
- assert declared_metadata == result
-
- def test_compute_metadata_cff(self):
- """
- testing CITATION.cff translation
- """
- content = """# YAML 1.2
----
-abstract: "Command line program to convert from Citation File \
-Format to various other formats such as BibTeX, EndNote, RIS, \
-schema.org, CodeMeta, and .zenodo.json."
-authors:
- -
- affiliation: "Netherlands eScience Center"
- family-names: Klaver
- given-names: Tom
- -
- affiliation: "Humboldt-Universität zu Berlin"
- family-names: Druskat
- given-names: Stephan
- orcid: https://orcid.org/0000-0003-4925-7248
-cff-version: "1.0.3"
-date-released: 2019-11-12
-doi: 10.5281/zenodo.1162057
-keywords:
- - "citation"
- - "bibliography"
- - "cff"
- - "CITATION.cff"
-license: Apache-2.0
-message: "If you use this software, please cite it using these metadata."
-repository-code: "https://github.com/citation-file-format/cff-converter-python"
-title: cffconvert
-version: "1.4.0-alpha0"
- """.encode(
- "utf-8"
- )
-
- expected = {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "author": [
- {
- "type": "Person",
- "affiliation": {
- "type": "Organization",
- "name": "Netherlands eScience Center",
- },
- "familyName": "Klaver",
- "givenName": "Tom",
- },
- {
- "id": "https://orcid.org/0000-0003-4925-7248",
- "type": "Person",
- "affiliation": {
- "type": "Organization",
- "name": "Humboldt-Universität zu Berlin",
- },
- "familyName": "Druskat",
- "givenName": "Stephan",
- },
- ],
- "codeRepository": (
- "https://github.com/citation-file-format/cff-converter-python"
- ),
- "datePublished": "2019-11-12",
- "description": """Command line program to convert from \
-Citation File Format to various other formats such as BibTeX, EndNote, \
-RIS, schema.org, CodeMeta, and .zenodo.json.""",
- "identifier": "https://doi.org/10.5281/zenodo.1162057",
- "keywords": ["citation", "bibliography", "cff", "CITATION.cff"],
- "license": "https://spdx.org/licenses/Apache-2.0",
- "version": "1.4.0-alpha0",
- }
-
- result = self.cff_mapping.translate(content)
- assert expected == result
-
- def test_compute_metadata_cff_invalid_yaml(self):
- """
- test yaml translation for invalid yaml file
- """
- content = """cff-version: 1.0.3
-message: To cite the SigMF specification, please include the following:
-authors:
- - name: The GNU Radio Foundation, Inc.
- """.encode(
- "utf-8"
- )
-
- expected = None
-
- result = self.cff_mapping.translate(content)
- assert expected == result
-
- def test_compute_metadata_cff_empty(self):
- """
- test yaml translation for empty yaml file
- """
- content = """
- """.encode(
- "utf-8"
- )
-
- expected = None
-
- result = self.cff_mapping.translate(content)
- assert expected == result
-
- def test_compute_metadata_cff_list(self):
- """
- test yaml translation for empty yaml file
- """
- content = """
-- Foo
-- Bar
- """.encode(
- "utf-8"
- )
-
- expected = None
-
- result = self.cff_mapping.translate(content)
- assert expected == result
-
- def test_cff_empty_fields(self):
- """
- testing CITATION.cff translation
- """
- content = """# YAML 1.2
- authors:
- -
- affiliation: "Hogwarts"
- family-names:
- given-names: Harry
- -
- affiliation: "Ministry of Magic"
- family-names: Weasley
- orcid:
- given-names: Arthur
- """.encode(
- "utf-8"
- )
-
- expected = {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "author": [
- {
- "type": "Person",
- "affiliation": {
- "type": "Organization",
- "name": "Hogwarts",
- },
- "givenName": "Harry",
- },
- {
- "type": "Person",
- "affiliation": {
- "type": "Organization",
- "name": "Ministry of Magic",
- },
- "familyName": "Weasley",
- "givenName": "Arthur",
- },
- ],
- }
-
- result = self.cff_mapping.translate(content)
- assert expected == result
-
- def test_cff_invalid_fields(self):
- """
- testing CITATION.cff translation
- """
- content = """# YAML 1.2
- authors:
- -
- affiliation: "Hogwarts"
- family-names:
- - Potter
- - James
- given-names: Harry
- """.encode(
- "utf-8"
- )
-
- expected = {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "author": [
- {
- "type": "Person",
- "affiliation": {
- "type": "Organization",
- "name": "Hogwarts",
- },
- "givenName": "Harry",
- },
- ],
- }
-
- result = self.cff_mapping.translate(content)
- assert expected == result
-
- def test_compute_metadata_npm(self):
- """
- testing only computation of metadata with hard_mapping_npm
- """
- content = b"""
- {
- "name": "test_metadata",
- "version": "0.0.2",
- "description": "Simple package.json test for indexer",
- "repository": {
- "type": "git",
- "url": "https://github.com/moranegg/metadata_test"
- },
- "author": {
- "email": "moranegg@example.com",
- "name": "Morane G"
- }
- }
- """
- declared_metadata = {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "test_metadata",
- "version": "0.0.2",
- "description": "Simple package.json test for indexer",
- "codeRepository": "git+https://github.com/moranegg/metadata_test",
- "author": [
- {
- "type": "Person",
- "name": "Morane G",
- "email": "moranegg@example.com",
- }
- ],
- }
-
- result = self.npm_mapping.translate(content)
- assert declared_metadata == result
-
- def test_compute_metadata_invalid_description_npm(self):
- """
- testing only computation of metadata with hard_mapping_npm
- """
- content = b"""
- {
- "name": "test_metadata",
- "version": "0.0.2",
- "description": 1234
- }
- """
- declared_metadata = {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "test_metadata",
- "version": "0.0.2",
- }
-
- result = self.npm_mapping.translate(content)
- assert declared_metadata == result
-
- def test_index_content_metadata_npm(self):
- """
- testing NPM with package.json
- - one sha1 uses a file that can't be translated to metadata and
- should return None in the translated metadata
- """
- sha1s = [
- hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
- hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
- hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"),
- ]
- # this metadata indexer computes only metadata for package.json
- # in npm context with a hard mapping
- config = BASE_TEST_CONFIG.copy()
- config["tools"] = [TRANSLATOR_TOOL]
- metadata_indexer = ContentMetadataTestIndexer(config=config)
- fill_obj_storage(metadata_indexer.objstorage)
- fill_storage(metadata_indexer.storage)
-
- metadata_indexer.run(sha1s)
- results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
-
- expected_results = [
- ContentMetadataRow(
- id=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
- tool=TRANSLATOR_TOOL,
- metadata={
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "codeRepository": "git+https://github.com/moranegg/metadata_test",
- "description": "Simple package.json test for indexer",
- "name": "test_metadata",
- "version": "0.0.1",
- },
- ),
- ContentMetadataRow(
- id=hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
- tool=TRANSLATOR_TOOL,
- metadata={
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "issueTracker": "https://github.com/npm/npm/issues",
- "author": [
- {
- "type": "Person",
- "name": "Isaac Z. Schlueter",
- "email": "i@izs.me",
- "url": "http://blog.izs.me",
- }
- ],
- "codeRepository": "git+https://github.com/npm/npm",
- "description": "a package manager for JavaScript",
- "license": "https://spdx.org/licenses/Artistic-2.0",
- "version": "5.0.3",
- "name": "npm",
- "keywords": [
- "install",
- "modules",
- "package manager",
- "package.json",
- ],
- "url": "https://docs.npmjs.com/",
- },
- ),
- ]
-
- for result in results:
- del result.tool["id"]
-
- # The assertion below returns False sometimes because of nested lists
- assert expected_results == results
-
- def test_npm_bugs_normalization(self):
- # valid dictionary
- package_json = b"""{
- "name": "foo",
- "bugs": {
- "url": "https://github.com/owner/project/issues",
- "email": "foo@example.com"
- }
- }"""
- result = self.npm_mapping.translate(package_json)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "issueTracker": "https://github.com/owner/project/issues",
- "type": "SoftwareSourceCode",
- }
-
- # "invalid" dictionary
- package_json = b"""{
- "name": "foo",
- "bugs": {
- "email": "foo@example.com"
- }
- }"""
- result = self.npm_mapping.translate(package_json)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "type": "SoftwareSourceCode",
- }
-
- # string
- package_json = b"""{
- "name": "foo",
- "bugs": "https://github.com/owner/project/issues"
- }"""
- result = self.npm_mapping.translate(package_json)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "issueTracker": "https://github.com/owner/project/issues",
- "type": "SoftwareSourceCode",
- }
-
- def test_npm_repository_normalization(self):
- # normal
- package_json = b"""{
- "name": "foo",
- "repository": {
- "type" : "git",
- "url" : "https://github.com/npm/cli.git"
- }
- }"""
- result = self.npm_mapping.translate(package_json)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "codeRepository": "git+https://github.com/npm/cli.git",
- "type": "SoftwareSourceCode",
- }
-
- # missing url
- package_json = b"""{
- "name": "foo",
- "repository": {
- "type" : "git"
- }
- }"""
- result = self.npm_mapping.translate(package_json)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "type": "SoftwareSourceCode",
- }
-
- # github shortcut
- package_json = b"""{
- "name": "foo",
- "repository": "github:npm/cli"
- }"""
- result = self.npm_mapping.translate(package_json)
- expected_result = {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "codeRepository": "git+https://github.com/npm/cli.git",
- "type": "SoftwareSourceCode",
- }
- assert result == expected_result
-
- # github shortshortcut
- package_json = b"""{
- "name": "foo",
- "repository": "npm/cli"
- }"""
- result = self.npm_mapping.translate(package_json)
- assert result == expected_result
-
- # gitlab shortcut
- package_json = b"""{
- "name": "foo",
- "repository": "gitlab:user/repo"
- }"""
- result = self.npm_mapping.translate(package_json)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "codeRepository": "git+https://gitlab.com/user/repo.git",
- "type": "SoftwareSourceCode",
- }
-
- @pytest.mark.parametrize(
- "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
- )
- def test_detect_metadata_package_json(self, filename):
- df = [
- {
- "sha1_git": b"abc",
- "name": b"index.js",
- "target": b"abc",
- "length": 897,
- "status": "visible",
- "type": "file",
- "perms": 33188,
- "dir_id": b"dir_a",
- "sha1": b"bcd",
- },
- {
- "sha1_git": b"aab",
- "name": filename,
- "target": b"aab",
- "length": 712,
- "status": "visible",
- "type": "file",
- "perms": 33188,
- "dir_id": b"dir_a",
- "sha1": b"cde",
- },
- ]
- results = detect_metadata(df)
-
- expected_results = {"NpmMapping": [b"cde"]}
- assert expected_results == results
-
- def test_detect_metadata_codemeta_json_uppercase(self):
- df = [
- {
- "sha1_git": b"abc",
- "name": b"index.html",
- "target": b"abc",
- "length": 897,
- "status": "visible",
- "type": "file",
- "perms": 33188,
- "dir_id": b"dir_a",
- "sha1": b"bcd",
- },
- {
- "sha1_git": b"aab",
- "name": b"CODEMETA.json",
- "target": b"aab",
- "length": 712,
- "status": "visible",
- "type": "file",
- "perms": 33188,
- "dir_id": b"dir_a",
- "sha1": b"bcd",
- },
- ]
- results = detect_metadata(df)
-
- expected_results = {"CodemetaMapping": [b"bcd"]}
- assert expected_results == results
-
- def test_compute_metadata_valid_codemeta(self):
- raw_content = b"""{
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "@type": "SoftwareSourceCode",
- "identifier": "CodeMeta",
- "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
- "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
- "codeRepository": "https://github.com/codemeta/codemeta",
- "issueTracker": "https://github.com/codemeta/codemeta/issues",
- "license": "https://spdx.org/licenses/Apache-2.0",
- "version": "2.0",
- "author": [
- {
- "@type": "Person",
- "givenName": "Carl",
- "familyName": "Boettiger",
- "email": "cboettig@gmail.com",
- "@id": "http://orcid.org/0000-0002-1642-628X"
- },
- {
- "@type": "Person",
- "givenName": "Matthew B.",
- "familyName": "Jones",
- "email": "jones@nceas.ucsb.edu",
- "@id": "http://orcid.org/0000-0003-0077-4738"
- }
- ],
- "maintainer": {
- "@type": "Person",
- "givenName": "Carl",
- "familyName": "Boettiger",
- "email": "cboettig@gmail.com",
- "@id": "http://orcid.org/0000-0002-1642-628X"
- },
- "contIntegration": "https://travis-ci.org/codemeta/codemeta",
- "developmentStatus": "active",
- "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
- "funder": {
- "@id": "https://doi.org/10.13039/100000001",
- "@type": "Organization",
- "name": "National Science Foundation"
- },
- "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
- "keywords": [
- "metadata",
- "software"
- ],
- "version":"2.0",
- "dateCreated":"2017-06-05",
- "datePublished":"2017-06-05",
- "programmingLanguage": "JSON-LD"
- }""" # noqa
- expected_result = {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "identifier": "CodeMeta",
- "description": "CodeMeta is a concept vocabulary that can "
- "be used to standardize the exchange of software metadata "
- "across repositories and organizations.",
- "name": "CodeMeta: Minimal metadata schemas for science "
- "software and code, in JSON-LD",
- "codeRepository": "https://github.com/codemeta/codemeta",
- "issueTracker": "https://github.com/codemeta/codemeta/issues",
- "license": "https://spdx.org/licenses/Apache-2.0",
- "version": "2.0",
- "author": [
- {
- "type": "Person",
- "givenName": "Carl",
- "familyName": "Boettiger",
- "email": "cboettig@gmail.com",
- "id": "http://orcid.org/0000-0002-1642-628X",
- },
- {
- "type": "Person",
- "givenName": "Matthew B.",
- "familyName": "Jones",
- "email": "jones@nceas.ucsb.edu",
- "id": "http://orcid.org/0000-0003-0077-4738",
- },
- ],
- "maintainer": {
- "type": "Person",
- "givenName": "Carl",
- "familyName": "Boettiger",
- "email": "cboettig@gmail.com",
- "id": "http://orcid.org/0000-0002-1642-628X",
- },
- "contIntegration": "https://travis-ci.org/codemeta/codemeta",
- "developmentStatus": "active",
- "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
- "funder": {
- "id": "https://doi.org/10.13039/100000001",
- "type": "Organization",
- "name": "National Science Foundation",
- },
- "funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
- "in Scientific Software",
- "keywords": ["metadata", "software"],
- "version": "2.0",
- "dateCreated": "2017-06-05",
- "datePublished": "2017-06-05",
- "programmingLanguage": "JSON-LD",
- }
- result = self.codemeta_mapping.translate(raw_content)
- assert result == expected_result
-
- def test_compute_metadata_codemeta_alternate_context(self):
- raw_content = b"""{
- "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
- "@type": "SoftwareSourceCode",
- "identifier": "CodeMeta"
- }""" # noqa
- expected_result = {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "identifier": "CodeMeta",
- }
- result = self.codemeta_mapping.translate(raw_content)
- assert result == expected_result
-
- def test_compute_metadata_maven(self):
- raw_content = b"""
-
- Maven Default Project
- 4.0.0
- com.mycompany.app
- my-app
- 1.2.3
-
-
- central
- Maven Repository Switchboard
- default
- http://repo1.maven.org/maven2
-
- false
-
-
-
-
-
- Apache License, Version 2.0
- https://www.apache.org/licenses/LICENSE-2.0.txt
- repo
- A business-friendly OSS license
-
-
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
- "codeRepository": (
- "http://repo1.maven.org/maven2/com/mycompany/app/my-app"
- ),
- }
-
- def test_compute_metadata_maven_empty(self):
- raw_content = b"""
-
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- }
-
- def test_compute_metadata_maven_almost_empty(self):
- raw_content = b"""
-
-
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- }
-
- def test_compute_metadata_maven_invalid_xml(self, caplog):
- expected_warning = (
- "swh.indexer.metadata_dictionary.maven.MavenMapping",
- logging.WARNING,
- "Error parsing XML from foo",
- )
- caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
-
- raw_content = b"""
- """
- caplog.clear()
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- assert caplog.record_tuples == [expected_warning]
- assert result is None
-
- raw_content = b"""
- """
- caplog.clear()
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- assert caplog.record_tuples == [expected_warning]
- assert result is None
-
- def test_compute_metadata_maven_unknown_encoding(self, caplog):
- expected_warning = (
- "swh.indexer.metadata_dictionary.maven.MavenMapping",
- logging.WARNING,
- "Error detecting XML encoding from foo",
- )
- caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
-
- raw_content = b"""
-
- """
- caplog.clear()
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- assert caplog.record_tuples == [expected_warning]
- assert result is None
-
- raw_content = b"""
-
- """
- caplog.clear()
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- assert caplog.record_tuples == [expected_warning]
- assert result is None
-
- def test_compute_metadata_maven_invalid_encoding(self, caplog):
- expected_warning = [
- # libexpat1 <= 2.2.10-2+deb11u1
- [
- (
- "swh.indexer.metadata_dictionary.maven.MavenMapping",
- logging.WARNING,
- "Error unidecoding XML from foo",
- )
- ],
- # libexpat1 >= 2.2.10-2+deb11u2
- [
- (
- "swh.indexer.metadata_dictionary.maven.MavenMapping",
- logging.WARNING,
- "Error parsing XML from foo",
- )
- ],
- ]
- caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
-
- raw_content = b"""
-
- """
- caplog.clear()
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- assert caplog.record_tuples in expected_warning
- assert result is None
-
- def test_compute_metadata_maven_minimal(self):
- raw_content = b"""
-
- Maven Default Project
- 4.0.0
- com.mycompany.app
- my-app
- 1.2.3
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- }
-
- def test_compute_metadata_maven_empty_nodes(self):
- raw_content = b"""
-
- Maven Default Project
- 4.0.0
- com.mycompany.app
- my-app
- 1.2.3
-
-
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- }
-
- raw_content = b"""
-
- Maven Default Project
- 4.0.0
- com.mycompany.app
- my-app
-
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- }
-
- raw_content = b"""
-
-
- 4.0.0
- com.mycompany.app
- my-app
- 1.2.3
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- }
-
- raw_content = b"""
-
- Maven Default Project
- 4.0.0
- com.mycompany.app
- my-app
- 1.2.3
-
-
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- }
-
- raw_content = b"""
-
-
- 1.2.3
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "version": "1.2.3",
- }
-
- def test_compute_metadata_maven_invalid_licenses(self):
- raw_content = b"""
-
- Maven Default Project
- 4.0.0
- com.mycompany.app
- my-app
- 1.2.3
-
- foo
-
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- }
-
- def test_compute_metadata_maven_multiple(self):
- """Tests when there are multiple code repos and licenses."""
- raw_content = b"""
-
- Maven Default Project
- 4.0.0
- com.mycompany.app
- my-app
- 1.2.3
-
-
- central
- Maven Repository Switchboard
- default
- http://repo1.maven.org/maven2
-
- false
-
-
-
- example
- Example Maven Repo
- default
- http://example.org/maven2
-
-
-
-
- Apache License, Version 2.0
- https://www.apache.org/licenses/LICENSE-2.0.txt
- repo
- A business-friendly OSS license
-
-
- MIT license
- https://opensource.org/licenses/MIT
-
-
- """
- result = self.maven_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "license": [
- "https://www.apache.org/licenses/LICENSE-2.0.txt",
- "https://opensource.org/licenses/MIT",
- ],
- "codeRepository": [
- "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
- "http://example.org/maven2/com/mycompany/app/my-app",
- ],
- }
-
- def test_compute_metadata_pkginfo(self):
- raw_content = b"""\
-Metadata-Version: 2.1
-Name: swh.core
-Version: 0.0.49
-Summary: Software Heritage core utilities
-Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
-Author: Software Heritage developers
-Author-email: swh-devel@inria.fr
-License: UNKNOWN
-Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
-Project-URL: Funding, https://www.softwareheritage.org/donate
-Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
-Description: swh-core
- ========
- \x20
- core library for swh's modules:
- - config parser
- - hash computations
- - serialization
- - logging mechanism
- \x20
-Platform: UNKNOWN
-Classifier: Programming Language :: Python :: 3
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
-Classifier: Operating System :: OS Independent
-Classifier: Development Status :: 5 - Production/Stable
-Description-Content-Type: text/markdown
-Provides-Extra: testing
-""" # noqa
- result = self.pkginfo_mapping.translate(raw_content)
- assert result["description"] == [
- "Software Heritage core utilities", # note the comma here
- "swh-core\n"
- "========\n"
- "\n"
- "core library for swh's modules:\n"
- "- config parser\n"
- "- hash computations\n"
- "- serialization\n"
- "- logging mechanism\n"
- "",
- ], result
- del result["description"]
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "url": "https://forge.softwareheritage.org/diffusion/DCORE/",
- "name": "swh.core",
- "author": [
- {
- "type": "Person",
- "name": "Software Heritage developers",
- "email": "swh-devel@inria.fr",
- }
- ],
- "version": "0.0.49",
- }
-
- def test_compute_metadata_pkginfo_utf8(self):
- raw_content = b"""\
-Metadata-Version: 1.1
-Name: snowpyt
-Description-Content-Type: UNKNOWN
-Description: foo
- Hydrology N\xc2\xb083
-""" # noqa
- result = self.pkginfo_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "snowpyt",
- "description": "foo\nHydrology N°83",
- }
-
- def test_compute_metadata_pkginfo_keywords(self):
- raw_content = b"""\
-Metadata-Version: 2.1
-Name: foo
-Keywords: foo bar baz
-""" # noqa
- result = self.pkginfo_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "foo",
- "keywords": ["foo", "bar", "baz"],
- }
-
- def test_compute_metadata_pkginfo_license(self):
- raw_content = b"""\
-Metadata-Version: 2.1
-Name: foo
-License: MIT
-""" # noqa
- result = self.pkginfo_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "foo",
- "license": "MIT",
- }
-
- def test_gemspec_base(self):
- raw_content = b"""
-Gem::Specification.new do |s|
- s.name = 'example'
- s.version = '0.1.0'
- s.licenses = ['MIT']
- s.summary = "This is an example!"
- s.description = "Much longer explanation of the example!"
- s.authors = ["Ruby Coder"]
- s.email = 'rubycoder@example.com'
- s.files = ["lib/example.rb"]
- s.homepage = 'https://rubygems.org/gems/example'
- s.metadata = { "source_code_uri" => "https://github.com/example/example" }
-end"""
- result = self.gemspec_mapping.translate(raw_content)
- assert set(result.pop("description")) == {
- "This is an example!",
- "Much longer explanation of the example!",
- }
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "author": [{"type": "Person", "name": "Ruby Coder"}],
- "name": "example",
- "license": "https://spdx.org/licenses/MIT",
- "codeRepository": "https://rubygems.org/gems/example",
- "email": "rubycoder@example.com",
- "version": "0.1.0",
- }
-
- def test_gemspec_two_author_fields(self):
- raw_content = b"""
-Gem::Specification.new do |s|
- s.authors = ["Ruby Coder1"]
- s.author = "Ruby Coder2"
-end"""
- result = self.gemspec_mapping.translate(raw_content)
- assert result.pop("author") in (
- [
- {"type": "Person", "name": "Ruby Coder1"},
- {"type": "Person", "name": "Ruby Coder2"},
- ],
- [
- {"type": "Person", "name": "Ruby Coder2"},
- {"type": "Person", "name": "Ruby Coder1"},
- ],
- )
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- }
-
- def test_gemspec_invalid_author(self):
- raw_content = b"""
-Gem::Specification.new do |s|
- s.author = ["Ruby Coder"]
-end"""
- result = self.gemspec_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- }
- raw_content = b"""
-Gem::Specification.new do |s|
- s.author = "Ruby Coder1",
-end"""
- result = self.gemspec_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- }
- raw_content = b"""
-Gem::Specification.new do |s|
- s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
-end"""
- result = self.gemspec_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "author": [{"type": "Person", "name": "Ruby Coder1"}],
- }
-
- def test_gemspec_alternative_header(self):
- raw_content = b"""
-require './lib/version'
-
-Gem::Specification.new { |s|
- s.name = 'rb-system-with-aliases'
- s.summary = 'execute system commands with aliases'
-}
-"""
- result = self.gemspec_mapping.translate(raw_content)
- assert result == {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "rb-system-with-aliases",
- "description": "execute system commands with aliases",
- }
-
- @settings(suppress_health_check=[HealthCheck.too_slow])
- @given(json_document_strategy(keys=list(NpmMapping.mapping)))
- def test_npm_adversarial(self, doc):
- raw = json.dumps(doc).encode()
- self.npm_mapping.translate(raw)
-
- @settings(suppress_health_check=[HealthCheck.too_slow])
- @given(json_document_strategy(keys=CODEMETA_TERMS))
- def test_codemeta_adversarial(self, doc):
- raw = json.dumps(doc).encode()
- self.codemeta_mapping.translate(raw)
-
- @settings(suppress_health_check=[HealthCheck.too_slow])
- @given(
- xml_document_strategy(
- keys=list(MavenMapping.mapping),
- root="project",
- xmlns="http://maven.apache.org/POM/4.0.0",
- )
- )
- def test_maven_adversarial(self, doc):
- self.maven_mapping.translate(doc)
-
- @settings(suppress_health_check=[HealthCheck.too_slow])
- @given(
- strategies.dictionaries(
- # keys
- strategies.one_of(
- strategies.text(), *map(strategies.just, GemspecMapping.mapping)
- ),
- # values
- strategies.recursive(
- strategies.characters(),
- lambda children: strategies.lists(children, min_size=1),
- ),
- )
- )
- def test_gemspec_adversarial(self, doc):
- parts = [b"Gem::Specification.new do |s|\n"]
- for (k, v) in doc.items():
- parts.append(" s.{} = {}\n".format(k, repr(v)).encode())
- parts.append(b"end\n")
- self.gemspec_mapping.translate(b"".join(parts))
-
def test_directory_metadata_indexer(self):
metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
diff --git a/swh/indexer/tests/zz_celery/README b/swh/indexer/tests/zz_celery/README
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/zz_celery/README
@@ -0,0 +1,2 @@
+This directory is named "zz_celery" so that pytest runs it last, which prevents
+Celery-related fixtures from interfering with other tests.
diff --git a/swh/indexer/tests/zz_celery/__init__.py b/swh/indexer/tests/zz_celery/__init__.py
new file mode 100644
diff --git a/swh/indexer/tests/test_tasks.py b/swh/indexer/tests/zz_celery/test_tasks.py
rename from swh/indexer/tests/test_tasks.py
rename to swh/indexer/tests/zz_celery/test_tasks.py