diff --git a/swh/indexer/tests/metadata_dictionary/__init__.py b/swh/indexer/tests/metadata_dictionary/__init__.py new file mode 100644 diff --git a/swh/indexer/tests/metadata_dictionary/test_cff.py b/swh/indexer/tests/metadata_dictionary/test_cff.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_cff.py @@ -0,0 +1,220 @@ +# Copyright (C) 2017-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.indexer.metadata_dictionary import MAPPINGS + + +def test_compute_metadata_cff(): + """ + testing CITATION.cff translation + """ + content = """# YAML 1.2 +--- +abstract: "Command line program to convert from Citation File \ +Format to various other formats such as BibTeX, EndNote, RIS, \ +schema.org, CodeMeta, and .zenodo.json." +authors: + - + affiliation: "Netherlands eScience Center" + family-names: Klaver + given-names: Tom + - + affiliation: "Humboldt-Universität zu Berlin" + family-names: Druskat + given-names: Stephan + orcid: https://orcid.org/0000-0003-4925-7248 +cff-version: "1.0.3" +date-released: 2019-11-12 +doi: 10.5281/zenodo.1162057 +keywords: + - "citation" + - "bibliography" + - "cff" + - "CITATION.cff" +license: Apache-2.0 +message: "If you use this software, please cite it using these metadata." +license: Apache-2.0 +message: "If you use this software, please cite it using these metadata." 
+repository-code: "https://github.com/citation-file-format/cff-converter-python" +title: cffconvert +version: "1.4.0-alpha0" + """.encode( + "utf-8" + ) + + expected = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [ + { + "type": "Person", + "affiliation": { + "type": "Organization", + "name": "Netherlands eScience Center", + }, + "familyName": "Klaver", + "givenName": "Tom", + }, + { + "id": "https://orcid.org/0000-0003-4925-7248", + "type": "Person", + "affiliation": { + "type": "Organization", + "name": "Humboldt-Universität zu Berlin", + }, + "familyName": "Druskat", + "givenName": "Stephan", + }, + ], + "codeRepository": ( + "https://github.com/citation-file-format/cff-converter-python" + ), + "datePublished": "2019-11-12", + "description": """Command line program to convert from \ +Citation File Format to various other formats such as BibTeX, EndNote, \ +RIS, schema.org, CodeMeta, and .zenodo.json.""", + "identifier": "https://doi.org/10.5281/zenodo.1162057", + "keywords": ["citation", "bibliography", "cff", "CITATION.cff"], + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "1.4.0-alpha0", + } + + result = MAPPINGS["CffMapping"]().translate(content) + assert expected == result + + +def test_compute_metadata_cff_invalid_yaml(): + """ + test yaml translation for invalid yaml file + """ + content = """cff-version: 1.0.3 +message: To cite the SigMF specification, please include the following: +authors: + - name: The GNU Radio Foundation, Inc. 
+ """.encode( + "utf-8" + ) + + expected = None + + result = MAPPINGS["CffMapping"]().translate(content) + assert expected == result + + +def test_compute_metadata_cff_empty(): + """ + test yaml translation for empty yaml file + """ + content = """ + """.encode( + "utf-8" + ) + + expected = None + + result = MAPPINGS["CffMapping"]().translate(content) + assert expected == result + + +def test_compute_metadata_cff_list(): + """ + test yaml translation for a yaml file whose top-level value is a list + """ + content = """ +- Foo +- Bar + """.encode( + "utf-8" + ) + + expected = None + + result = MAPPINGS["CffMapping"]().translate(content) + assert expected == result + + +def test_cff_empty_fields(): + """ + testing CITATION.cff translation when some author fields are empty + """ + content = """# YAML 1.2 + authors: + - + affiliation: "Hogwarts" + family-names: + given-names: Harry + - + affiliation: "Ministry of Magic" + family-names: Weasley + orcid: + given-names: Arthur + + + """.encode( + "utf-8" + ) + + expected = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [ + { + "type": "Person", + "affiliation": { + "type": "Organization", + "name": "Hogwarts", + }, + "givenName": "Harry", + }, + { + "type": "Person", + "affiliation": { + "type": "Organization", + "name": "Ministry of Magic", + }, + "familyName": "Weasley", + "givenName": "Arthur", + }, + ], + } + + result = MAPPINGS["CffMapping"]().translate(content) + assert expected == result + + +def test_cff_invalid_fields(): + """ + testing CITATION.cff translation when an author field holds a list + """ + content = """# YAML 1.2 + authors: + - + affiliation: "Hogwarts" + family-names: + - Potter + - James + given-names: Harry + + """.encode( + "utf-8" + ) + + expected = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [ + { + "type": "Person", + "affiliation": { + "type": "Organization", + "name": "Hogwarts", + }, + "givenName": "Harry", + }, + ], + } + + result = 
MAPPINGS["CffMapping"]().translate(content) + assert expected == result diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -0,0 +1,175 @@ +# Copyright (C) 2017-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json + +from hypothesis import HealthCheck, given, settings + +from swh.indexer.codemeta import CODEMETA_TERMS +from swh.indexer.metadata_detector import detect_metadata +from swh.indexer.metadata_dictionary import MAPPINGS + +from ..utils import json_document_strategy + + +def test_compute_metadata_valid_codemeta(): + raw_content = b"""{ + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "@type": "SoftwareSourceCode", + "identifier": "CodeMeta", + "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", + "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", + "codeRepository": "https://github.com/codemeta/codemeta", + "issueTracker": "https://github.com/codemeta/codemeta/issues", + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "2.0", + "author": [ + { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + { + "@type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", + "email": "jones@nceas.ucsb.edu", + "@id": "http://orcid.org/0000-0003-0077-4738" + } + ], + "maintainer": { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": 
"http://orcid.org/0000-0002-1642-628X" + }, + "contIntegration": "https://travis-ci.org/codemeta/codemeta", + "developmentStatus": "active", + "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", + "funder": { + "@id": "https://doi.org/10.13039/100000001", + "@type": "Organization", + "name": "National Science Foundation" + }, + "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", + "keywords": [ + "metadata", + "software" + ], + "version":"2.0", + "dateCreated":"2017-06-05", + "datePublished":"2017-06-05", + "programmingLanguage": "JSON-LD" + }""" # noqa + expected_result = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "identifier": "CodeMeta", + "description": "CodeMeta is a concept vocabulary that can " + "be used to standardize the exchange of software metadata " + "across repositories and organizations.", + "name": "CodeMeta: Minimal metadata schemas for science " + "software and code, in JSON-LD", + "codeRepository": "https://github.com/codemeta/codemeta", + "issueTracker": "https://github.com/codemeta/codemeta/issues", + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "2.0", + "author": [ + { + "type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "id": "http://orcid.org/0000-0002-1642-628X", + }, + { + "type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", + "email": "jones@nceas.ucsb.edu", + "id": "http://orcid.org/0000-0003-0077-4738", + }, + ], + "maintainer": { + "type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "id": "http://orcid.org/0000-0002-1642-628X", + }, + "contIntegration": "https://travis-ci.org/codemeta/codemeta", + "developmentStatus": "active", + "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", + "funder": { + "id": "https://doi.org/10.13039/100000001", + "type": "Organization", + 
"name": "National Science Foundation", + }, + "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " + "in Scientific Software", + "keywords": ["metadata", "software"], + "version": "2.0", + "dateCreated": "2017-06-05", + "datePublished": "2017-06-05", + "programmingLanguage": "JSON-LD", + } + result = MAPPINGS["CodemetaMapping"]().translate(raw_content) + assert result == expected_result + + +def test_compute_metadata_codemeta_alternate_context(): + raw_content = b"""{ + "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", + "@type": "SoftwareSourceCode", + "identifier": "CodeMeta" + }""" # noqa + expected_result = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "identifier": "CodeMeta", + } + result = MAPPINGS["CodemetaMapping"]().translate(raw_content) + assert result == expected_result + + +@settings(suppress_health_check=[HealthCheck.too_slow]) +@given(json_document_strategy(keys=CODEMETA_TERMS)) +def test_codemeta_adversarial(doc): + raw = json.dumps(doc).encode() + MAPPINGS["CodemetaMapping"]().translate(raw) + + +def test_detect_metadata_codemeta_json_uppercase(): + df = [ + { + "sha1_git": b"abc", + "name": b"index.html", + "target": b"abc", + "length": 897, + "status": "visible", + "type": "file", + "perms": 33188, + "dir_id": b"dir_a", + "sha1": b"bcd", + }, + { + "sha1_git": b"aab", + "name": b"CODEMETA.json", + "target": b"aab", + "length": 712, + "status": "visible", + "type": "file", + "perms": 33188, + "dir_id": b"dir_a", + "sha1": b"bcd", + }, + ] + results = detect_metadata(df) + + expected_results = {"CodemetaMapping": [b"bcd"]} + assert expected_results == results diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_maven.py @@ -0,0 +1,365 @@ +# Copyright (C) 2017-2022 The Software Heritage developers +# 
See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging + +from hypothesis import HealthCheck, given, settings + +from swh.indexer.metadata_dictionary import MAPPINGS + +from ..utils import xml_document_strategy + + +def test_compute_metadata_maven(): + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + + + central + Maven Repository Switchboard + default + http://repo1.maven.org/maven2 + + false + + + + + + Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", + "codeRepository": ("http://repo1.maven.org/maven2/com/mycompany/app/my-app"), + } + + +def test_compute_metadata_maven_empty(): + raw_content = b""" + + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } + + +def test_compute_metadata_maven_almost_empty(): + raw_content = b""" + + + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } + + +def test_compute_metadata_maven_invalid_xml(caplog): + expected_warning = ( + "swh.indexer.metadata_dictionary.maven.MavenMapping", + logging.WARNING, + "Error parsing XML from foo", + ) + caplog.set_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") + + raw_content = b""" + """ + caplog.clear() + result = 
MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples == [expected_warning], result + assert result is None + + raw_content = b""" + """ + caplog.clear() + result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples == [expected_warning], result + assert result is None + + +def test_compute_metadata_maven_unknown_encoding(caplog): + expected_warning = ( + "swh.indexer.metadata_dictionary.maven.MavenMapping", + logging.WARNING, + "Error detecting XML encoding from foo", + ) + caplog.set_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") + + raw_content = b""" + + """ + caplog.clear() + result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples == [expected_warning], result + assert result is None + + raw_content = b""" + + """ + caplog.clear() + result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples == [expected_warning], result + assert result is None + + +def test_compute_metadata_maven_invalid_encoding(caplog): + expected_warning = [ + # libexpat1 <= 2.2.10-2+deb11u1 + [ + ( + "swh.indexer.metadata_dictionary.maven.MavenMapping", + logging.WARNING, + "Error unidecoding XML from foo", + ) + ], + # libexpat1 >= 2.2.10-2+deb11u2 + [ + ( + "swh.indexer.metadata_dictionary.maven.MavenMapping", + logging.WARNING, + "Error parsing XML from foo", + ) + ], + ] + caplog.set_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") + + raw_content = b""" + + """ + caplog.clear() + result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples in expected_warning, result + assert result is None + + +def test_compute_metadata_maven_minimal(): + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": 
"SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } + + +def test_compute_metadata_maven_empty_nodes(): + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + + + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } + + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } + + raw_content = b""" + + + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } + + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + + + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + 
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } + + raw_content = b""" + + + 1.2.3 + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "version": "1.2.3", + } + + +def test_compute_metadata_maven_invalid_licenses(): + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + + foo + + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } + + +def test_compute_metadata_maven_multiple(): + """Tests when there are multiple code repos and licenses.""" + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + + + central + Maven Repository Switchboard + default + http://repo1.maven.org/maven2 + + false + + + + example + Example Maven Repo + default + http://example.org/maven2 + + + + + Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + + MIT license + https://opensource.org/licenses/MIT + + + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "license": [ + "https://www.apache.org/licenses/LICENSE-2.0.txt", + "https://opensource.org/licenses/MIT", + ], + "codeRepository": [ + "http://repo1.maven.org/maven2/com/mycompany/app/my-app", + "http://example.org/maven2/com/mycompany/app/my-app", + ], + } + + 
+@settings(suppress_health_check=[HealthCheck.too_slow]) +@given( + xml_document_strategy( + keys=list(MAPPINGS["MavenMapping"].mapping), # type: ignore + root="project", + xmlns="http://maven.apache.org/POM/4.0.0", + ) +) +def test_maven_adversarial(doc): + MAPPINGS["MavenMapping"]().translate(doc) diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -0,0 +1,322 @@ +# Copyright (C) 2017-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json + +from hypothesis import HealthCheck, given, settings +import pytest + +from swh.indexer.metadata_detector import detect_metadata +from swh.indexer.metadata_dictionary import MAPPINGS +from swh.indexer.storage.model import ContentMetadataRow +from swh.model.hashutil import hash_to_bytes + +from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer +from ..utils import ( + BASE_TEST_CONFIG, + fill_obj_storage, + fill_storage, + json_document_strategy, +) + + +def test_compute_metadata_none(): + """ + testing that translating empty content yields no metadata: + should return None + """ + content = b"" + + # None if no metadata was found or an error occurred + declared_metadata = None + result = MAPPINGS["NpmMapping"]().translate(content) + assert declared_metadata == result + + +def test_compute_metadata_npm(): + """ + testing only computation of metadata with hard_mapping_npm + """ + content = b""" + { + "name": "test_metadata", + "version": "0.0.2", + "description": "Simple package.json test for indexer", + "repository": { + "type": "git", + "url": "https://github.com/moranegg/metadata_test" + }, + "author": { + "email": "moranegg@example.com", + "name": "Morane G" + } + } + """ + 
declared_metadata = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "test_metadata", + "version": "0.0.2", + "description": "Simple package.json test for indexer", + "codeRepository": "git+https://github.com/moranegg/metadata_test", + "author": [ + { + "type": "Person", + "name": "Morane G", + "email": "moranegg@example.com", + } + ], + } + + result = MAPPINGS["NpmMapping"]().translate(content) + assert declared_metadata == result + + +def test_compute_metadata_invalid_description_npm(): + """ + testing only computation of metadata with hard_mapping_npm + """ + content = b""" + { + "name": "test_metadata", + "version": "0.0.2", + "description": 1234 + } + """ + declared_metadata = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "test_metadata", + "version": "0.0.2", + } + + result = MAPPINGS["NpmMapping"]().translate(content) + assert declared_metadata == result + + +def test_index_content_metadata_npm(): + """ + testing NPM with package.json + - one sha1 uses a file that can't be translated to metadata and + should return None in the translated metadata + """ + sha1s = [ + hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), + hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), + hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"), + ] + # this metadata indexer computes only metadata for package.json + # in npm context with a hard mapping + config = BASE_TEST_CONFIG.copy() + config["tools"] = [TRANSLATOR_TOOL] + metadata_indexer = ContentMetadataTestIndexer(config=config) + fill_obj_storage(metadata_indexer.objstorage) + fill_storage(metadata_indexer.storage) + + metadata_indexer.run(sha1s) + results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) + + expected_results = [ + ContentMetadataRow( + id=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), + tool=TRANSLATOR_TOOL, + metadata={ + "@context": 
"https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "codeRepository": "git+https://github.com/moranegg/metadata_test", + "description": "Simple package.json test for indexer", + "name": "test_metadata", + "version": "0.0.1", + }, + ), + ContentMetadataRow( + id=hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), + tool=TRANSLATOR_TOOL, + metadata={ + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "issueTracker": "https://github.com/npm/npm/issues", + "author": [ + { + "type": "Person", + "name": "Isaac Z. Schlueter", + "email": "i@izs.me", + "url": "http://blog.izs.me", + } + ], + "codeRepository": "git+https://github.com/npm/npm", + "description": "a package manager for JavaScript", + "license": "https://spdx.org/licenses/Artistic-2.0", + "version": "5.0.3", + "name": "npm", + "keywords": [ + "install", + "modules", + "package manager", + "package.json", + ], + "url": "https://docs.npmjs.com/", + }, + ), + ] + + for result in results: + del result.tool["id"] + + # The assertion below returns False sometimes because of nested lists + assert expected_results == results + + +def test_npm_bugs_normalization(): + # valid dictionary + package_json = b"""{ + "name": "foo", + "bugs": { + "url": "https://github.com/owner/project/issues", + "email": "foo@example.com" + } + }""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "issueTracker": "https://github.com/owner/project/issues", + "type": "SoftwareSourceCode", + } + + # "invalid" dictionary + package_json = b"""{ + "name": "foo", + "bugs": { + "email": "foo@example.com" + } + }""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "type": "SoftwareSourceCode", + } + + # string + package_json = b"""{ + "name": "foo", + 
"bugs": "https://github.com/owner/project/issues" + }""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "issueTracker": "https://github.com/owner/project/issues", + "type": "SoftwareSourceCode", + } + + +def test_npm_repository_normalization(): + # normal + package_json = b"""{ + "name": "foo", + "repository": { + "type" : "git", + "url" : "https://github.com/npm/cli.git" + } + }""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "codeRepository": "git+https://github.com/npm/cli.git", + "type": "SoftwareSourceCode", + } + + # missing url + package_json = b"""{ + "name": "foo", + "repository": { + "type" : "git" + } + }""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "type": "SoftwareSourceCode", + } + + # github shortcut + package_json = b"""{ + "name": "foo", + "repository": "github:npm/cli" + }""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + expected_result = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "codeRepository": "git+https://github.com/npm/cli.git", + "type": "SoftwareSourceCode", + } + assert result == expected_result + + # github shortshortcut + package_json = b"""{ + "name": "foo", + "repository": "npm/cli" + }""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == expected_result + + # gitlab shortcut + package_json = b"""{ + "name": "foo", + "repository": "gitlab:user/repo" + }""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "codeRepository": "git+https://gitlab.com/user/repo.git", + "type": "SoftwareSourceCode", + } + 
+ +@settings(suppress_health_check=[HealthCheck.too_slow]) +@given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping))) # type: ignore +def test_npm_adversarial(doc): + raw = json.dumps(doc).encode() + MAPPINGS["NpmMapping"]().translate(raw) + + +@pytest.mark.parametrize( + "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"] +) +def test_detect_metadata_package_json(filename): + df = [ + { + "sha1_git": b"abc", + "name": b"index.js", + "target": b"abc", + "length": 897, + "status": "visible", + "type": "file", + "perms": 33188, + "dir_id": b"dir_a", + "sha1": b"bcd", + }, + { + "sha1_git": b"aab", + "name": filename, + "target": b"aab", + "length": 712, + "status": "visible", + "type": "file", + "perms": 33188, + "dir_id": b"dir_a", + "sha1": b"cde", + }, + ] + results = detect_metadata(df) + + expected_results = {"NpmMapping": [b"cde"]} + assert expected_results == results diff --git a/swh/indexer/tests/metadata_dictionary/test_python.py b/swh/indexer/tests/metadata_dictionary/test_python.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_python.py @@ -0,0 +1,114 @@ +# Copyright (C) 2017-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.indexer.metadata_dictionary import MAPPINGS + + +def test_compute_metadata_pkginfo(): + raw_content = b"""\ +Metadata-Version: 2.1 +Name: swh.core +Version: 0.0.49 +Summary: Software Heritage core utilities +Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ +Author: Software Heritage developers +Author-email: swh-devel@inria.fr +License: UNKNOWN +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, https://www.softwareheritage.org/donate +Project-URL: Source, https://forge.softwareheritage.org/source/swh-core 
+Description: swh-core + ======== + \x20 + core library for swh's modules: + - config parser + - hash computations + - serialization + - logging mechanism + \x20 +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) +Classifier: Operating System :: OS Independent +Classifier: Development Status :: 5 - Production/Stable +Description-Content-Type: text/markdown +Provides-Extra: testing +""" # noqa + result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content) + assert result["description"] == [ + "Software Heritage core utilities", # note the comma here + "swh-core\n" + "========\n" + "\n" + "core library for swh's modules:\n" + "- config parser\n" + "- hash computations\n" + "- serialization\n" + "- logging mechanism\n" + "", + ], result + del result["description"] + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "url": "https://forge.softwareheritage.org/diffusion/DCORE/", + "name": "swh.core", + "author": [ + { + "type": "Person", + "name": "Software Heritage developers", + "email": "swh-devel@inria.fr", + } + ], + "version": "0.0.49", + } + + +def test_compute_metadata_pkginfo_utf8(): + raw_content = b"""\ +Metadata-Version: 1.1 +Name: snowpyt +Description-Content-Type: UNKNOWN +Description: foo + Hydrology N\xc2\xb083 +""" # noqa + result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "snowpyt", + "description": "foo\nHydrology N°83", + } + + +def test_compute_metadata_pkginfo_keywords(): + raw_content = b"""\ +Metadata-Version: 2.1 +Name: foo +Keywords: foo bar baz +""" # noqa + result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content) + assert result == { + "@context": 
"https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "foo", + "keywords": ["foo", "bar", "baz"], + } + + +def test_compute_metadata_pkginfo_license(): + raw_content = b"""\ +Metadata-Version: 2.1 +Name: foo +License: MIT +""" # noqa + result = MAPPINGS["PythonPkginfoMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "foo", + "license": "MIT", + } diff --git a/swh/indexer/tests/metadata_dictionary/test_ruby.py b/swh/indexer/tests/metadata_dictionary/test_ruby.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_ruby.py @@ -0,0 +1,134 @@ +# Copyright (C) 2017-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from hypothesis import HealthCheck, given, settings, strategies + +from swh.indexer.metadata_dictionary import MAPPINGS + + +def test_gemspec_base(): + raw_content = b""" +Gem::Specification.new do |s| +s.name = 'example' +s.version = '0.1.0' +s.licenses = ['MIT'] +s.summary = "This is an example!" +s.description = "Much longer explanation of the example!" 
+s.authors = ["Ruby Coder"] +s.email = 'rubycoder@example.com' +s.files = ["lib/example.rb"] +s.homepage = 'https://rubygems.org/gems/example' +s.metadata = { "source_code_uri" => "https://github.com/example/example" } +end""" + result = MAPPINGS["GemspecMapping"]().translate(raw_content) + assert set(result.pop("description")) == { + "This is an example!", + "Much longer explanation of the example!", + } + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"type": "Person", "name": "Ruby Coder"}], + "name": "example", + "license": "https://spdx.org/licenses/MIT", + "codeRepository": "https://rubygems.org/gems/example", + "email": "rubycoder@example.com", + "version": "0.1.0", + } + + +def test_gemspec_two_author_fields(): + raw_content = b""" +Gem::Specification.new do |s| +s.authors = ["Ruby Coder1"] +s.author = "Ruby Coder2" +end""" + result = MAPPINGS["GemspecMapping"]().translate(raw_content) + assert result.pop("author") in ( + [ + {"type": "Person", "name": "Ruby Coder1"}, + {"type": "Person", "name": "Ruby Coder2"}, + ], + [ + {"type": "Person", "name": "Ruby Coder2"}, + {"type": "Person", "name": "Ruby Coder1"}, + ], + ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } + + +def test_gemspec_invalid_author(): + raw_content = b""" +Gem::Specification.new do |s| +s.author = ["Ruby Coder"] +end""" + result = MAPPINGS["GemspecMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } + raw_content = b""" +Gem::Specification.new do |s| +s.author = "Ruby Coder1", +end""" + result = MAPPINGS["GemspecMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } + raw_content = b""" +Gem::Specification.new do |s| +s.authors = ["Ruby Coder1", 
["Ruby Coder2"]] +end""" + result = MAPPINGS["GemspecMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"type": "Person", "name": "Ruby Coder1"}], + } + + +def test_gemspec_alternative_header(): + raw_content = b""" +require './lib/version' + +Gem::Specification.new { |s| +s.name = 'rb-system-with-aliases' +s.summary = 'execute system commands with aliases' +} +""" + result = MAPPINGS["GemspecMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "rb-system-with-aliases", + "description": "execute system commands with aliases", + } + + +@settings(suppress_health_check=[HealthCheck.too_slow]) +@given( + strategies.dictionaries( + # keys + strategies.one_of( + strategies.text(), + *map(strategies.just, MAPPINGS["GemspecMapping"].mapping), # type: ignore + ), + # values + strategies.recursive( + strategies.characters(), + lambda children: strategies.lists(children, min_size=1), + ), + ) +) +def test_gemspec_adversarial(doc): + parts = [b"Gem::Specification.new do |s|\n"] + for (k, v) in doc.items(): + parts.append(" s.{} = {}\n".format(k, repr(v)).encode()) + parts.append(b"end\n") + MAPPINGS["GemspecMapping"]().translate(b"".join(parts)) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -3,22 +3,9 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json -import logging - -from hypothesis import HealthCheck, given, settings, strategies -import pytest - -from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer -from swh.indexer.metadata_detector import detect_metadata -from 
swh.indexer.metadata_dictionary import MAPPINGS -from swh.indexer.metadata_dictionary.maven import MavenMapping -from swh.indexer.metadata_dictionary.npm import NpmMapping -from swh.indexer.metadata_dictionary.ruby import GemspecMapping from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow from swh.indexer.tests.utils import DIRECTORY2 -from swh.model.hashutil import hash_to_bytes from swh.model.model import Directory, DirectoryEntry from .utils import ( @@ -26,8 +13,6 @@ YARN_PARSER_METADATA, fill_obj_storage, fill_storage, - json_document_strategy, - xml_document_strategy, ) TRANSLATOR_TOOL = { @@ -57,1234 +42,6 @@ Tests metadata_mock_tool tool for Metadata detection """ - def setup_method(self): - self.npm_mapping = MAPPINGS["NpmMapping"]() - self.codemeta_mapping = MAPPINGS["CodemetaMapping"]() - self.maven_mapping = MAPPINGS["MavenMapping"]() - self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]() - self.gemspec_mapping = MAPPINGS["GemspecMapping"]() - self.cff_mapping = MAPPINGS["CffMapping"]() - - def test_compute_metadata_none(self): - """ - testing content empty content is empty - should return None - """ - content = b"" - - # None if no metadata was found or an error occurred - declared_metadata = None - result = self.npm_mapping.translate(content) - assert declared_metadata == result - - def test_compute_metadata_cff(self): - """ - testing CITATION.cff translation - """ - content = """# YAML 1.2 ---- -abstract: "Command line program to convert from Citation File \ -Format to various other formats such as BibTeX, EndNote, RIS, \ -schema.org, CodeMeta, and .zenodo.json." 
-authors: - - - affiliation: "Netherlands eScience Center" - family-names: Klaver - given-names: Tom - - - affiliation: "Humboldt-Universität zu Berlin" - family-names: Druskat - given-names: Stephan - orcid: https://orcid.org/0000-0003-4925-7248 -cff-version: "1.0.3" -date-released: 2019-11-12 -doi: 10.5281/zenodo.1162057 -keywords: - - "citation" - - "bibliography" - - "cff" - - "CITATION.cff" -license: Apache-2.0 -message: "If you use this software, please cite it using these metadata." -repository-code: "https://github.com/citation-file-format/cff-converter-python" -title: cffconvert -version: "1.4.0-alpha0" - """.encode( - "utf-8" - ) - - expected = { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "author": [ - { - "type": "Person", - "affiliation": { - "type": "Organization", - "name": "Netherlands eScience Center", - }, - "familyName": "Klaver", - "givenName": "Tom", - }, - { - "id": "https://orcid.org/0000-0003-4925-7248", - "type": "Person", - "affiliation": { - "type": "Organization", - "name": "Humboldt-Universität zu Berlin", - }, - "familyName": "Druskat", - "givenName": "Stephan", - }, - ], - "codeRepository": ( - "https://github.com/citation-file-format/cff-converter-python" - ), - "datePublished": "2019-11-12", - "description": """Command line program to convert from \ -Citation File Format to various other formats such as BibTeX, EndNote, \ -RIS, schema.org, CodeMeta, and .zenodo.json.""", - "identifier": "https://doi.org/10.5281/zenodo.1162057", - "keywords": ["citation", "bibliography", "cff", "CITATION.cff"], - "license": "https://spdx.org/licenses/Apache-2.0", - "version": "1.4.0-alpha0", - } - - result = self.cff_mapping.translate(content) - assert expected == result - - def test_compute_metadata_cff_invalid_yaml(self): - """ - test yaml translation for invalid yaml file - """ - content = """cff-version: 1.0.3 -message: To cite the SigMF specification, please include the following: -authors: - - 
name: The GNU Radio Foundation, Inc. - """.encode( - "utf-8" - ) - - expected = None - - result = self.cff_mapping.translate(content) - assert expected == result - - def test_compute_metadata_cff_empty(self): - """ - test yaml translation for empty yaml file - """ - content = """ - """.encode( - "utf-8" - ) - - expected = None - - result = self.cff_mapping.translate(content) - assert expected == result - - def test_compute_metadata_cff_list(self): - """ - test yaml translation for empty yaml file - """ - content = """ -- Foo -- Bar - """.encode( - "utf-8" - ) - - expected = None - - result = self.cff_mapping.translate(content) - assert expected == result - - def test_cff_empty_fields(self): - """ - testing CITATION.cff translation - """ - content = """# YAML 1.2 - authors: - - - affiliation: "Hogwarts" - family-names: - given-names: Harry - - - affiliation: "Ministry of Magic" - family-names: Weasley - orcid: - given-names: Arthur - """.encode( - "utf-8" - ) - - expected = { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "author": [ - { - "type": "Person", - "affiliation": { - "type": "Organization", - "name": "Hogwarts", - }, - "givenName": "Harry", - }, - { - "type": "Person", - "affiliation": { - "type": "Organization", - "name": "Ministry of Magic", - }, - "familyName": "Weasley", - "givenName": "Arthur", - }, - ], - } - - result = self.cff_mapping.translate(content) - assert expected == result - - def test_cff_invalid_fields(self): - """ - testing CITATION.cff translation - """ - content = """# YAML 1.2 - authors: - - - affiliation: "Hogwarts" - family-names: - - Potter - - James - given-names: Harry - """.encode( - "utf-8" - ) - - expected = { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "author": [ - { - "type": "Person", - "affiliation": { - "type": "Organization", - "name": "Hogwarts", - }, - "givenName": "Harry", - }, - ], - } - - result = 
self.cff_mapping.translate(content) - assert expected == result - - def test_compute_metadata_npm(self): - """ - testing only computation of metadata with hard_mapping_npm - """ - content = b""" - { - "name": "test_metadata", - "version": "0.0.2", - "description": "Simple package.json test for indexer", - "repository": { - "type": "git", - "url": "https://github.com/moranegg/metadata_test" - }, - "author": { - "email": "moranegg@example.com", - "name": "Morane G" - } - } - """ - declared_metadata = { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "test_metadata", - "version": "0.0.2", - "description": "Simple package.json test for indexer", - "codeRepository": "git+https://github.com/moranegg/metadata_test", - "author": [ - { - "type": "Person", - "name": "Morane G", - "email": "moranegg@example.com", - } - ], - } - - result = self.npm_mapping.translate(content) - assert declared_metadata == result - - def test_compute_metadata_invalid_description_npm(self): - """ - testing only computation of metadata with hard_mapping_npm - """ - content = b""" - { - "name": "test_metadata", - "version": "0.0.2", - "description": 1234 - } - """ - declared_metadata = { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "test_metadata", - "version": "0.0.2", - } - - result = self.npm_mapping.translate(content) - assert declared_metadata == result - - def test_index_content_metadata_npm(self): - """ - testing NPM with package.json - - one sha1 uses a file that can't be translated to metadata and - should return None in the translated metadata - """ - sha1s = [ - hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), - hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), - hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"), - ] - # this metadata indexer computes only metadata for package.json - # in npm context with a hard mapping - config = 
BASE_TEST_CONFIG.copy() - config["tools"] = [TRANSLATOR_TOOL] - metadata_indexer = ContentMetadataTestIndexer(config=config) - fill_obj_storage(metadata_indexer.objstorage) - fill_storage(metadata_indexer.storage) - - metadata_indexer.run(sha1s) - results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) - - expected_results = [ - ContentMetadataRow( - id=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), - tool=TRANSLATOR_TOOL, - metadata={ - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "codeRepository": "git+https://github.com/moranegg/metadata_test", - "description": "Simple package.json test for indexer", - "name": "test_metadata", - "version": "0.0.1", - }, - ), - ContentMetadataRow( - id=hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), - tool=TRANSLATOR_TOOL, - metadata={ - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "issueTracker": "https://github.com/npm/npm/issues", - "author": [ - { - "type": "Person", - "name": "Isaac Z. 
Schlueter", - "email": "i@izs.me", - "url": "http://blog.izs.me", - } - ], - "codeRepository": "git+https://github.com/npm/npm", - "description": "a package manager for JavaScript", - "license": "https://spdx.org/licenses/Artistic-2.0", - "version": "5.0.3", - "name": "npm", - "keywords": [ - "install", - "modules", - "package manager", - "package.json", - ], - "url": "https://docs.npmjs.com/", - }, - ), - ] - - for result in results: - del result.tool["id"] - - # The assertion below returns False sometimes because of nested lists - assert expected_results == results - - def test_npm_bugs_normalization(self): - # valid dictionary - package_json = b"""{ - "name": "foo", - "bugs": { - "url": "https://github.com/owner/project/issues", - "email": "foo@example.com" - } - }""" - result = self.npm_mapping.translate(package_json) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "issueTracker": "https://github.com/owner/project/issues", - "type": "SoftwareSourceCode", - } - - # "invalid" dictionary - package_json = b"""{ - "name": "foo", - "bugs": { - "email": "foo@example.com" - } - }""" - result = self.npm_mapping.translate(package_json) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "type": "SoftwareSourceCode", - } - - # string - package_json = b"""{ - "name": "foo", - "bugs": "https://github.com/owner/project/issues" - }""" - result = self.npm_mapping.translate(package_json) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "issueTracker": "https://github.com/owner/project/issues", - "type": "SoftwareSourceCode", - } - - def test_npm_repository_normalization(self): - # normal - package_json = b"""{ - "name": "foo", - "repository": { - "type" : "git", - "url" : "https://github.com/npm/cli.git" - } - }""" - result = self.npm_mapping.translate(package_json) - assert result == { - "@context": 
"https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "codeRepository": "git+https://github.com/npm/cli.git", - "type": "SoftwareSourceCode", - } - - # missing url - package_json = b"""{ - "name": "foo", - "repository": { - "type" : "git" - } - }""" - result = self.npm_mapping.translate(package_json) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "type": "SoftwareSourceCode", - } - - # github shortcut - package_json = b"""{ - "name": "foo", - "repository": "github:npm/cli" - }""" - result = self.npm_mapping.translate(package_json) - expected_result = { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "codeRepository": "git+https://github.com/npm/cli.git", - "type": "SoftwareSourceCode", - } - assert result == expected_result - - # github shortshortcut - package_json = b"""{ - "name": "foo", - "repository": "npm/cli" - }""" - result = self.npm_mapping.translate(package_json) - assert result == expected_result - - # gitlab shortcut - package_json = b"""{ - "name": "foo", - "repository": "gitlab:user/repo" - }""" - result = self.npm_mapping.translate(package_json) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "codeRepository": "git+https://gitlab.com/user/repo.git", - "type": "SoftwareSourceCode", - } - - @pytest.mark.parametrize( - "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"] - ) - def test_detect_metadata_package_json(self, filename): - df = [ - { - "sha1_git": b"abc", - "name": b"index.js", - "target": b"abc", - "length": 897, - "status": "visible", - "type": "file", - "perms": 33188, - "dir_id": b"dir_a", - "sha1": b"bcd", - }, - { - "sha1_git": b"aab", - "name": filename, - "target": b"aab", - "length": 712, - "status": "visible", - "type": "file", - "perms": 33188, - "dir_id": b"dir_a", - "sha1": b"cde", - }, - ] - results = detect_metadata(df) - - expected_results = 
{"NpmMapping": [b"cde"]} - assert expected_results == results - - def test_detect_metadata_codemeta_json_uppercase(self): - df = [ - { - "sha1_git": b"abc", - "name": b"index.html", - "target": b"abc", - "length": 897, - "status": "visible", - "type": "file", - "perms": 33188, - "dir_id": b"dir_a", - "sha1": b"bcd", - }, - { - "sha1_git": b"aab", - "name": b"CODEMETA.json", - "target": b"aab", - "length": 712, - "status": "visible", - "type": "file", - "perms": 33188, - "dir_id": b"dir_a", - "sha1": b"bcd", - }, - ] - results = detect_metadata(df) - - expected_results = {"CodemetaMapping": [b"bcd"]} - assert expected_results == results - - def test_compute_metadata_valid_codemeta(self): - raw_content = b"""{ - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "@type": "SoftwareSourceCode", - "identifier": "CodeMeta", - "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", - "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", - "codeRepository": "https://github.com/codemeta/codemeta", - "issueTracker": "https://github.com/codemeta/codemeta/issues", - "license": "https://spdx.org/licenses/Apache-2.0", - "version": "2.0", - "author": [ - { - "@type": "Person", - "givenName": "Carl", - "familyName": "Boettiger", - "email": "cboettig@gmail.com", - "@id": "http://orcid.org/0000-0002-1642-628X" - }, - { - "@type": "Person", - "givenName": "Matthew B.", - "familyName": "Jones", - "email": "jones@nceas.ucsb.edu", - "@id": "http://orcid.org/0000-0003-0077-4738" - } - ], - "maintainer": { - "@type": "Person", - "givenName": "Carl", - "familyName": "Boettiger", - "email": "cboettig@gmail.com", - "@id": "http://orcid.org/0000-0002-1642-628X" - }, - "contIntegration": "https://travis-ci.org/codemeta/codemeta", - "developmentStatus": "active", - "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", - "funder": { - 
"@id": "https://doi.org/10.13039/100000001", - "@type": "Organization", - "name": "National Science Foundation" - }, - "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", - "keywords": [ - "metadata", - "software" - ], - "version":"2.0", - "dateCreated":"2017-06-05", - "datePublished":"2017-06-05", - "programmingLanguage": "JSON-LD" - }""" # noqa - expected_result = { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "identifier": "CodeMeta", - "description": "CodeMeta is a concept vocabulary that can " - "be used to standardize the exchange of software metadata " - "across repositories and organizations.", - "name": "CodeMeta: Minimal metadata schemas for science " - "software and code, in JSON-LD", - "codeRepository": "https://github.com/codemeta/codemeta", - "issueTracker": "https://github.com/codemeta/codemeta/issues", - "license": "https://spdx.org/licenses/Apache-2.0", - "version": "2.0", - "author": [ - { - "type": "Person", - "givenName": "Carl", - "familyName": "Boettiger", - "email": "cboettig@gmail.com", - "id": "http://orcid.org/0000-0002-1642-628X", - }, - { - "type": "Person", - "givenName": "Matthew B.", - "familyName": "Jones", - "email": "jones@nceas.ucsb.edu", - "id": "http://orcid.org/0000-0003-0077-4738", - }, - ], - "maintainer": { - "type": "Person", - "givenName": "Carl", - "familyName": "Boettiger", - "email": "cboettig@gmail.com", - "id": "http://orcid.org/0000-0002-1642-628X", - }, - "contIntegration": "https://travis-ci.org/codemeta/codemeta", - "developmentStatus": "active", - "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", - "funder": { - "id": "https://doi.org/10.13039/100000001", - "type": "Organization", - "name": "National Science Foundation", - }, - "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " - "in Scientific Software", - "keywords": ["metadata", "software"], - "version": "2.0", - "dateCreated": "2017-06-05", - 
"datePublished": "2017-06-05", - "programmingLanguage": "JSON-LD", - } - result = self.codemeta_mapping.translate(raw_content) - assert result == expected_result - - def test_compute_metadata_codemeta_alternate_context(self): - raw_content = b"""{ - "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", - "@type": "SoftwareSourceCode", - "identifier": "CodeMeta" - }""" # noqa - expected_result = { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "identifier": "CodeMeta", - } - result = self.codemeta_mapping.translate(raw_content) - assert result == expected_result - - def test_compute_metadata_maven(self): - raw_content = b""" - - Maven Default Project - 4.0.0 - com.mycompany.app - my-app - 1.2.3 - - - central - Maven Repository Switchboard - default - http://repo1.maven.org/maven2 - - false - - - - - - Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0.txt - repo - A business-friendly OSS license - - - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", - "codeRepository": ( - "http://repo1.maven.org/maven2/com/mycompany/app/my-app" - ), - } - - def test_compute_metadata_maven_empty(self): - raw_content = b""" - - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - } - - def test_compute_metadata_maven_almost_empty(self): - raw_content = b""" - - - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - } - - def 
test_compute_metadata_maven_invalid_xml(self, caplog): - expected_warning = ( - "swh.indexer.metadata_dictionary.maven.MavenMapping", - logging.WARNING, - "Error parsing XML from foo", - ) - caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") - - raw_content = b""" - """ - caplog.clear() - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - assert caplog.record_tuples == [expected_warning] - assert result is None - - raw_content = b""" - """ - caplog.clear() - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - assert caplog.record_tuples == [expected_warning] - assert result is None - - def test_compute_metadata_maven_unknown_encoding(self, caplog): - expected_warning = ( - "swh.indexer.metadata_dictionary.maven.MavenMapping", - logging.WARNING, - "Error detecting XML encoding from foo", - ) - caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") - - raw_content = b""" - - """ - caplog.clear() - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - assert caplog.record_tuples == [expected_warning] - assert result is None - - raw_content = b""" - - """ - caplog.clear() - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - assert caplog.record_tuples == [expected_warning] - assert result is None - - def test_compute_metadata_maven_invalid_encoding(self, caplog): - expected_warning = [ - # libexpat1 <= 2.2.10-2+deb11u1 - [ - ( - "swh.indexer.metadata_dictionary.maven.MavenMapping", - logging.WARNING, - "Error unidecoding XML from foo", - ) - ], - # libexpat1 >= 2.2.10-2+deb11u2 - [ - ( - "swh.indexer.metadata_dictionary.maven.MavenMapping", - logging.WARNING, - "Error parsing XML from foo", - ) - ], - ] - caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") - - raw_content = b""" - - """ - caplog.clear() - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - assert caplog.record_tuples in expected_warning - assert result is None - - def 
test_compute_metadata_maven_minimal(self): - raw_content = b""" - - Maven Default Project - 4.0.0 - com.mycompany.app - my-app - 1.2.3 - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - } - - def test_compute_metadata_maven_empty_nodes(self): - raw_content = b""" - - Maven Default Project - 4.0.0 - com.mycompany.app - my-app - 1.2.3 - - - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - } - - raw_content = b""" - - Maven Default Project - 4.0.0 - com.mycompany.app - my-app - - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - } - - raw_content = b""" - - - 4.0.0 - com.mycompany.app - my-app - 1.2.3 - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - } - - raw_content = b""" - - Maven Default Project - 4.0.0 - com.mycompany.app - my-app - 1.2.3 - - - """ - result = 
self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - } - - raw_content = b""" - - - 1.2.3 - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "version": "1.2.3", - } - - def test_compute_metadata_maven_invalid_licenses(self): - raw_content = b""" - - Maven Default Project - 4.0.0 - com.mycompany.app - my-app - 1.2.3 - - foo - - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - } - - def test_compute_metadata_maven_multiple(self): - """Tests when there are multiple code repos and licenses.""" - raw_content = b""" - - Maven Default Project - 4.0.0 - com.mycompany.app - my-app - 1.2.3 - - - central - Maven Repository Switchboard - default - http://repo1.maven.org/maven2 - - false - - - - example - Example Maven Repo - default - http://example.org/maven2 - - - - - Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0.txt - repo - A business-friendly OSS license - - - MIT license - https://opensource.org/licenses/MIT - - - """ - result = self.maven_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "license": [ - 
"https://www.apache.org/licenses/LICENSE-2.0.txt", - "https://opensource.org/licenses/MIT", - ], - "codeRepository": [ - "http://repo1.maven.org/maven2/com/mycompany/app/my-app", - "http://example.org/maven2/com/mycompany/app/my-app", - ], - } - - def test_compute_metadata_pkginfo(self): - raw_content = b"""\ -Metadata-Version: 2.1 -Name: swh.core -Version: 0.0.49 -Summary: Software Heritage core utilities -Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ -Author: Software Heritage developers -Author-email: swh-devel@inria.fr -License: UNKNOWN -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest -Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Source, https://forge.softwareheritage.org/source/swh-core -Description: swh-core - ======== - \x20 - core library for swh's modules: - - config parser - - hash computations - - serialization - - logging mechanism - \x20 -Platform: UNKNOWN -Classifier: Programming Language :: Python :: 3 -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) -Classifier: Operating System :: OS Independent -Classifier: Development Status :: 5 - Production/Stable -Description-Content-Type: text/markdown -Provides-Extra: testing -""" # noqa - result = self.pkginfo_mapping.translate(raw_content) - assert result["description"] == [ - "Software Heritage core utilities", # note the comma here - "swh-core\n" - "========\n" - "\n" - "core library for swh's modules:\n" - "- config parser\n" - "- hash computations\n" - "- serialization\n" - "- logging mechanism\n" - "", - ], result - del result["description"] - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "url": "https://forge.softwareheritage.org/diffusion/DCORE/", - "name": "swh.core", - "author": [ - { - "type": "Person", - "name": "Software Heritage developers", - "email": "swh-devel@inria.fr", - } - ], - 
"version": "0.0.49", - } - - def test_compute_metadata_pkginfo_utf8(self): - raw_content = b"""\ -Metadata-Version: 1.1 -Name: snowpyt -Description-Content-Type: UNKNOWN -Description: foo - Hydrology N\xc2\xb083 -""" # noqa - result = self.pkginfo_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "snowpyt", - "description": "foo\nHydrology N°83", - } - - def test_compute_metadata_pkginfo_keywords(self): - raw_content = b"""\ -Metadata-Version: 2.1 -Name: foo -Keywords: foo bar baz -""" # noqa - result = self.pkginfo_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "foo", - "keywords": ["foo", "bar", "baz"], - } - - def test_compute_metadata_pkginfo_license(self): - raw_content = b"""\ -Metadata-Version: 2.1 -Name: foo -License: MIT -""" # noqa - result = self.pkginfo_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "foo", - "license": "MIT", - } - - def test_gemspec_base(self): - raw_content = b""" -Gem::Specification.new do |s| - s.name = 'example' - s.version = '0.1.0' - s.licenses = ['MIT'] - s.summary = "This is an example!" - s.description = "Much longer explanation of the example!" 
- s.authors = ["Ruby Coder"] - s.email = 'rubycoder@example.com' - s.files = ["lib/example.rb"] - s.homepage = 'https://rubygems.org/gems/example' - s.metadata = { "source_code_uri" => "https://github.com/example/example" } -end""" - result = self.gemspec_mapping.translate(raw_content) - assert set(result.pop("description")) == { - "This is an example!", - "Much longer explanation of the example!", - } - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "author": [{"type": "Person", "name": "Ruby Coder"}], - "name": "example", - "license": "https://spdx.org/licenses/MIT", - "codeRepository": "https://rubygems.org/gems/example", - "email": "rubycoder@example.com", - "version": "0.1.0", - } - - def test_gemspec_two_author_fields(self): - raw_content = b""" -Gem::Specification.new do |s| - s.authors = ["Ruby Coder1"] - s.author = "Ruby Coder2" -end""" - result = self.gemspec_mapping.translate(raw_content) - assert result.pop("author") in ( - [ - {"type": "Person", "name": "Ruby Coder1"}, - {"type": "Person", "name": "Ruby Coder2"}, - ], - [ - {"type": "Person", "name": "Ruby Coder2"}, - {"type": "Person", "name": "Ruby Coder1"}, - ], - ) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - } - - def test_gemspec_invalid_author(self): - raw_content = b""" -Gem::Specification.new do |s| - s.author = ["Ruby Coder"] -end""" - result = self.gemspec_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - } - raw_content = b""" -Gem::Specification.new do |s| - s.author = "Ruby Coder1", -end""" - result = self.gemspec_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - } - raw_content = b""" -Gem::Specification.new do |s| - s.authors = ["Ruby Coder1", ["Ruby Coder2"]] 
-end""" - result = self.gemspec_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "author": [{"type": "Person", "name": "Ruby Coder1"}], - } - - def test_gemspec_alternative_header(self): - raw_content = b""" -require './lib/version' - -Gem::Specification.new { |s| - s.name = 'rb-system-with-aliases' - s.summary = 'execute system commands with aliases' -} -""" - result = self.gemspec_mapping.translate(raw_content) - assert result == { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "rb-system-with-aliases", - "description": "execute system commands with aliases", - } - - @settings(suppress_health_check=[HealthCheck.too_slow]) - @given(json_document_strategy(keys=list(NpmMapping.mapping))) - def test_npm_adversarial(self, doc): - raw = json.dumps(doc).encode() - self.npm_mapping.translate(raw) - - @settings(suppress_health_check=[HealthCheck.too_slow]) - @given(json_document_strategy(keys=CODEMETA_TERMS)) - def test_codemeta_adversarial(self, doc): - raw = json.dumps(doc).encode() - self.codemeta_mapping.translate(raw) - - @settings(suppress_health_check=[HealthCheck.too_slow]) - @given( - xml_document_strategy( - keys=list(MavenMapping.mapping), - root="project", - xmlns="http://maven.apache.org/POM/4.0.0", - ) - ) - def test_maven_adversarial(self, doc): - self.maven_mapping.translate(doc) - - @settings(suppress_health_check=[HealthCheck.too_slow]) - @given( - strategies.dictionaries( - # keys - strategies.one_of( - strategies.text(), *map(strategies.just, GemspecMapping.mapping) - ), - # values - strategies.recursive( - strategies.characters(), - lambda children: strategies.lists(children, min_size=1), - ), - ) - ) - def test_gemspec_adversarial(self, doc): - parts = [b"Gem::Specification.new do |s|\n"] - for (k, v) in doc.items(): - parts.append(" s.{} = {}\n".format(k, repr(v)).encode()) - parts.append(b"end\n") 
- self.gemspec_mapping.translate(b"".join(parts)) - def test_directory_metadata_indexer(self): metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) diff --git a/swh/indexer/tests/zz_celery/README b/swh/indexer/tests/zz_celery/README new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/zz_celery/README @@ -0,0 +1,2 @@ +this directory is named "zz_celery" so pytest runs it last, to prevent +Celery-related fixtures from interfering with other tests diff --git a/swh/indexer/tests/zz_celery/__init__.py b/swh/indexer/tests/zz_celery/__init__.py new file mode 100644 diff --git a/swh/indexer/tests/test_tasks.py b/swh/indexer/tests/zz_celery/test_tasks.py rename from swh/indexer/tests/test_tasks.py rename to swh/indexer/tests/zz_celery/test_tasks.py