diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
index fa85d75..1ba1528 100644
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -1,77 +1,74 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import timedelta
 from unittest.mock import patch

 import pytest

 from swh.objstorage import get_objstorage
 from swh.storage import get_storage

 from swh.indexer.storage import get_indexer_storage

 from .utils import fill_storage, fill_obj_storage

 TASK_NAMES = ["revision_intrinsic_metadata", "origin_intrinsic_metadata"]

-storage_config = {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]}
-
-
 @pytest.fixture
 def indexer_scheduler(swh_scheduler):
     for taskname in TASK_NAMES:
         swh_scheduler.create_task_type(
             {
                 "type": taskname,
                 "description": "The {} indexer testing task".format(taskname),
                 "backend_name": "swh.indexer.tests.tasks.{}".format(taskname),
                 "default_interval": timedelta(days=1),
                 "min_interval": timedelta(hours=6),
                 "max_interval": timedelta(days=12),
                 "num_retries": 3,
             }
         )
     return swh_scheduler


 @pytest.fixture
 def idx_storage():
     """An instance of in-memory indexer storage that gets injected
     into all indexer classes.
     """
     idx_storage = get_indexer_storage("memory", {})
     with patch("swh.indexer.storage.in_memory.IndexerStorage") as idx_storage_mock:
         idx_storage_mock.return_value = idx_storage
         yield idx_storage


 @pytest.fixture
 def storage():
     """An instance of in-memory storage that gets injected
     into all indexer classes.
     """
-    storage = get_storage(**storage_config)
+    storage = get_storage(cls="memory")
     fill_storage(storage)
     with patch("swh.storage.in_memory.InMemoryStorage") as storage_mock:
         storage_mock.return_value = storage
         yield storage


 @pytest.fixture
 def obj_storage():
     """An instance of in-memory objstorage that gets injected
     into all indexer classes.
     """
     objstorage = get_objstorage("memory", {})
     fill_obj_storage(objstorage)
     with patch.dict(
         "swh.objstorage.factory._STORAGE_CLASSES", {"memory": lambda: objstorage}
     ):
         yield objstorage
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index c3ef250..7abb4ed 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,1210 +1,1205 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import json
 import unittest

-import attr
-
 from hypothesis import given, strategies, settings, HealthCheck

 from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Directory, DirectoryEntry, Revision

 from swh.indexer.codemeta import CODEMETA_TERMS
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.metadata_dictionary.maven import MavenMapping
 from swh.indexer.metadata_dictionary.npm import NpmMapping
 from swh.indexer.metadata_dictionary.ruby import GemspecMapping
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.tests.utils import REVISION, DIRECTORY2
+
 from .utils import (
     BASE_TEST_CONFIG,
     fill_obj_storage,
     fill_storage,
     YARN_PARSER_METADATA,
     json_document_strategy,
     xml_document_strategy,
 )

 TRANSLATOR_TOOL = {
     "name": "swh-metadata-translator",
     "version": "0.0.2",
     "configuration": {"type": "local", "context": "NpmMapping"},
 }


 class ContentMetadataTestIndexer(ContentMetadataIndexer):
     """A content metadata indexer whose configuration is enough to satisfy
     the indexing tests.
     """

     def parse_config_file(self, *args, **kwargs):
         assert False, "should not be called; the rev indexer configures it."
 REVISION_METADATA_CONFIG = {
     **BASE_TEST_CONFIG,
     "tools": TRANSLATOR_TOOL,
 }


 class Metadata(unittest.TestCase):
     """
     Tests the metadata translation mappings used for metadata detection
     """

     def setUp(self):
         """
         show the entire diff in test failure messages
         """
         self.maxDiff = None
         self.npm_mapping = MAPPINGS["NpmMapping"]()
         self.codemeta_mapping = MAPPINGS["CodemetaMapping"]()
         self.maven_mapping = MAPPINGS["MavenMapping"]()
         self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]()
         self.gemspec_mapping = MAPPINGS["GemspecMapping"]()

     def test_compute_metadata_none(self):
         """
         an empty content yields None (no metadata found)
         """
         # given
         content = b""

         # None if no metadata was found or an error occurred
         declared_metadata = None

         # when
         result = self.npm_mapping.translate(content)

         # then
         self.assertEqual(declared_metadata, result)

     def test_compute_metadata_npm(self):
         """
         tests only the translation of metadata, with the hard-coded NPM mapping
         """
         # given
         content = b"""
             {
                 "name": "test_metadata",
                 "version": "0.0.2",
                 "description": "Simple package.json test for indexer",
                 "repository": {
                     "type": "git",
                     "url": "https://github.com/moranegg/metadata_test"
                 },
                 "author": {
                     "email": "moranegg@example.com",
                     "name": "Morane G"
                 }
             }
         """
         declared_metadata = {
             "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
             "type": "SoftwareSourceCode",
             "name": "test_metadata",
             "version": "0.0.2",
             "description": "Simple package.json test for indexer",
             "codeRepository": "git+https://github.com/moranegg/metadata_test",
             "author": [
                 {"type": "Person", "name": "Morane G", "email": "moranegg@example.com",}
             ],
         }

         # when
         result = self.npm_mapping.translate(content)

         # then
         self.assertEqual(declared_metadata, result)

     def test_index_content_metadata_npm(self):
         """
         tests the NPM mapping on package.json files; one sha1 points to a
         file that cannot be translated and must yield None as its
         translated metadata
         """
         # given
         sha1s = [
             hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
             hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
             hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"),
         ]
         # this metadata indexer computes only metadata for package.json
         # in npm context with a hard mapping
         config = BASE_TEST_CONFIG.copy()
         config["tools"] = [TRANSLATOR_TOOL]
         metadata_indexer = ContentMetadataTestIndexer(config=config)
         fill_obj_storage(metadata_indexer.objstorage)
         fill_storage(metadata_indexer.storage)

         # when
         metadata_indexer.run(sha1s, policy_update="ignore-dups")
         results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))

         expected_results = [
             {
                 "metadata": {
                     "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                     "type": "SoftwareSourceCode",
                     "codeRepository": "git+https://github.com/moranegg/metadata_test",
                     "description": "Simple package.json test for indexer",
                     "name": "test_metadata",
                     "version": "0.0.1",
                 },
                 "id": hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
             },
             {
                 "metadata": {
                     "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                     "type": "SoftwareSourceCode",
                     "issueTracker": "https://github.com/npm/npm/issues",
                     "author": [
                         {
                             "type": "Person",
                             "name": "Isaac Z.
Schlueter", "email": "i@izs.me", "url": "http://blog.izs.me", } ], "codeRepository": "git+https://github.com/npm/npm", "description": "a package manager for JavaScript", "license": "https://spdx.org/licenses/Artistic-2.0", "version": "5.0.3", "name": "npm", "keywords": [ "install", "modules", "package manager", "package.json", ], "url": "https://docs.npmjs.com/", }, "id": hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), }, ] for result in results: del result["tool"] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", }, ) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = self.npm_mapping.translate(package_json) expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://gitlab.com/user/repo.git", "type": "SoftwareSourceCode", }, ) def test_detect_metadata_package_json(self): # given df = [ { "sha1_git": b"abc", "name": b"index.js", "target": b"abc", "length": 897, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, { "sha1_git": b"aab", "name": b"package.json", "target": b"aab", "length": 712, "status": 
"visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"cde", }, ] # when results = detect_metadata(df) expected_results = {"NpmMapping": [b"cde"]} # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X", }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738", }, ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X", }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation", }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": ["metadata", "software"], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def 
test_compute_metadata_codemeta_alternate_context(self): raw_content = b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", "codeRepository": ( "http://repo1.maven.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error parsing XML from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_unknown_encoding(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error detecting XML encoding from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_invalid_encoding(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error unidecoding XML from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ 
result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_empty_nodes(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "version": "1.2.3", }, ) def test_compute_metadata_maven_invalid_licenses(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 foo """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_multiple(self): """Tests when there are multiple code repos and licenses.""" raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": [ 
"https://www.apache.org/licenses/LICENSE-2.0.txt", "https://opensource.org/licenses/MIT", ], "codeRepository": [ "http://repo1.maven.org/maven2/com/mycompany/app/my-app", "http://example.org/maven2/com/mycompany/app/my-app", ], }, ) def test_compute_metadata_pkginfo(self): raw_content = b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== \x20 core library for swh's modules: - config parser - hash computations - serialization - logging mechanism \x20 Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertCountEqual( result["description"], [ "Software Heritage core utilities", # note the comma here "swh-core\n" "========\n" "\n" "core library for swh's modules:\n" "- config parser\n" "- hash computations\n" "- serialization\n" "- logging mechanism\n" "", ], result, ) del result["description"] self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "url": "https://forge.softwareheritage.org/diffusion/DCORE/", "name": "swh.core", "author": [ { "type": "Person", "name": "Software Heritage developers", "email": "swh-devel@inria.fr", } ], "version": "0.0.49", }, ) def test_compute_metadata_pkginfo_utf8(self): raw_content = b"""\ Metadata-Version: 1.1 Name: snowpyt Description-Content-Type: UNKNOWN Description: foo Hydrology N\xc2\xb083 """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "snowpyt", "description": "foo\nHydrology N°83", }, ) def test_compute_metadata_pkginfo_keywords(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo Keywords: foo bar baz """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "foo", "keywords": ["foo", "bar", "baz"], }, ) def test_compute_metadata_pkginfo_license(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo License: MIT """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "foo", "license": "MIT", }, ) def test_gemspec_base(self): raw_content = b""" Gem::Specification.new do |s| s.name = 'example' s.version = '0.1.0' s.licenses = ['MIT'] s.summary = "This is an example!" s.description = "Much longer explanation of the example!" 
s.authors = ["Ruby Coder"] s.email = 'rubycoder@example.com' s.files = ["lib/example.rb"] s.homepage = 'https://rubygems.org/gems/example' s.metadata = { "source_code_uri" => "https://github.com/example/example" } end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual( result.pop("description"), ["This is an example!", "Much longer explanation of the example!"], ) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"type": "Person", "name": "Ruby Coder"}], "name": "example", "license": "https://spdx.org/licenses/MIT", "codeRepository": "https://rubygems.org/gems/example", "email": "rubycoder@example.com", "version": "0.1.0", }, ) def test_gemspec_two_author_fields(self): raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1"] s.author = "Ruby Coder2" end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual( result.pop("author"), [ {"type": "Person", "name": "Ruby Coder1"}, {"type": "Person", "name": "Ruby Coder2"}, ], ) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_gemspec_invalid_author(self): raw_content = b""" Gem::Specification.new do |s| s.author = ["Ruby Coder"] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) raw_content = b""" Gem::Specification.new do |s| s.author = "Ruby Coder1", end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1", ["Ruby Coder2"]] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"type": "Person", "name": "Ruby Coder1"}], }, ) def test_gemspec_alternative_header(self): raw_content = b""" require './lib/version' Gem::Specification.new { |s| s.name = 'rb-system-with-aliases' s.summary = 'execute system commands with aliases' } """ result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "rb-system-with-aliases", "description": "execute system commands with aliases", }, ) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=list(NpmMapping.mapping))) def test_npm_adversarial(self, doc): raw = json.dumps(doc).encode() self.npm_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=CODEMETA_TERMS)) def test_codemeta_adversarial(self, doc): raw = json.dumps(doc).encode() self.codemeta_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given( xml_document_strategy( keys=list(MavenMapping.mapping), root="project", xmlns="http://maven.apache.org/POM/4.0.0", ) ) def test_maven_adversarial(self, doc): self.maven_mapping.translate(doc) @settings(suppress_health_check=[HealthCheck.too_slow]) @given( strategies.dictionaries( # keys strategies.one_of( strategies.text(), *map(strategies.just, GemspecMapping.mapping) ), # values strategies.recursive( strategies.characters(), lambda children: strategies.lists(children, min_size=1), ), ) ) 
def test_gemspec_adversarial(self, doc): parts = [b"Gem::Specification.new do |s|\n"] for (k, v) in doc.items(): parts.append(" s.{} = {}\n".format(k, repr(v)).encode()) parts.append(b"end\n") self.gemspec_mapping.translate(b"".join(parts)) def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( - {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()} + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None + rev = REVISION + assert rev.directory == DIRECTORY2.id metadata_indexer.idx_storage.content_metadata_add( [ { "indexer_configuration_id": tool["id"], - "id": b"cde", + "id": DIRECTORY2.entries[0].target, "metadata": YARN_PARSER_METADATA, } ] ) - sha1_gits = [ - hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - ] - metadata_indexer.run(sha1_gits, "update-dups") + metadata_indexer.run([rev.id], "update-dups") results = list( - metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits) + metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id]) ) expected_results = [ { - "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), + "id": rev.id, "tool": TRANSLATOR_TOOL, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } ] for result in results: del result["tool"]["id"] # then - self.assertEqual(expected_results, results) + self.assertEqual(results, expected_results) def test_revision_metadata_indexer_single_root_dir(self): metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the revision - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") - rev = metadata_indexer.storage._revisions[rev_id] - subdir_id = rev.directory - rev = attr.evolve(rev, directory=b"123456") - metadata_indexer.storage.directory_add( - [ - { - "id": b"123456", - "entries": [ - { - "name": b"foobar-1.0.0", - "type": "dir", - "target": subdir_id, - "perms": 16384, - } - ], - } - ] + rev = REVISION + assert rev.directory == DIRECTORY2.id + + directory = Directory( + entries=( + DirectoryEntry( + name=b"foobar-1.0.0", type="dir", target=rev.directory, perms=16384, + ), + ), ) + assert directory.id is not None + metadata_indexer.storage.directory_add([directory]) + + new_rev_dict = {**rev.to_dict(), "directory": directory.id} + new_rev_dict.pop("id") + new_rev = Revision.from_dict(new_rev_dict) + metadata_indexer.storage.revision_add([new_rev]) tool = metadata_indexer.idx_storage.indexer_configuration_get( - {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()} + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None metadata_indexer.idx_storage.content_metadata_add( [ { "indexer_configuration_id": tool["id"], - "id": b"cde", + "id": DIRECTORY2.entries[0].target, "metadata": YARN_PARSER_METADATA, } ] ) - sha1_gits = [ - hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - ] - metadata_indexer.run(sha1_gits, "update-dups") + metadata_indexer.run([new_rev.id], "update-dups") results = list( - metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits) + metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id]) ) expected_results = [ { - "id": 
hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), + "id": new_rev.id, "tool": TRANSLATOR_TOOL, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } ] for result in results: del result["tool"]["id"] # then - self.assertEqual(expected_results, results) + self.assertEqual(results, expected_results) diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py index f87cf81..c137dd0 100644 --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -1,191 +1,170 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from datetime import datetime, timezone from swh.model.model import OriginVisit, OriginVisitStatus from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.tests.utils import BASE_TEST_CONFIG, fill_storage from swh.storage.utils import now +from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType + ORIGIN_HEAD_CONFIG = { **BASE_TEST_CONFIG, "tools": {"name": "origin-metadata", "version": "0.0.1", "configuration": {},}, "tasks": {"revision_intrinsic_metadata": None, "origin_intrinsic_metadata": None,}, } class OriginHeadTestIndexer(OriginHeadIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): return ORIGIN_HEAD_CONFIG def persist_index_computations(self, results, policy_update): self.results = results class OriginHead(unittest.TestCase): def setUp(self): self.indexer = OriginHeadTestIndexer() self.indexer.catch_exceptions = False fill_storage(self.indexer.storage) def test_git(self): - self.indexer.run(["https://github.com/SoftwareHeritage/swh-storage"]) + origin_url = "https://github.com/SoftwareHeritage/swh-storage" + self.indexer.run([origin_url]) + rev_id = b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{" - b"\xd7}\xac\xefrm", - "origin_url": "https://github.com/SoftwareHeritage/swh-storage", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) def test_git_partial_snapshot(self): """Checks partial snapshots are ignored.""" origin_url = "https://github.com/SoftwareHeritage/swh-core" - self.indexer.storage.origin_add([{"url": origin_url,}]) + self.indexer.storage.origin_add([Origin(url=origin_url)]) visit = self.indexer.storage.origin_visit_add( [ OriginVisit( origin=origin_url, date=datetime(2019, 2, 27, tzinfo=timezone.utc), type="git", ) ] )[0] self.indexer.storage.snapshot_add( [ - { - "id": b"foo", - "branches": { + Snapshot( + branches={ b"foo": None, - b"HEAD": {"target_type": "alias", "target": b"foo",}, + b"HEAD": SnapshotBranch( + target_type=TargetType.ALIAS, target=b"foo", + ), }, - } + ), ] ) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=now(), status="partial", snapshot=b"foo", ) self.indexer.storage.origin_visit_status_add([visit_status]) self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_vcs_missing_snapshot(self): - self.indexer.storage.origin_add( - [{"url": "https://github.com/SoftwareHeritage/swh-indexer",}] - ) - self.indexer.run(["https://github.com/SoftwareHeritage/swh-indexer"]) + origin_url = "https://github.com/SoftwareHeritage/swh-indexer" + 
self.indexer.storage.origin_add([Origin(url=origin_url)]) + self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_pypi_missing_branch(self): origin_url = "https://pypi.org/project/abcdef/" - self.indexer.storage.origin_add([{"url": origin_url,}]) + self.indexer.storage.origin_add([Origin(url=origin_url,)]) visit = self.indexer.storage.origin_visit_add( [ OriginVisit( origin=origin_url, date=datetime(2019, 2, 27, tzinfo=timezone.utc), type="pypi", ) ] )[0] self.indexer.storage.snapshot_add( [ - { - "id": b"foo", - "branches": { + Snapshot( + branches={ b"foo": None, - b"HEAD": {"target_type": "alias", "target": b"foo",}, + b"HEAD": SnapshotBranch( + target_type=TargetType.ALIAS, target=b"foo", + ), }, - } + ) ] ) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=now(), status="full", snapshot=b"foo", ) self.indexer.storage.origin_visit_status_add([visit_status]) self.indexer.run(["https://pypi.org/project/abcdef/"]) self.assertEqual(self.indexer.results, []) def test_ftp(self): - self.indexer.run(["rsync://ftp.gnu.org/gnu/3dldf"]) + origin_url = "rsync://ftp.gnu.org/gnu/3dldf" + self.indexer.run([origin_url]) + rev_id = b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee" - b"\xcc\x1a\xb4`\x8c\x8by", - "origin_url": "rsync://ftp.gnu.org/gnu/3dldf", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) def test_ftp_missing_snapshot(self): - self.indexer.storage.origin_add([{"url": "rsync://ftp.gnu.org/gnu/foobar",}]) - self.indexer.run(["rsync://ftp.gnu.org/gnu/foobar"]) + origin_url = "rsync://ftp.gnu.org/gnu/foobar" + self.indexer.storage.origin_add([Origin(url=origin_url)]) + self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_deposit(self): - self.indexer.run(["https://forge.softwareheritage.org/source/jesuisgpl/"]) + origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/" + self.indexer.storage.origin_add([Origin(url=origin_url)]) + self.indexer.run([origin_url]) + rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{" - b"\xa6\xe9\x99\xb1\x9e]q\xeb", - "origin_url": "https://forge.softwareheritage.org/source/" - "jesuisgpl/", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) def test_deposit_missing_snapshot(self): - self.indexer.storage.origin_add( - [{"url": "https://forge.softwareheritage.org/source/foobar",}] - ) - self.indexer.run(["https://forge.softwareheritage.org/source/foobar"]) + origin_url = "https://forge.softwareheritage.org/source/foobar" + self.indexer.storage.origin_add([Origin(url=origin_url,)]) + self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_pypi(self): - self.indexer.run(["https://pypi.org/project/limnoria/"]) + origin_url = "https://pypi.org/project/limnoria/" + self.indexer.run([origin_url]) + + rev_id = b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k" - b"A\x10\x9d\xc5\xfa2\xf8t", - "origin_url": "https://pypi.org/project/limnoria/", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url}], ) def test_svn(self): - 
self.indexer.run(["http://0-512-md.googlecode.com/svn/"]) + origin_url = "http://0-512-md.googlecode.com/svn/" + self.indexer.run([origin_url]) + rev_id = b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8" - b"\xc9\xad#.\x1bw=\x18", - "origin_url": "http://0-512-md.googlecode.com/svn/", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 79e8de3..2533981 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,224 +1,212 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch -from swh.model.hashutil import hash_to_bytes - from swh.indexer.metadata import OriginMetadataIndexer -from .utils import YARN_PARSER_METADATA +from swh.model.model import Origin + +from .utils import YARN_PARSER_METADATA, REVISION from .test_metadata import REVISION_METADATA_CONFIG def test_origin_metadata_indexer(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["https://github.com/librariesio/yarn-parser"]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + indexer.run([origin]) + rev_id = REVISION.id rev_metadata = { "id": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } origin_metadata = { "id": origin, "from_revision": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result["tool"] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) for result in results: del result["tool"] assert results == [origin_metadata] def test_origin_metadata_indexer_duplicate_origin(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["https://github.com/librariesio/yarn-parser"]) - indexer.run(["https://github.com/librariesio/yarn-parser"] * 2) origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert len(results) == 1 def test_origin_metadata_indexer_missing_head(idx_storage, storage, obj_storage): - - storage.origin_add([{"url": "https://example.com"}]) + storage.origin_add([Origin(url="https://example.com")]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["https://example.com"]) origin = "https://example.com" results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( idx_storage, storage, obj_storage ): - storage.origin_add([{"url": "https://example.com"}]) - - indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - 
indexer.run(["https://example.com", "https://github.com/librariesio/yarn-parser"]) - origin1 = "https://example.com" origin2 = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + storage.origin_add([Origin(url=origin1)]) + indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + indexer.run([origin1, origin2]) - rev_metadata = { - "id": rev_id, - "metadata": YARN_PARSER_METADATA, - "mappings": ["npm"], - } - origin_metadata = { - "id": origin2, - "from_revision": rev_id, - "metadata": YARN_PARSER_METADATA, - "mappings": ["npm"], - } + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result["tool"] - assert results == [rev_metadata] + assert results == [ + {"id": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"],} + ] results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) for result in results: del result["tool"] - assert results == [origin_metadata] + assert results == [ + { + "id": origin2, + "from_revision": rev_id, + "metadata": YARN_PARSER_METADATA, + "mappings": ["npm"], + } + ] def test_origin_metadata_indexer_duplicate_revision(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage - indexer.run( - [ - "https://github.com/librariesio/yarn-parser", - "https://github.com/librariesio/yarn-parser.git", - ] - ) - origin1 = "https://github.com/librariesio/yarn-parser" origin2 = "https://github.com/librariesio/yarn-parser.git" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + indexer.run([origin1, origin2]) + + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) assert len(results) == 2 def test_origin_metadata_indexer_no_metadata_file(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + origin = "https://github.com/librariesio/yarn-parser" with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_no_metadata(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + origin = "https://github.com/librariesio/yarn-parser" with patch( "swh.indexer.metadata.RevisionMetadataIndexer" ".translate_revision_intrinsic_metadata", return_value=(["npm"], {"@context": "foo"}), ): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_error(idx_storage, 
storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + origin = "https://github.com/librariesio/yarn-parser" with patch( "swh.indexer.metadata.RevisionMetadataIndexer" ".translate_revision_intrinsic_metadata", return_value=None, ): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_delete_metadata(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["https://github.com/librariesio/yarn-parser"]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + indexer.run([origin]) + + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results != [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results != [] with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_unknown_origin(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) result = indexer.index_list(["https://unknown.org/foo"]) assert not result diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index a22211e..b3f0612 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,732 +1,774 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import functools -import random from typing import Dict, Any import unittest from hypothesis import strategies from swh.model import hashutil from swh.model.hashutil import hash_to_bytes, hash_to_hex -from swh.model.model import OriginVisit, OriginVisitStatus +from swh.model.model import ( + Content, + Directory, + DirectoryEntry, + Origin, + OriginVisit, + OriginVisitStatus, + Person, + Revision, + RevisionType, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, +) from swh.storage.utils import now from swh.indexer.storage import INDEXER_CFG_KEY BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = { - "storage": {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]}, + "storage": {"cls": "memory"}, "objstorage": {"cls": "memory", "args": {},}, INDEXER_CFG_KEY: {"cls": "memory", "args": {},}, } + +ORIGINS = [ + Origin(url="https://github.com/SoftwareHeritage/swh-storage"), + Origin(url="rsync://ftp.gnu.org/gnu/3dldf"), + Origin(url="https://forge.softwareheritage.org/source/jesuisgpl/"), + Origin(url="https://pypi.org/project/limnoria/"), + Origin(url="http://0-512-md.googlecode.com/svn/"), + Origin(url="https://github.com/librariesio/yarn-parser"), + 
Origin(url="https://github.com/librariesio/yarn-parser.git"), +] + + ORIGIN_VISITS = [ - {"type": "git", "url": "https://github.com/SoftwareHeritage/swh-storage"}, - {"type": "ftp", "url": "rsync://ftp.gnu.org/gnu/3dldf"}, - {"type": "deposit", "url": "https://forge.softwareheritage.org/source/jesuisgpl/"}, - {"type": "pypi", "url": "https://pypi.org/project/limnoria/"}, - {"type": "svn", "url": "http://0-512-md.googlecode.com/svn/"}, - {"type": "git", "url": "https://github.com/librariesio/yarn-parser"}, - {"type": "git", "url": "https://github.com/librariesio/yarn-parser.git"}, + {"type": "git", "origin": ORIGINS[0].url}, + {"type": "ftp", "origin": ORIGINS[1].url}, + {"type": "deposit", "origin": ORIGINS[2].url}, + {"type": "pypi", "origin": ORIGINS[3].url}, + {"type": "svn", "origin": ORIGINS[4].url}, + {"type": "git", "origin": ORIGINS[5].url}, + {"type": "git", "origin": ORIGINS[6].url}, ] + +DIRECTORY = Directory( + id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), + entries=( + DirectoryEntry( + name=b"index.js", + type="file", + target=hash_to_bytes("01c9379dfc33803963d07c1ccc748d3fe4c96bb5"), + perms=0o100644, + ), + DirectoryEntry( + name=b"package.json", + type="file", + target=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), + perms=0o100644, + ), + DirectoryEntry( + name=b".github", + type="dir", + target=Directory(entries=()).id, + perms=0o040000, + ), + ), +) + +DIRECTORY2 = Directory( + id=b"\xf8zz\xa1\x12`<1$\xfav\xf9\x01\xfd5\x85F`\xf2\xb6", + entries=( + DirectoryEntry( + name=b"package.json", + type="file", + target=hash_to_bytes("f5305243b3ce7ef8dc864ebc73794da304025beb"), + perms=0o100644, + ), + ), +) + +REVISION = Revision( + id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"), + message=b"Improve search functionality", + author=Person( + name=b"Andrew Nesbitt", + fullname=b"Andrew Nesbitt ", + email=b"andrewnez@gmail.com", + ), + committer=Person( + name=b"Andrew Nesbitt", + fullname=b"Andrew Nesbitt ", + email=b"andrewnez@gmail.com", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1380883849, microseconds=0,), + offset=120, + negative_utc=False, + ), + type=RevisionType.GIT, + synthetic=False, + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1487596456, microseconds=0,), + offset=0, + negative_utc=False, + ), + directory=DIRECTORY2.id, + parents=(), +) + +REVISIONS = [REVISION] + SNAPSHOTS = [ - { - "origin": "https://github.com/SoftwareHeritage/swh-storage", - "branches": { - b"refs/heads/add-revision-origin-cache": { - "target": b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' - b"s\xe7/\xe9l\x1e", - "target_type": "revision", - }, - b"refs/head/master": { - "target": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}" b"\xac\xefrm", - "target_type": "revision", - }, - b"HEAD": {"target": b"refs/head/master", "target_type": "alias"}, - b"refs/tags/v0.0.103": { - "target": b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b"\x0f\xdd", - "target_type": "release", - }, - }, - }, - { - "origin": "rsync://ftp.gnu.org/gnu/3dldf", - "branches": { - b"3DLDF-1.1.4.tar.gz": { - "target": b"dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc" b'"G\x99\x11', - "target_type": "revision", - }, - b"3DLDF-2.0.2.tar.gz": { - "target": b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=" - b"\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V", - "target_type": "revision", - }, - b"3DLDF-2.0.3-examples.tar.gz": { - "target": b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97" - b"\xfe\xadZ\x80\x80\xc1\x83\xff", - "target_type": "revision", - }, - b"3DLDF-2.0.3.tar.gz": { - 
"target": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee" - b"\xcc\x1a\xb4`\x8c\x8by", - "target_type": "revision", - }, - b"3DLDF-2.0.tar.gz": { - "target": b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G" b"\xd3\xd1m", - "target_type": "revision", - }, - }, - }, - { - "origin": "https://forge.softwareheritage.org/source/jesuisgpl/", - "branches": { - b"master": { - "target": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{" - b"\xa6\xe9\x99\xb1\x9e]q\xeb", - "target_type": "revision", - } - }, - "id": b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r ", - }, - { - "origin": "https://pypi.org/project/limnoria/", - "branches": { - b"HEAD": {"target": b"releases/2018.09.09", "target_type": "alias"}, - b"releases/2018.09.01": { - "target": b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d" - b"\xbb\xdfF\xfdw\xcf", - "target_type": "revision", - }, - b"releases/2018.09.09": { - "target": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k" - b"A\x10\x9d\xc5\xfa2\xf8t", - "target_type": "revision", - }, - }, - "id": b"{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay" b"\x12\x9e\xd6\xb3", - }, - { - "origin": "http://0-512-md.googlecode.com/svn/", - "branches": { - b"master": { - "target": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8" - b"\xc9\xad#.\x1bw=\x18", - "target_type": "revision", - } + Snapshot( + id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"), + branches={ + b"refs/heads/add-revision-origin-cache": SnapshotBranch( + target=b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0s\xe7/\xe9l\x1e', + target_type=TargetType.REVISION, + ), + b"refs/head/master": SnapshotBranch( + target=b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm", + target_type=TargetType.REVISION, + ), + b"HEAD": SnapshotBranch( + target=b"refs/head/master", target_type=TargetType.ALIAS + ), + b"refs/tags/v0.0.103": SnapshotBranch( + target=b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+\x0f\xdd', + target_type=TargetType.RELEASE, + ), }, - "id": b"\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7" - b"\x05\xea\xb8\x1f\xc4H\xf4s", - }, - { - "origin": "https://github.com/librariesio/yarn-parser", - "branches": { - b"HEAD": { - "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - "target_type": "revision", - } + ), + Snapshot( + id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"), + branches={ + b"3DLDF-1.1.4.tar.gz": SnapshotBranch( + target=b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc"G\x99\x11', + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.2.tar.gz": SnapshotBranch( + target=b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V", # noqa + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.3-examples.tar.gz": SnapshotBranch( + target=b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97\xfe\xadZ\x80\x80\xc1\x83\xff", # noqa + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.3.tar.gz": SnapshotBranch( + target=b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by", # noqa + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.tar.gz": SnapshotBranch( + target=b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G\xd3\xd1m", + target_type=TargetType.REVISION, + ), }, - }, - { - "origin": "https://github.com/librariesio/yarn-parser.git", - "branches": { - b"HEAD": { - "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - "target_type": "revision", - } + ), + Snapshot( + id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"), + branches={ + b"master": SnapshotBranch( + target=b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb", # noqa + target_type=TargetType.REVISION, + 
) }, - }, -] - - -REVISIONS = [ - { - "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - "message": b"Improve search functionality", - "author": { - "name": b"Andrew Nesbitt", - "fullname": b"Andrew Nesbitt ", - "email": b"andrewnez@gmail.com", + ), + Snapshot( + id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"), + branches={ + b"HEAD": SnapshotBranch( + target=b"releases/2018.09.09", target_type=TargetType.ALIAS + ), + b"releases/2018.09.01": SnapshotBranch( + target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf", + target_type=TargetType.REVISION, + ), + b"releases/2018.09.09": SnapshotBranch( + target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa + target_type=TargetType.REVISION, + ), }, - "committer": { - "name": b"Andrew Nesbitt", - "fullname": b"Andrew Nesbitt ", - "email": b"andrewnez@gmail.com", + ), + Snapshot( + id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"), + branches={ + b"master": SnapshotBranch( + target=b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18", + target_type=TargetType.REVISION, + ) }, - "committer_date": { - "negative_utc": False, - "offset": 120, - "timestamp": {"microseconds": 0, "seconds": 1380883849,}, + ), + Snapshot( + id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), + branches={ + b"HEAD": SnapshotBranch( + target=REVISION.id, target_type=TargetType.REVISION, + ) }, - "type": "git", - "synthetic": False, - "date": { - "negative_utc": False, - "timestamp": {"seconds": 1487596456, "microseconds": 0,}, - "offset": 0, + ), + Snapshot( + id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), + branches={ + b"HEAD": SnapshotBranch( + target=REVISION.id, target_type=TargetType.REVISION, + ) }, - "directory": b"10", - "parents": (), - } + ), ] -DIRECTORY_ID = b"10" - -DIRECTORY_ENTRIES = [ - {"name": b"index.js", "type": "file", "target": b"abc", "perms": 33188,}, - {"name": b"package.json", "type": "file", "target": b"cde", "perms": 33188,}, - {"name": b".github", "type": "dir", "target": b"11", "perms": 16384,}, -] SHA1_TO_LICENSES = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"], "02fb2c89e14f7fab46701478c83779c7beb7b069": ["Apache2.0"], "103bc087db1d26afc3a0283f38663d081e9b01e6": ["MIT"], "688a5ef812c53907562fe379d4b3851e69c7cb15": ["AGPL"], "da39a3ee5e6b4b0d3255bfef95601890afd80709": [], } SHA1_TO_CTAGS = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": [ {"name": "foo", "kind": "str", "line": 10, "lang": "bar",} ], "d4c647f0fc257591cc9ba1722484229780d1c607": [ {"name": "let", "kind": "int", "line": 100, "lang": "haskell",} ], "688a5ef812c53907562fe379d4b3851e69c7cb15": [ {"name": "symbol", "kind": "float", "line": 99, "lang": "python",} ], } OBJ_STORAGE_DATA = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": b"this is some text", "688a5ef812c53907562fe379d4b3851e69c7cb15": b"another text", "8986af901dd2043044ce8f0d8fc039153641cf17": b"yet another text", "02fb2c89e14f7fab46701478c83779c7beb7b069": b""" import unittest import logging from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, "103bc087db1d26afc3a0283f38663d081e9b01e6": b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, "93666f74f1cf635c8c8ac118879da6ec5623c410": b""" 
 (should 'pygments (recognize 'lisp 'easily))

 """,
     "26a9f72a7c87cc9205725cfd879f514ff4f3d8d5": b"""
 {
     "name": "test_metadata",
     "version": "0.0.1",
     "description": "Simple package.json test for indexer",
     "repository": {
         "type": "git",
         "url": "https://github.com/moranegg/metadata_test"
     }
 }
 """,
     "d4c647f0fc257591cc9ba1722484229780d1c607": b"""
 {
     "version": "5.0.3",
     "name": "npm",
     "description": "a package manager for JavaScript",
     "keywords": [
         "install",
         "modules",
         "package manager",
         "package.json"
     ],
     "preferGlobal": true,
     "config": {
         "publishtest": false
     },
     "homepage": "https://docs.npmjs.com/",
     "author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
     "repository": {
         "type": "git",
         "url": "https://github.com/npm/npm"
     },
     "bugs": {
         "url": "https://github.com/npm/npm/issues"
     },
     "dependencies": {
         "JSONStream": "~1.3.1",
         "abbrev": "~1.1.0",
         "ansi-regex": "~2.1.1",
         "ansicolors": "~0.3.2",
         "ansistyles": "~0.1.3"
     },
     "devDependencies": {
         "tacks": "~1.2.6",
         "tap": "~10.3.2"
     },
     "license": "Artistic-2.0"
 }
 """,
     "a7ab314d8a11d2c93e3dcf528ca294e7b431c449": b"""
 """,
     "da39a3ee5e6b4b0d3255bfef95601890afd80709": b"",
-    # 626364
-    hash_to_hex(b"bcd"): b"unimportant content for bcd",
-    # 636465
-    hash_to_hex(
-        b"cde"
-    ): b"""
+    # was 626364 / b'bcd'
+    "e3e40fee6ff8a52f06c3b428bfe7c0ed2ef56e92": b"unimportant content for bcd",
+    # was 636465 / b'cde' now yarn-parser package.json
+    "f5305243b3ce7ef8dc864ebc73794da304025beb": b"""
 {
     "name": "yarn-parser",
     "version": "1.0.0",
     "description": "Tiny web service for parsing yarn.lock files",
     "main": "index.js",
     "scripts": {
         "start": "node index.js",
         "test": "mocha"
     },
     "engines": {
         "node": "9.8.0"
     },
     "repository": {
         "type": "git",
         "url": "git+https://github.com/librariesio/yarn-parser.git"
     },
     "keywords": [
         "yarn",
         "parse",
         "lock",
         "dependencies"
     ],
     "author": "Andrew Nesbitt",
     "license": "AGPL-3.0",
     "bugs": {
         "url": "https://github.com/librariesio/yarn-parser/issues"
     },
     "homepage": "https://github.com/librariesio/yarn-parser#readme",
     "dependencies": {
         "@yarnpkg/lockfile": "^1.0.0",
         "body-parser": "^1.15.2",
         "express": "^4.14.0"
     },
     "devDependencies": {
         "chai": "^4.1.2",
         "mocha": "^5.2.0",
         "request": "^2.87.0",
         "test": "^0.6.0"
     }
 }
 """,
 }

+
 YARN_PARSER_METADATA = {
     "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
     "url": "https://github.com/librariesio/yarn-parser#readme",
     "codeRepository": "git+git+https://github.com/librariesio/yarn-parser.git",
     "author": [{"type": "Person", "name": "Andrew Nesbitt"}],
     "license": "https://spdx.org/licenses/AGPL-3.0",
     "version": "1.0.0",
     "description": "Tiny web service for parsing yarn.lock files",
     "issueTracker": "https://github.com/librariesio/yarn-parser/issues",
     "name": "yarn-parser",
     "keywords": ["yarn", "parse", "lock", "dependencies"],
     "type": "SoftwareSourceCode",
 }


 json_dict_keys = strategies.one_of(
     strategies.characters(),
     strategies.just("type"),
     strategies.just("url"),
     strategies.just("name"),
     strategies.just("email"),
     strategies.just("@id"),
     strategies.just("@context"),
     strategies.just("repository"),
     strategies.just("license"),
     strategies.just("repositories"),
     strategies.just("licenses"),
 )
 """Hypothesis strategy that generates strings, with an emphasis on those
 that are often used as dictionary keys in metadata files."""


 generic_json_document = strategies.recursive(
     strategies.none()
     | strategies.booleans()
     | strategies.floats()
     | strategies.characters(),
     lambda children: (
         strategies.lists(children, min_size=1)
         | strategies.dictionaries(json_dict_keys, children, min_size=1)
     ),
 )
 """Hypothesis strategy that generates
 possible values for values of JSON metadata files."""


 def json_document_strategy(keys=None):
     """Generates an hypothesis strategy that generates metadata files
     for a JSON-based format that uses the given keys."""
     if keys is None:
         keys = strategies.characters()
     else:
         keys = strategies.one_of(map(strategies.just, keys))

     return strategies.dictionaries(keys, generic_json_document, min_size=1)


 def _tree_to_xml(root, xmlns, data):
     def encode(s):
         "Skips unpaired surrogates generated by json_document_strategy"
         return s.encode("utf8", "replace")

     def to_xml(data, indent=b" "):
         if data is None:
             return b""
         elif isinstance(data, (bool, str, int, float)):
             return indent + encode(str(data))
         elif isinstance(data, list):
             return b"\n".join(to_xml(v, indent=indent) for v in data)
         elif isinstance(data, dict):
             lines = []
             for (key, value) in data.items():
                 lines.append(indent + encode("<{}>".format(key)))
                 lines.append(to_xml(value, indent=indent + b" "))
                 lines.append(indent + encode("</{}>".format(key)))
             return b"\n".join(lines)
         else:
             raise TypeError(data)

     return b"\n".join(
         [
             '<{} xmlns="{}">'.format(root, xmlns).encode(),
             to_xml(data),
             "</{}>".format(root).encode(),
         ]
     )


 class TreeToXmlTest(unittest.TestCase):
     def test_leaves(self):
         self.assertEqual(
             _tree_to_xml("root", "http://example.com", None),
             b'<root xmlns="http://example.com">\n\n</root>',
         )
         self.assertEqual(
             _tree_to_xml("root", "http://example.com", True),
             b'<root xmlns="http://example.com">\n True\n</root>',
         )
         self.assertEqual(
             _tree_to_xml("root", "http://example.com", "abc"),
             b'<root xmlns="http://example.com">\n abc\n</root>',
         )
         self.assertEqual(
             _tree_to_xml("root", "http://example.com", 42),
             b'<root xmlns="http://example.com">\n 42\n</root>',
         )
         self.assertEqual(
             _tree_to_xml("root", "http://example.com", 3.14),
             b'<root xmlns="http://example.com">\n 3.14\n</root>',
         )

     def test_dict(self):
         self.assertIn(
             _tree_to_xml("root", "http://example.com", {"foo": "bar", "baz": "qux"}),
             [
                 b'<root xmlns="http://example.com">\n'
                 b" <foo>\n  bar\n </foo>\n"
                 b" <baz>\n  qux\n </baz>\n"
                 b"</root>",
                 b'<root xmlns="http://example.com">\n'
                 b" <baz>\n  qux\n </baz>\n"
                 b" <foo>\n  bar\n </foo>\n"
                 b"</root>",
             ],
         )

     def test_list(self):
         self.assertEqual(
             _tree_to_xml(
                 "root", "http://example.com", [{"foo": "bar"}, {"foo": "baz"},]
             ),
             b'<root xmlns="http://example.com">\n'
             b" <foo>\n  bar\n </foo>\n"
             b" <foo>\n  baz\n </foo>\n"
             b"</root>",
         )


 def xml_document_strategy(keys, root, xmlns):
     """Generates an hypothesis strategy that generates metadata files
     for an XML format that uses the given keys."""
     return strategies.builds(
         functools.partial(_tree_to_xml, root, xmlns), json_document_strategy(keys)
     )


 def filter_dict(d, keys):
     "return a copy of the dict with keys deleted"
     if not isinstance(keys, (list, tuple)):
         keys = (keys,)
     return dict((k, v) for (k, v) in d.items() if k not in keys)


 def fill_obj_storage(obj_storage):
     """Add some content in an object storage."""
     for (obj_id, content) in OBJ_STORAGE_DATA.items():
         obj_storage.add(content, obj_id=hash_to_bytes(obj_id))


 def fill_storage(storage):
-    visit_types = {}
-    for visit in ORIGIN_VISITS:
-        storage.origin_add([{"url": visit["url"]}])
-        visit_types[visit["url"]] = visit["type"]
-    for snap in SNAPSHOTS:
-        origin_url = snap["origin"]
+    storage.origin_add(ORIGINS)
+    storage.directory_add([DIRECTORY, DIRECTORY2])
+    storage.revision_add(REVISIONS)
+    storage.snapshot_add(SNAPSHOTS)
+
+    for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):
+        assert snapshot.id is not None
+
         visit = storage.origin_visit_add(
-            [OriginVisit(origin=origin_url, date=now(), type=visit_types[origin_url],)]
+            [OriginVisit(origin=visit["origin"], date=now(), type=visit["type"])]
         )[0]
-        snap_id = snap.get("id") or bytes([random.randint(0, 255) for _ in range(32)])
-        storage.snapshot_add([{"id": snap_id, "branches": snap["branches"]}])
         visit_status = OriginVisitStatus(
-            origin=origin_url,
+            origin=visit.origin,
             visit=visit.visit,
             date=now(),
status="full", - snapshot=snap_id, + snapshot=snapshot.id, ) storage.origin_visit_status_add([visit_status]) - storage.revision_add(REVISIONS) contents = [] for (obj_id, content) in OBJ_STORAGE_DATA.items(): content_hashes = hashutil.MultiHash.from_data(content).digest() contents.append( - { - "data": content, - "length": len(content), - "status": "visible", - "sha1": hash_to_bytes(obj_id), - "sha1_git": hash_to_bytes(obj_id), - "sha256": content_hashes["sha256"], - "blake2s256": content_hashes["blake2s256"], - } + Content( + data=content, + length=len(content), + status="visible", + sha1=hash_to_bytes(obj_id), + sha1_git=hash_to_bytes(obj_id), + sha256=content_hashes["sha256"], + blake2s256=content_hashes["blake2s256"], + ) ) storage.content_add(contents) - storage.directory_add([{"id": DIRECTORY_ID, "entries": DIRECTORY_ENTRIES,}]) class CommonContentIndexerTest(metaclass=abc.ABCMeta): legacy_get_format = False """True if and only if the tested indexer uses the legacy format. see: https://forge.softwareheritage.org/T1433 """ def get_indexer_results(self, ids): """Override this for indexers that don't have a mock storage.""" return self.indexer.idx_storage.state def assert_legacy_results_ok(self, sha1s, expected_results=None): # XXX old format, remove this when all endpoints are # updated to the new one # see: https://forge.softwareheritage.org/T1433 sha1s = [ sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s ] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual( len(expected_results), len(actual_results), (expected_results, actual_results), ) for indexed_data in actual_results: _id = indexed_data["id"] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data["id"] = _id self.assertEqual(indexed_data, expected_data) def assert_results_ok(self, sha1s, expected_results=None): if self.legacy_get_format: self.assert_legacy_results_ok(sha1s, expected_results) return sha1s = [ sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s ] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual( len(expected_results), len(actual_results), (expected_results, actual_results), ) for indexed_data in actual_results: (_id, indexed_data) = list(indexed_data.items())[0] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data = [expected_data] self.assertEqual(indexed_data, expected_data) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update="update-dups") self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update="ignore-dups") self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [ self.id1, "799a5ef812c53907562fe379d4b3851e69c7cb15", # unknown "800a5ef812c53907562fe379d4b3851e69c7cb15", ] # unknown # when self.indexer.run(sha1s, policy_update="update-dups") # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. 
""" def setUp(self): self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results actual_results = list(actual_results) for indexed_data in actual_results: _id = indexed_data["id"] assert isinstance(_id, bytes) indexed_data = indexed_data.copy() indexed_data["id"] = hash_to_hex(indexed_data["id"]) self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)]) self.assertTrue(start <= _id <= end) _tool_id = indexed_data["indexer_configuration_id"] self.assertEqual(_tool_id, self.indexer.tool["id"]) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = list(self.indexer._index_contents(start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( start, end, indexed=set(map(hash_to_bytes, data_indexed)) ) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok(start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing should result in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end) # then self.assertEqual(actual_results, {"status": "uneventful"}) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end, skip_existing=False) # no already indexed data so same result as prior test # then self.assertEqual(actual_results, {"status": "uneventful"}) def test_generate_content_get_no_result(self): """No result indexed returns False""" _start, _end = [ "0000000000000000000000000000000000000000", "0000000000000000000000000000000000000001", ] start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end, incremental=False) # then self.assertEqual(actual_results, {"status": "uneventful"})