diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 76dc3c6..c440a88 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,811 +1,804 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from swh.model.hashutil import hash_to_bytes from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS from swh.indexer.metadata_detector import ( detect_metadata, extract_minimal_metadata_dict ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer ) from .utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage ) TRANSLATOR_TOOL = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, 'should not be called; the rev indexer configures it.' -class RevisionMetadataTestIndexer(RevisionMetadataIndexer): - """Specific indexer whose configuration is enough to satisfy the - indexing tests. - """ - - ContentMetadataIndexer = ContentMetadataTestIndexer - - def parse_config_file(self, *args, **kwargs): - return { - **BASE_TEST_CONFIG, - 'tools': TRANSLATOR_TOOL, - } +REVISION_METADATA_CONFIG = { + **BASE_TEST_CONFIG, + 'tools': TRANSLATOR_TOOL, +} class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { 'repository': 'http://schema.org/codeRepository', 'os': 'http://schema.org/operatingSystem', 'cpu': 'http://schema.org/processorRequirements', 'engines': 'http://schema.org/processorRequirements', 'author': 'http://schema.org/author', 'author.email': 'http://schema.org/email', 'author.name': 'http://schema.org/name', 'contributor': 'http://schema.org/contributor', 'keywords': 'http://schema.org/keywords', 'license': 'http://schema.org/license', 'version': 'http://schema.org/version', 'description': 'http://schema.org/description', 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', 'homepage': 'http://schema.org/url' }) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'author': [{ 'type': 'Person', 'name': 'Morane G', 'email': 'moranegg@example.com', }], } # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', 'author': 'moranegg', }] # when results = extract_minimal_metadata_dict(metadata_list) # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', "version": '0.0.2', "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], "author": ['moranegg'], "codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config['tools'] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = list(metadata_indexer.idx_storage.content_metadata_get( sha1s)) expected_results = [{ 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5') }, { 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/npm/npm/issues', 'author': [{ 'type': 'Person', 'name': 'Isaac Z. Schlueter', 'email': 'i@izs.me', 'url': 'http://blog.izs.me', }], 'codeRepository': 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', 'license': 'https://spdx.org/licenses/Artistic-2.0', 'version': '5.0.3', 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') }] for result in results: del result['tool'] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', }) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = MAPPINGS["NpmMapping"].translate(package_json) expected_result = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://gitlab.com/user/repo.git', 'type': 'SoftwareSourceCode', }) def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = ( b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation" }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": [ "metadata", "software" ], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" } result = MAPPINGS["CodemetaMapping"].translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'codeRepository': 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_invalid_xml(self): raw_content = b""" """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, None) raw_content = b""" """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, None) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_multiple(self): '''Tests when there are multiple code repos and licenses.''' raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': [ 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'https://opensource.org/licenses/MIT', ], 'codeRepository': [ 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', 'http://example.org/maven2/com/mycompany/app/my-app', ] }) def test_compute_metadata_pkginfo(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== \x20 core library for swh's modules: - config parser - hash computations - serialization - logging mechanism \x20 Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """) # noqa result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) self.assertCountEqual(result['description'], [ 'Software Heritage core utilities', # note the comma here 'swh-core\n' '========\n' '\n' "core library for swh's modules:\n" '- config parser\n' '- hash computations\n' '- serialization\n' '- logging mechanism\n' ''], result) del result['description'] self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'url': 'https://forge.softwareheritage.org/diffusion/DCORE/', 'name': 'swh.core', 'author': [{ 'type': 'Person', 'name': 'Software Heritage developers', 'email': 'swh-devel@inria.fr', }], 'version': '0.0.49', }) def test_compute_metadata_pkginfo_utf8(self): raw_content = (b'''\ Metadata-Version: 1.1 Name: snowpyt Description-Content-Type: UNKNOWN Description: foo Hydrology N\xc2\xb083 ''') # noqa result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'snowpyt', 'description': 'foo\nHydrology N°83', }) def test_compute_metadata_pkginfo_license(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: foo License: MIT """) # noqa result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'foo', 'license': 'MIT', }) def test_revision_metadata_indexer(self): - metadata_indexer = RevisionMetadataTestIndexer() + metadata_indexer = RevisionMetadataIndexer( + config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) assert tool is not None metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', 'author': ['Andrew Nesbitt'], 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'description': 'Tiny web service for parsing yarn.lock files', } }]) sha1_gits = [ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), ] metadata_indexer.run(sha1_gits, 'update-dups') results = list(metadata_indexer.idx_storage.revision_metadata_get( sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'author': ['Andrew Nesbitt'], 'license': 'AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], }, }] for result in results: del result['tool']['id'] # then self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py index 6bd7806..5c1825f 100644 --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -1,95 +1,97 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, fill_storage ) +ORIGIN_HEAD_CONFIG = { + **BASE_TEST_CONFIG, + 'tools': { + 'name': 'origin-metadata', + 'version': '0.0.1', + 'configuration': {}, + }, + 'tasks': { + 'revision_metadata': None, + 'origin_intrinsic_metadata': None, + } +} + class OriginHeadTestIndexer(OriginHeadIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): - return { - **BASE_TEST_CONFIG, - 'tools': { - 'name': 'origin-metadata', - 'version': '0.0.1', - 'configuration': {}, - }, - 'tasks': { - 'revision_metadata': None, - 'origin_intrinsic_metadata': None, - } - } + return ORIGIN_HEAD_CONFIG def persist_index_computations(self, results, policy_update): self.results = results class OriginHead(unittest.TestCase): def setUp(self): self.indexer = OriginHeadTestIndexer() fill_storage(self.indexer.storage) def _get_origin_id(self, type_, url): origin = self.indexer.storage.origin_get({ 'type': type_, 'url': url}) return origin['id'] def test_git(self): self.indexer.run( ['git+https://github.com/SoftwareHeritage/swh-storage']) origin_id = self._get_origin_id( 'git', 'https://github.com/SoftwareHeritage/swh-storage') self.assertEqual(self.indexer.results, [{ 'revision_id': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{' b'\xd7}\xac\xefrm', 'origin_id': origin_id}]) def test_ftp(self): self.indexer.run( ['ftp+rsync://ftp.gnu.org/gnu/3dldf']) origin_id = self._get_origin_id( 'ftp', 'rsync://ftp.gnu.org/gnu/3dldf') self.assertEqual(self.indexer.results, [{ 'revision_id': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' b'\xcc\x1a\xb4`\x8c\x8by', 'origin_id': origin_id}]) def test_deposit(self): self.indexer.run( ['deposit+https://forge.softwareheritage.org/source/' 'jesuisgpl/']) origin_id = self._get_origin_id( 'deposit', 'https://forge.softwareheritage.org/source/jesuisgpl/') self.assertEqual(self.indexer.results, [{ 'revision_id': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', 'origin_id': origin_id}]) def test_pypi(self): self.indexer.run( ['pypi+https://pypi.org/project/limnoria/']) origin_id = self._get_origin_id( 'pypi', 'https://pypi.org/project/limnoria/') self.assertEqual(self.indexer.results, [{ 'revision_id': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', 'origin_id': origin_id}]) def test_svn(self): self.indexer.run( ['svn+http://0-512-md.googlecode.com/svn/']) origin_id = self._get_origin_id( 'svn', 'http://0-512-md.googlecode.com/svn/') self.assertEqual(self.indexer.results, [{ 'revision_id': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', 'origin_id': origin_id}]) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index afb0011..534fea8 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,123 +1,135 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery.result import AsyncResult from unittest import mock from swh.model.hashutil import hash_to_bytes from swh.storage.in_memory import Storage from swh.indexer.storage.in_memory import IndexerStorage from swh.objstorage.objstorage_in_memory import InMemoryObjStorage from swh.scheduler.celery_backend.runner import run_ready_tasks - -from .utils import fill_storage, fill_obj_storage -from .test_origin_head import OriginHeadTestIndexer -from swh.indexer.tests.tasks import ( - RevisionMetadataTestIndexer, OriginMetadataTestIndexer) - - -class OriginHeadTestIndexer(OriginHeadTestIndexer): - def prepare(self): - super().prepare() - self.config['tasks'] = { - 'revision_metadata': 'revision_metadata', - 'origin_intrinsic_metadata': 'origin_intrinsic_metadata', - } +from swh.indexer.metadata import ( + OriginMetadataIndexer, RevisionMetadataIndexer +) +from swh.indexer.origin_head import OriginHeadIndexer + +from .utils import fill_storage, fill_obj_storage, BASE_TEST_CONFIG +from .test_metadata import REVISION_METADATA_CONFIG + + +ORIGIN_HEAD_CONFIG = { + **BASE_TEST_CONFIG, + 'tools': { + 'name': 'origin-metadata', + 'version': '0.0.1', + 'configuration': {}, + }, + 'tasks': { + 'revision_metadata': 'revision_metadata', + 'origin_intrinsic_metadata': 'origin_intrinsic_metadata', + } +} +@mock.patch('swh.indexer.metadata.RevisionMetadataIndexer.parse_config_file') +@mock.patch('swh.indexer.origin_head.OriginHeadIndexer.parse_config_file') @mock.patch('swh.indexer.storage.in_memory.IndexerStorage') @mock.patch('swh.storage.in_memory.Storage') def test_pipeline(storage_mock, idx_storage_mock, + origin_head_parse_config, revision_metadata_parse_config, swh_app, celery_session_worker, indexer_scheduler): scheduler = indexer_scheduler # Always returns the same instance of the idx storage, because # this function is called by each of the three indexers. objstorage = InMemoryObjStorage() storage = Storage() idx_storage = IndexerStorage() + origin_head_parse_config.return_value = ORIGIN_HEAD_CONFIG + revision_metadata_parse_config.return_value = REVISION_METADATA_CONFIG storage_mock.return_value = storage idx_storage_mock.return_value = idx_storage fill_obj_storage(objstorage) fill_storage(storage) # TODO: find a better way to share the ContentMetadataIndexer use # the same objstorage instance. import swh.objstorage old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory'] swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage try: - RevisionMetadataTestIndexer.scheduler = scheduler - OriginMetadataTestIndexer.scheduler = scheduler - indexer = OriginHeadTestIndexer() + RevisionMetadataIndexer.scheduler = scheduler + OriginMetadataIndexer.scheduler = scheduler + indexer = OriginHeadIndexer() indexer.scheduler = scheduler indexer.run(["git+https://github.com/librariesio/yarn-parser"]) tasks = [] tasks.extend(run_ready_tasks(scheduler, swh_app)) # Run the first task # Wait for the task to complete and schedule the 2nd one task = [x for x in tasks if x['task'] == 1] assert len(task) == 1 promise = AsyncResult(id=task[0]['backend_id']) promise.wait() tasks.extend(run_ready_tasks(scheduler, swh_app)) # Run the 2nd task task = [x for x in tasks if x['task'] == 2] assert len(task) == 1 promise = AsyncResult(id=task[0]['backend_id']) promise.wait() finally: swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage - del RevisionMetadataTestIndexer.scheduler - del OriginMetadataTestIndexer.scheduler + del RevisionMetadataIndexer.scheduler + del OriginMetadataIndexer.scheduler origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'codeRepository': 'git+git+https://github.com/librariesio/yarn-parser.git', 'author': [{ 'type': 'Person', 'name': 'Andrew Nesbitt' }], 'license': 'https://spdx.org/licenses/AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], } rev_metadata = { 'id': rev_id, 'translated_metadata': metadata, } origin_metadata = { 'origin_id': origin['id'], 'from_revision': rev_id, 'metadata': metadata, } results = list(indexer.idx_storage.revision_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) for result in results: del result['tool'] assert results == [origin_metadata]