diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -4,6 +4,7 @@
 # See top-level LICENSE file for more information
 
 import abc
+import ast
 import os
 import logging
 import shutil
@@ -543,6 +544,10 @@
 
         results = []
         for id_ in ids:
+            if isinstance(id_, str):
+                # Data coming from JSON, which requires string keys, so
+                # one extra level of deserialization is needed
+                id_ = ast.literal_eval(id_)
             if isinstance(id_, (tuple, list)):
                 if len(id_) != 2:
                     raise TypeError('Expected a (type, url) tuple.')
@@ -554,8 +559,8 @@
                 raise TypeError('Invalid value in "ids": %r' % id_)
             origin = self.storage.origin_get(params)
             if not origin:
-                self.log.warning('Origins %s not found in storage' %
-                                 list(ids))
+                self.log.warning('Origin %s not found in storage' %
+                                 list(id_))
                 continue
             try:
                 res = self.index(origin, **kwargs)
@@ -563,7 +568,7 @@
                     results.append(res)
             except Exception:
                 self.log.exception(
-                    'Problem when processing origin %s' % id_)
+                    'Problem when processing origin %s' % (id_,))
         self.persist_index_computations(results, policy_update)
         self.results = results
         return self.next_step(results, task=next_step)
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -285,7 +285,7 @@
               of OriginHeadIndexer.
            policy_update (str): `'ignore-dups'` or `'update-dups'`
         """
-        origin_head_map = {int(origin_id): hashutil.hash_to_bytes(rev_id)
+        origin_head_map = {origin_id: hashutil.hash_to_bytes(rev_id)
                            for (origin_id, rev_id) in origin_head.items()}
 
         # Fix up the argument order. revisions_metadata has to be the
@@ -298,7 +298,7 @@
 
     def index(self, origin, *, origin_head_map):
         # Get the last revision of the origin.
-        revision_id = origin_head_map[origin['id']]
+        revision_id = origin_head_map[str(origin['id'])]
 
         revision_metadata = self.idx_storage \
             .revision_metadata_get([revision_id])
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -628,7 +628,7 @@
             list: dictionaries with the following keys:
 
                 - **origin_id** (int)
-                - **translated_metadata** (str): associated metadata
+                - **metadata** (str): associated metadata
                 - **tool** (dict): tool used to compute metadata
 
         """
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -1,5 +1,4 @@
 from unittest import TestCase
-import pytest
 
 from .test_storage import CommonTestStorage
 
@@ -16,6 +15,5 @@
     def reset_storage_tables(self):
         self.storage = self.storage.__class__()
 
-    @pytest.mark.xfail
     def test_check_config(self):
         pass
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -7,7 +7,7 @@
 
 from swh.indexer.origin_head import OriginHeadIndexer
 from swh.indexer.tests.test_utils import (
-    MockIndexerStorage, MockStorage, BASE_TEST_CONFIG
+    BASE_TEST_CONFIG, fill_storage
 )
 
 
@@ -29,58 +29,67 @@
         }
     }
 
-    def prepare(self):
-        super().prepare()
-        self.storage = MockStorage()
-        self.idx_storage = MockIndexerStorage()
-
     def persist_index_computations(self, results, policy_update):
         self.results = results


 class OriginHead(unittest.TestCase):
+    def setUp(self):
+        self.indexer = OriginHeadTestIndexer()
+        fill_storage(self.indexer.storage)
+
+    def _get_origin_id(self, type_, url):
+        origin = self.indexer.storage.origin_get({
+            'type': type_, 'url': url})
+        return origin['id']
+
     def test_git(self):
-        indexer = OriginHeadTestIndexer()
-        indexer.run(
+        self.indexer.run(
             ['git+https://github.com/SoftwareHeritage/swh-storage'])
-        self.assertEqual(indexer.results, [{
+        origin_id = self._get_origin_id(
+            'git', 'https://github.com/SoftwareHeritage/swh-storage')
+        self.assertEqual(self.indexer.results, [{
             'revision_id': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{'
                            b'\xd7}\xac\xefrm',
-            'origin_id': 52189575}])
+            'origin_id': origin_id}])
 
     def test_ftp(self):
-        indexer = OriginHeadTestIndexer()
-        indexer.run(
+        self.indexer.run(
             ['ftp+rsync://ftp.gnu.org/gnu/3dldf'])
-        self.assertEqual(indexer.results, [{
+        origin_id = self._get_origin_id(
+            'ftp', 'rsync://ftp.gnu.org/gnu/3dldf')
+        self.assertEqual(self.indexer.results, [{
             'revision_id': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
                            b'\xcc\x1a\xb4`\x8c\x8by',
-            'origin_id': 4423668}])
+            'origin_id': origin_id}])
 
     def test_deposit(self):
-        indexer = OriginHeadTestIndexer()
-        indexer.run(
+        self.indexer.run(
             ['deposit+https://forge.softwareheritage.org/source/'
              'jesuisgpl/'])
-        self.assertEqual(indexer.results, [{
+        origin_id = self._get_origin_id(
+            'deposit', 'https://forge.softwareheritage.org/source/jesuisgpl/')
+        self.assertEqual(self.indexer.results, [{
             'revision_id': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
                            b'\xa6\xe9\x99\xb1\x9e]q\xeb',
-            'origin_id': 77775770}])
+            'origin_id': origin_id}])
 
     def test_pypi(self):
-        indexer = OriginHeadTestIndexer()
-        indexer.run(
+        self.indexer.run(
             ['pypi+https://pypi.org/project/limnoria/'])
-        self.assertEqual(indexer.results, [{
+        origin_id = self._get_origin_id(
+            'pypi', 'https://pypi.org/project/limnoria/')
+        self.assertEqual(self.indexer.results, [{
             'revision_id': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
                            b'A\x10\x9d\xc5\xfa2\xf8t',
-            'origin_id': 85072327}])
+            'origin_id': origin_id}])
 
     def test_svn(self):
-        indexer = OriginHeadTestIndexer()
-        indexer.run(
+        self.indexer.run(
             ['svn+http://0-512-md.googlecode.com/svn/'])
-        self.assertEqual(indexer.results, [{
+        origin_id = self._get_origin_id(
+            'svn', 'http://0-512-md.googlecode.com/svn/')
+        self.assertEqual(self.indexer.results, [{
             'revision_id': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
                            b'\xc9\xad#.\x1bw=\x18',
-            'origin_id': 49908349}])
+            'origin_id': origin_id}])
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -8,15 +8,18 @@
 from celery import task
 
 from swh.model.hashutil import hash_to_bytes
+from swh.storage.in_memory import Storage
 
 from swh.indexer.metadata import (
     OriginMetadataIndexer, RevisionMetadataIndexer
 )
+from swh.indexer.storage.in_memory import IndexerStorage
+from swh.objstorage.objstorage_in_memory import InMemoryObjStorage
+
 from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture
 
 from .test_utils import (
-    MockObjStorage, MockStorage, MockIndexerStorage,
-    BASE_TEST_CONFIG
+    BASE_TEST_CONFIG, fill_storage, fill_obj_storage
 )
 from .test_origin_head import OriginHeadTestIndexer
 from .test_metadata import ContentMetadataTestIndexer
@@ -41,12 +44,6 @@
         }
     }
 
-    def prepare(self):
-        super().prepare()
-        self.idx_storage = MockIndexerStorage()
-        self.storage = MockStorage()
-        self.objstorage = MockObjStorage()
-
 
 @task
 def revision_metadata_test_task(*args, **kwargs):
@@ -62,12 +59,6 @@
         'tools': []
     }
 
-    def prepare(self):
-        super().prepare()
-        self.storage = MockStorage()
-        self.objstorage = MockObjStorage()
-        self.idx_storage = MockIndexerStorage()
-
 
 @task
 def origin_intrinsic_metadata_test_task(*args, **kwargs):
@@ -89,9 +80,6 @@
     def setUp(self):
        super().setUp()
         self.maxDiff = None
-        # FIXME: Improve mock indexer storage reset behavior
-        MockIndexerStorage.added_data = []
-        MockIndexerStorage.revision_metadata = {}
         self.add_scheduler_task_type(
             'revision_metadata_test_task',
             'swh.indexer.tests.test_origin_metadata.'
@@ -106,51 +94,79 @@
         del RevisionMetadataTestIndexer.scheduler
         super().tearDown()
 
-    def test_pipeline(self):
-        indexer = OriginHeadTestIndexer()
-        indexer.scheduler = self.scheduler
-        indexer.run(["git+https://github.com/librariesio/yarn-parser"])
-
-        self.run_ready_tasks()  # Run the first task
-        time.sleep(0.1)  # Give it time to complete and schedule the 2nd one
-        self.run_ready_tasks()  # Run the second task
+    @unittest.mock.patch('swh.indexer.storage.in_memory.IndexerStorage')
+    @unittest.mock.patch('swh.storage.in_memory.Storage')
+    def test_pipeline(self, storage_mock, idx_storage_mock):
+        # Always returns the same instance of the idx storage, because
+        # this function is called by each of the three indexers.
+        objstorage = InMemoryObjStorage()
+        storage = Storage()
+        idx_storage = IndexerStorage()
+
+        storage_mock.return_value = storage
+        idx_storage_mock.return_value = idx_storage
+
+        fill_obj_storage(objstorage)
+        fill_storage(storage)
+
+        # TODO: find a better way to share the ContentMetadataIndexer use
+        # the same objstorage instance.
+        import swh.objstorage
+        old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory']
+        swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage
+        try:
+            indexer = OriginHeadTestIndexer()
+            indexer.scheduler = self.scheduler
+            indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+
+            self.run_ready_tasks()  # Run the first task
+            # Give it time to complete and schedule the 2nd one
+            time.sleep(0.1)
+            self.run_ready_tasks()  # Run the second task
+        finally:
+            swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage
+
+        origin = storage.origin_get({
+            'type': 'git',
+            'url': 'https://github.com/librariesio/yarn-parser'})
+        rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
 
         metadata = {
             '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
             'url': 'https://github.com/librariesio/yarn-parser#readme',
-            'schema:codeRepository':
-                'git+https://github.com/librariesio/yarn-parser.git',
-            'schema:author': 'Andrew Nesbitt',
-            'license': 'AGPL-3.0',
+            'codeRepository':
+                'git+git+https://github.com/librariesio/yarn-parser.git',
+            'author': [{
+                'type': 'Person',
+                'name': 'Andrew Nesbitt'
+            }],
+            'license': 'https://spdx.org/licenses/AGPL-3.0',
             'version': '1.0.0',
             'description': 'Tiny web service for parsing yarn.lock files',
-            'codemeta:issueTracker':
+            'issueTracker':
                 'https://github.com/librariesio/yarn-parser/issues',
             'name': 'yarn-parser',
             'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
         }
         rev_metadata = {
-            'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
+            'id': rev_id,
             'translated_metadata': metadata,
         }
         origin_metadata = {
-            'origin_id': 54974445,
-            'from_revision': hash_to_bytes(
-                '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
+            'origin_id': origin['id'],
+            'from_revision': rev_id,
             'metadata': metadata,
         }
 
-        expected_results = [
-            ('revision_metadata', True, [rev_metadata]),
-            ('origin_intrinsic_metadata', True, [origin_metadata]),
-        ]
-
-        results = list(indexer.idx_storage.added_data)
+        results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
         for result in results:
-            metadata = result[2]
-            for item in metadata:
-                # cannot check those (generated ids)
-                del item['indexer_configuration_id']
+            del result['tool']
+        self.assertEqual(results, [rev_metadata])
 
-        self.assertCountEqual(expected_results, results)
+        results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+            origin['id']]))
+        for result in results:
+            del result['tool']
+        self.assertEqual(results, [origin_metadata])
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -323,6 +323,48 @@
     'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
     """,
     'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
+    '636465': b"""
+    {
+        "name": "yarn-parser",
+        "version": "1.0.0",
+        "description": "Tiny web service for parsing yarn.lock files",
+        "main": "index.js",
+        "scripts": {
+            "start": "node index.js",
+            "test": "mocha"
+        },
+        "engines": {
+            "node": "9.8.0"
+        },
+        "repository": {
+            "type": "git",
+            "url": "git+https://github.com/librariesio/yarn-parser.git"
+        },
+        "keywords": [
+            "yarn",
+            "parse",
+            "lock",
+            "dependencies"
+        ],
+        "author": "Andrew Nesbitt",
+        "license": "AGPL-3.0",
+        "bugs": {
+            "url": "https://github.com/librariesio/yarn-parser/issues"
+        },
+        "homepage": "https://github.com/librariesio/yarn-parser#readme",
+        "dependencies": {
+            "@yarnpkg/lockfile": "^1.0.0",
+            "body-parser": "^1.15.2",
+            "express": "^4.14.0"
+        },
+        "devDependencies": {
+            "chai": "^4.1.2",
+            "mocha": "^5.2.0",
+            "request": "^2.87.0",
+            "test": "^0.6.0"
+        }
+    }
+"""
 }
 
 CONTENT_METADATA = [{
@@ -478,12 +520,21 @@
     for origin in ORIGINS:
         origin = origin.copy()
         del origin['id']
-        last_origin_id = storage.origin_add_one(origin)
-        visit = storage.origin_visit_add(last_origin_id, datetime.datetime.now())
-    for (snap_id, snap_branches) in SNAPSHOTS.items():
-        storage.snapshot_add(last_origin_id, visit['visit'], {
+        storage.origin_add_one(origin)
+    for (orig_pseudo_id, snap) in SNAPSHOTS.items():
+        for orig in ORIGINS:
+            if orig_pseudo_id == orig['id']:
+                origin_id = storage.origin_get(
+                    {'type': orig['type'], 'url': orig['url']})['id']
+                break
+        else:
+            assert False
+        visit = storage.origin_visit_add(origin_id, datetime.datetime.now())
+        snap_id = snap.get('id') or \
+            bytes([random.randint(0, 255) for _ in range(32)])
+        storage.snapshot_add(origin_id, visit['visit'], {
             'id': snap_id,
-            'branches': snap_branches
+            'branches': snap['branches']
         })
     storage.revision_add(REVISIONS)
     storage.directory_add([{