Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/tests/test_origin_metadata.py
# Copyright (C) 2018 The Software Heritage developers | # Copyright (C) 2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import time
import unittest
import unittest.mock

from celery import task

from swh.model.hashutil import hash_to_bytes
from swh.storage.in_memory import Storage

from swh.indexer.metadata import (
    OriginMetadataIndexer, RevisionMetadataIndexer
)
from swh.indexer.storage.in_memory import IndexerStorage
from swh.objstorage.objstorage_in_memory import InMemoryObjStorage
from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture

from .test_utils import (
    BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
from .test_origin_head import OriginHeadTestIndexer
from .test_metadata import ContentMetadataTestIndexer
class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
    """Specific indexer whose configuration is enough to satisfy the
    indexing tests.
    """
    # Use the test content indexer so content-level metadata extraction
    # runs against the test fixtures rather than real storage.
    ContentMetadataIndexer = ContentMetadataTestIndexer

    def parse_config_file(self, *args, **kwargs):
        """Return a hard-coded test configuration instead of reading a
        config file from disk.

        Returns:
            dict: base test config plus the swh-metadata-detector tool
            set up for the NpmMapping context.
        """
        return {
            **BASE_TEST_CONFIG,
            'tools': {
                'name': 'swh-metadata-detector',
                'version': '0.0.2',
                'configuration': {
                    'type': 'local',
                    'context': 'NpmMapping'
                }
            }
        }
@task
def revision_metadata_test_task(*args, **kwargs):
    """Celery task wrapping :class:`RevisionMetadataTestIndexer`.

    Runs the indexer with the given arguments and returns its results so
    the scheduler-driven test can inspect them.
    """
    indexer = RevisionMetadataTestIndexer()
    indexer.run(*args, **kwargs)
    return indexer.results
class OriginMetadataTestIndexer(OriginMetadataIndexer):
    """Origin metadata indexer with a hard-coded test configuration."""

    def parse_config_file(self, *args, **kwargs):
        """Return a hard-coded test configuration instead of reading a
        config file from disk.

        Returns:
            dict: base test config with no tools (the origin indexer
            reuses results produced by the revision indexer).
        """
        return {
            **BASE_TEST_CONFIG,
            'tools': []
        }
@task
def origin_intrinsic_metadata_test_task(*args, **kwargs):
    """Celery task wrapping :class:`OriginMetadataTestIndexer`.

    Runs the indexer with the given arguments and returns its results so
    the scheduler-driven test can inspect them.
    """
    indexer = OriginMetadataTestIndexer()
    indexer.run(*args, **kwargs)
    return indexer.results
# NOTE: deliberately shadows the imported OriginHeadTestIndexer so the rest
# of this module picks up the task-routing override below.
class OriginHeadTestIndexer(OriginHeadTestIndexer):
    """Origin-head indexer that routes its follow-up work to the test
    tasks defined in this module.
    """

    def prepare(self):
        super().prepare()
        # Point the pipeline's downstream tasks at the celery tasks
        # registered by this test module.
        self.config['tasks'] = {
            'revision_metadata': 'revision_metadata_test_task',
            'origin_intrinsic_metadata': 'origin_intrinsic_metadata_test_task',
        }
class TestOriginMetadata(SchedulerTestFixture, unittest.TestCase):
    """End-to-end test of the origin-head -> revision-metadata ->
    origin-intrinsic-metadata indexing pipeline, driven through the
    scheduler with in-memory storage backends.
    """

    def setUp(self):
        super().setUp()
        self.maxDiff = None
        self.add_scheduler_task_type(
            'revision_metadata_test_task',
            'swh.indexer.tests.test_origin_metadata.'
            'revision_metadata_test_task')
        self.add_scheduler_task_type(
            'origin_intrinsic_metadata_test_task',
            'swh.indexer.tests.test_origin_metadata.'
            'origin_intrinsic_metadata_test_task')
        # Class-level attribute: the celery tasks instantiate the indexer
        # themselves, so they need the scheduler available on the class.
        RevisionMetadataTestIndexer.scheduler = self.scheduler

    def tearDown(self):
        del RevisionMetadataTestIndexer.scheduler
        super().tearDown()

    @unittest.mock.patch('swh.indexer.storage.in_memory.IndexerStorage')
    @unittest.mock.patch('swh.storage.in_memory.Storage')
    def test_pipeline(self, storage_mock, idx_storage_mock):
        # Always return the same instances of the storages, because the
        # patched constructors are called by each of the three indexers
        # and they must all share state.
        objstorage = InMemoryObjStorage()
        storage = Storage()
        idx_storage = IndexerStorage()
        storage_mock.return_value = storage
        idx_storage_mock.return_value = idx_storage

        fill_obj_storage(objstorage)
        fill_storage(storage)

        # TODO: find a better way to share the ContentMetadataIndexer use
        # the same objstorage instance.
        import swh.objstorage
        old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory']
        swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage
        try:
            indexer = OriginHeadTestIndexer()
            indexer.scheduler = self.scheduler
            indexer.run(["git+https://github.com/librariesio/yarn-parser"])

            self.run_ready_tasks()  # Run the first task
            # Give it time to complete and schedule the 2nd one
            time.sleep(0.1)
            self.run_ready_tasks()  # Run the second task
        finally:
            # Restore the registry even if the pipeline blows up, so other
            # tests are not polluted by the shared objstorage instance.
            swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage

        origin = storage.origin_get({
            'type': 'git',
            'url': 'https://github.com/librariesio/yarn-parser'})
        rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

        metadata = {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'url':
                'https://github.com/librariesio/yarn-parser#readme',
            'codeRepository':
                'git+git+https://github.com/librariesio/yarn-parser.git',
            'author': [{
                'type': 'Person',
                'name': 'Andrew Nesbitt'
            }],
            'license': 'https://spdx.org/licenses/AGPL-3.0',
            'version': '1.0.0',
            'description':
                'Tiny web service for parsing yarn.lock files',
            'issueTracker':
                'https://github.com/librariesio/yarn-parser/issues',
            'name': 'yarn-parser',
            'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
        }
        rev_metadata = {
            'id': rev_id,
            'translated_metadata': metadata,
        }
        origin_metadata = {
            'origin_id': origin['id'],
            'from_revision': rev_id,
            'metadata': metadata,
        }

        results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
        for result in results:
            # The tool dict contains a generated id; drop it before
            # comparing against the expected fixtures.
            del result['tool']
        self.assertEqual(results, [rev_metadata])

        results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
            origin['id']]))
        for result in results:
            del result['tool']
        self.assertEqual(results, [origin_metadata])