diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -160,7 +160,8 @@ It: - filters out the non textual content - - (optionally) filters out content already indexed (cf :callable:`range`) + - (optionally) filters out content already indexed (cf + :func:`indexed_contents_in_range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -5,17 +5,20 @@ import unittest -from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS -from swh.indexer.metadata_detector import detect_metadata -from swh.indexer.metadata_detector import extract_minimal_metadata_dict -from swh.indexer.metadata import ContentMetadataIndexer -from swh.indexer.metadata import RevisionMetadataIndexer -from swh.indexer.tests.test_utils import MockObjStorage, MockStorage -from swh.indexer.tests.test_utils import MockIndexerStorage - from swh.model.hashutil import hash_to_bytes -from .test_utils import BASE_TEST_CONFIG +from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS +from swh.indexer.metadata_detector import ( + detect_metadata, extract_minimal_metadata_dict +) +from swh.indexer.metadata import ( + ContentMetadataIndexer, RevisionMetadataIndexer +) + +from .test_utils import ( + MockObjStorage, MockStorage, MockIndexerStorage, + BASE_TEST_CONFIG +) class ContentMetadataTestIndexer(ContentMetadataIndexer): diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -4,62 +4,33 @@ # See top-level LICENSE file for more information import time -import logging import unittest -from celery import task - -from swh.indexer.metadata import OriginMetadataIndexer, \ - RevisionMetadataIndexer, ContentMetadataIndexer -from swh.indexer.tests.test_utils import MockObjStorage, MockStorage -from swh.indexer.tests.test_utils import MockIndexerStorage -from swh.indexer.tests.test_origin_head import OriginHeadTestIndexer - -from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture +from celery import task from swh.model.hashutil import hash_to_bytes -from .test_utils import BASE_TEST_CONFIG - +from swh.indexer.metadata import ( + OriginMetadataIndexer, RevisionMetadataIndexer +) -class OriginMetadataTestIndexer(OriginMetadataIndexer): - def parse_config_file(self, *args, **kwargs): - return { - **BASE_TEST_CONFIG, - 'tools': [], - } - - def prepare(self): - super().prepare() - self.storage = MockStorage() - self.idx_storage = MockIndexerStorage() - self.objstorage = MockObjStorage() - - -class ContentMetadataTestIndexer(ContentMetadataIndexer): - """Specific Metadata whose configuration is enough to satisfy the - indexing tests. - """ - def prepare(self): - self.idx_storage = MockIndexerStorage() - self.log = logging.getLogger('swh.indexer') - self.objstorage = MockObjStorage() - self.tools = self.register_tools(self.config['tools']) - self.tool = self.tools[0] - self.results = [] +from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture +from .test_utils import ( + MockObjStorage, MockStorage, MockIndexerStorage, + BASE_TEST_CONFIG +) +from .test_origin_head import OriginHeadTestIndexer +from .test_metadata import ContentMetadataTestIndexer class RevisionMetadataTestIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ - ContentMetadataIndexer = ContentMetadataTestIndexer - def prepare(self): - self.config = { - 'storage': {}, - 'objstorage': {}, - 'indexer_storage': {}, + def parse_config_file(self, *args, **kwargs): + return { + **BASE_TEST_CONFIG, 'tools': { 'name': 'swh-metadata-detector', 'version': '0.0.2', @@ -69,12 +40,12 @@ } } } - self.storage = MockStorage() + + def prepare(self): + super().prepare() self.idx_storage = MockIndexerStorage() - self.log = logging.getLogger('swh.indexer') + self.storage = MockStorage() self.objstorage = MockObjStorage() - self.tools = self.register_tools(self.config['tools']) - self.tool = self.tools[0] @task @@ -84,6 +55,20 @@ return indexer.results +class OriginMetadataTestIndexer(OriginMetadataIndexer): + def parse_config_file(self, *args, **kwargs): + return { + **BASE_TEST_CONFIG, + 'tools': [] + } + + def prepare(self): + super().prepare() + self.storage = MockStorage() + self.objstorage = MockObjStorage() + self.idx_storage = MockIndexerStorage() + + @task def origin_intrinsic_metadata_test_task(*args, **kwargs): indexer = OriginMetadataTestIndexer() @@ -149,18 +134,23 @@ rev_metadata = { 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'translated_metadata': metadata, - 'indexer_configuration_id': 7, } origin_metadata = { 'origin_id': 54974445, 'from_revision': hash_to_bytes( '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'metadata': metadata, - 'indexer_configuration_id': 7, } expected_results = [ - ('origin_intrinsic_metadata', True, [origin_metadata]), - ('revision_metadata', True, [rev_metadata])] + ('revision_metadata', True, [rev_metadata]), + ('origin_intrinsic_metadata', True, [origin_metadata]), + ] results = list(indexer.idx_storage.added_data) + for result in results: + metadata = result[2] + for item in metadata: + # cannot check those (generated ids) + del item['indexer_configuration_id'] + self.assertCountEqual(expected_results, results) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -357,7 +357,15 @@ for item in self.revision_metadata.get(id_): item = item.copy() tool_id = item.pop('indexer_configuration_id') - item['tool'] = self.tools[tool_id].copy() + if tool_id in self.tools: + item['tool'] = self.tools[tool_id].copy() + else: # HACK: this needs to be removed altogether + item['tool'] = { + 'id': tool_id, + 'name': tool_id[0], + 'version': tool_id[1], + 'configuration': tool_id[2], + } yield item def origin_intrinsic_metadata_add(self, metadata, conflict_update=None):