diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -78,7 +78,6 @@ super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] - self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. @@ -91,12 +90,6 @@ } for sha1 in ids )) - def compute_ctags(self, path, lang): - """Compute ctags on file at path with language lang. - - """ - return run_ctags(path, lang=lang) - def index(self, id, data): """Index sha1s' content and store result. diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -71,22 +71,6 @@ def prepare(self): super().prepare() self.working_directory = self.config['workdir'] - self.tool = self.tools[0] - - def compute_license(self, path, log=None): - """Determine license from file at path. - - Args: - path: filepath to determine the license - - Returns: - dict: A dict with the following keys: - - - licenses ([str]): associated detected licenses to path - - path (bytes): content filepath - - """ - return compute_license(path, log=log) def index(self, id, data): """Index sha1s' content and store result. @@ -110,7 +94,7 @@ data=data) try: - properties = self.compute_license(path=content_path, log=self.log) + properties = compute_license(path=content_path, log=self.log) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -143,11 +143,20 @@ ADDITIONAL_CONFIG = {} - def __init__(self): + USE_TOOLS = True + + def __init__(self, config=None, **kw): """Prepare and check that the indexer is ready to run. 
""" super().__init__() + if config is not None: + self.config = config + else: + config_keys = ('base_filename', 'config_filename', + 'additional_configs', 'global_config') + config_args = {k: v for k, v in kw.items() if k in config_keys} + self.config = self.parse_config_file(**config_args) self.prepare() self.check() @@ -156,29 +165,36 @@ Without this step, the indexer cannot possibly run. """ - # HACK to deal with edge case (e.g revision metadata indexer) - if not hasattr(self, 'config'): - self.config = self.parse_config_file( - additional_configs=[self.ADDITIONAL_CONFIG]) config_storage = self.config.get('storage') if config_storage: self.storage = get_storage(**config_storage) + objstorage = self.config['objstorage'] - self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) + self.objstorage = get_objstorage(objstorage['cls'], + objstorage['args']) + idx_storage = self.config[INDEXER_CFG_KEY] self.idx_storage = get_indexer_storage(**idx_storage) _log = logging.getLogger('requests.packages.urllib3.connectionpool') _log.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') - self.tools = list(self.register_tools(self.config['tools'])) - def check(self, *, check_tools=True): + if self.USE_TOOLS: + self.tools = list(self.register_tools( + self.config.get('tools', []))) + self.results = [] + + @property + def tool(self): + return self.tools[0] + + def check(self): """Check the indexer's configuration is ok before proceeding. If ok, does nothing. If not raise error. """ - if check_tools and not self.tools: + if self.USE_TOOLS and not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) @@ -235,6 +251,18 @@ """ pass + def filter(self, ids): + """Filter missing ids for that particular indexer. 
+ + Args: + ids ([bytes]): list of ids + + Yields: + iterator of missing ids + + """ + yield from ids + @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. @@ -314,18 +342,6 @@ methods mentioned in the :class:`BaseIndexer` class. """ - @abc.abstractmethod - def filter(self, ids): - """Filter missing ids for that particular indexer. - - Args: - ids ([bytes]): list of ids - - Yields: - iterator of missing ids - - """ - pass def run(self, ids, policy_update, next_step=None, **kwargs): diff --git a/swh/indexer/language.py b/swh/indexer/language.py --- a/swh/indexer/language.py +++ b/swh/indexer/language.py @@ -132,9 +132,8 @@ def prepare(self): super().prepare() - c = self.config - self.max_content_size = c['tools']['configuration']['max_content_size'] - self.tool = self.tools[0] + self.max_content_size = self.tool['tool_configuration'].get( + 'max_content_size', 10240) def filter(self, ids): """Filter out known sha1s and return only missing ones. diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -6,6 +6,7 @@ import click import itertools import logging +from copy import deepcopy from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.metadata_dictionary import MAPPINGS @@ -32,16 +33,6 @@ # (not the case for example in the case of the RevisionMetadataIndexer) CONFIG_BASE_FILENAME = 'indexer/content_metadata' - def __init__(self, tool, config): - # FIXME: Simplify this twisted way to use the exact same - # config of RevisionMetadataIndexer object that uses - # internally ContentMetadataIndexer - self.config = config - self.config['tools'] = tool - self.results = [] - super().__init__() - self.tool = self.tools[0] # Tool is now registered (cf. prepare call) - def filter(self, ids): """Filter out known sha1s and return only missing ones. 
""" @@ -125,12 +116,6 @@ }), } - ContentMetadataIndexer = ContentMetadataIndexer - - def prepare(self): - super().prepare() - self.tool = self.tools[0] - def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. @@ -226,9 +211,11 @@ k: self.config[k] for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] } + config['tools'] = [tool] for context in detected_files.keys(): - tool['configuration']['context'] = context - c_metadata_indexer = self.ContentMetadataIndexer(tool, config) + cfg = deepcopy(config) + cfg['tools'][0]['configuration']['context'] = context + c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( @@ -271,12 +258,7 @@ 'tools': ('list', []) } - def check(self, **kwargs): - kwargs['check_tools'] = False - super().check(**kwargs) - - def filter(self, ids): - return ids + USE_TOOLS = False def run(self, origin_head, policy_update): """Expected to be called with the result of RevisionMetadataIndexer diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -48,10 +48,6 @@ CONFIG_BASE_FILENAME = 'indexer/mimetype' - def prepare(self): - super().prepare() - self.tool = self.tools[0] - def index(self, id, data): """Index sha1s' content and store result. diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -36,9 +36,6 @@ CONFIG_BASE_FILENAME = 'indexer/origin_head' - def filter(self, ids): - yield from ids - def persist_index_computations(self, results, policy_update): """Do nothing. 
The indexer's results are not persistent, they should only be piped to another indexer.""" diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py --- a/swh/indexer/tests/test_fossology_license.py +++ b/swh/indexer/tests/test_fossology_license.py @@ -7,6 +7,7 @@ from unittest.mock import patch +from swh.indexer import fossology_license from swh.indexer.fossology_license import ( FossologyLicenseIndexer, FossologyLicenseRangeIndexer, compute_license @@ -42,23 +43,19 @@ }) -class InjectLicenseIndexer: - """Override license computations. +def mock_compute_license(path, log=None): + """path is the content identifier """ - def compute_license(self, path, log=None): - """path is the content identifier - - """ - if isinstance(id, bytes): - path = path.decode('utf-8') - return { - 'licenses': SHA1_TO_LICENSES.get(path) - } + if isinstance(path, bytes): + path = path.decode('utf-8') + return { + 'licenses': SHA1_TO_LICENSES.get(path) + } class FossologyLicenseTestIndexer( - NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseIndexer): + NoDiskIndexer, FossologyLicenseIndexer): """Specific fossology license whose configuration is enough to satisfy the indexing checks. @@ -90,6 +87,10 @@ def setUp(self): super().setUp() + # replace actual license computation with a mock + self.orig_compute_license = fossology_license.compute_license + fossology_license.compute_license = mock_compute_license + self.indexer = FossologyLicenseTestIndexer() self.idx_storage = self.indexer.idx_storage fill_storage(self.indexer.storage) @@ -117,9 +118,13 @@ } } + def tearDown(self): + super().tearDown() + fossology_license.compute_license = self.orig_compute_license + class FossologyLicenseRangeIndexerTest( - NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer): + NoDiskIndexer, FossologyLicenseRangeIndexer): """Testing the range indexer on fossology license. 
""" @@ -150,6 +155,11 @@ """ def setUp(self): super().setUp() + + # replace actual license computation with a mock + self.orig_compute_license = fossology_license.compute_license + fossology_license.compute_license = mock_compute_license + self.indexer = FossologyLicenseRangeIndexerTest() fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) @@ -176,6 +186,10 @@ } } + def tearDown(self): + super().tearDown() + fossology_license.compute_license = self.orig_compute_license + class FossologyLicenseIndexerUnknownToolTestStorage( CommonIndexerNoTool, FossologyLicenseTestIndexer): diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -192,8 +192,9 @@ ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping - metadata_indexer = ContentMetadataTestIndexer( - tool=TRANSLATOR_TOOL, config=BASE_TEST_CONFIG.copy()) + config = BASE_TEST_CONFIG.copy() + config['tools'] = [TRANSLATOR_TOOL] + metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage)