diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -5,8 +5,9 @@ import json import subprocess -from typing import Dict, List +from typing import Any, Dict, List +from swh.core.config import merge_configs from swh.model import hashutil from .indexer import ContentIndexer, write_to_temp @@ -59,35 +60,24 @@ } +DEFAULT_CONFIG: Dict[str, Any] = { + "workdir": "/tmp/swh/indexer.ctags", + "tools": { + "name": "universal-ctags", + "version": "~git7859817b", + "configuration": { + "command_line": """ctags --fields=+lnz --sort=no --links=no """ + """--output-format=json """ + }, + }, + "languages": {}, +} + + class CtagsIndexer(ContentIndexer): - CONFIG_BASE_FILENAME = "indexer/ctags" - - ADDITIONAL_CONFIG = { - "workdir": ("str", "/tmp/swh/indexer.ctags"), - "tools": ( - "dict", - { - "name": "universal-ctags", - "version": "~git7859817b", - "configuration": { - "command_line": """ctags --fields=+lnz --sort=no --links=no """ - """--output-format=json """ - }, - }, - ), - "languages": ( - "dict", - { - "ada": "Ada", - "adl": None, - "agda": None, - # ... - }, - ), - } - - def prepare(self): - super().prepare() + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.config = merge_configs(DEFAULT_CONFIG, self.config) self.working_directory = self.config["workdir"] self.language_map = self.config["languages"] diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -7,6 +7,7 @@ import subprocess from typing import Any, Dict, List, Optional, Union +from swh.core.config import merge_configs from swh.indexer.storage.interface import PagedResult, Sha1 from swh.model import hashutil from swh.model.model import Revision @@ -53,6 +54,17 @@ } +DEFAULT_CONFIG: Dict[str, Any] = { + "workdir": "/tmp/swh/indexer.fossology.license", + "tools": { + "name": "nomos", + "version": "3.1.0rc2-31-ga2cbb8c", + "configuration": {"command_line": "nomossa ",}, + }, + "write_batch_size": 1000, +} + + class MixinFossologyLicenseIndexer: """Mixin fossology license indexer. @@ -61,25 +73,12 @@ """ - ADDITIONAL_CONFIG = { - "workdir": ("str", "/tmp/swh/indexer.fossology.license"), - "tools": ( - "dict", - { - "name": "nomos", - "version": "3.1.0rc2-31-ga2cbb8c", - "configuration": {"command_line": "nomossa ",}, - }, - ), - "write_batch_size": ("int", 1000), - } - - CONFIG_BASE_FILENAME = "indexer/fossology_license" # type: Optional[str] tool: Any idx_storage: Any - def prepare(self): - super().prepare() + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.config = merge_configs(DEFAULT_CONFIG, self.config) self.working_directory = self.config["workdir"] def index( diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -9,10 +9,10 @@ import os import shutil import tempfile -from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Set, Union from swh.core import utils -from swh.core.config import SWHConfig +from swh.core.config import load_from_envvar, merge_configs from swh.indexer.storage import INDEXER_CFG_KEY, PagedResult, Sha1, get_indexer_storage from swh.model import hashutil from swh.model.model import Revision @@ -49,7 +49,14 @@ shutil.rmtree(temp_dir) -class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): +DEFAULT_CONFIG = { + INDEXER_CFG_KEY: {"cls": "memory"}, + "storage": {"cls": "memory"}, + "objstorage": {"cls": "memory"}, +} + + +class BaseIndexer(metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in @@ -104,25 +111,6 @@ results: List[Dict] - CONFIG = "indexer/base" - - DEFAULT_CONFIG = { - INDEXER_CFG_KEY: ( - "dict", - {"cls": "remote", "args": {"url": "http://localhost:5007/"}}, - ), - "storage": ( - "dict", - {"cls": "remote", "args": {"url": "http://localhost:5002/",}}, - ), - "objstorage": ( - "dict", - {"cls": "remote", "args": {"url": "http://localhost:5003/",}}, - ), - } - - ADDITIONAL_CONFIG = {} # type: Dict[str, Tuple[str, Any]] - USE_TOOLS = True catch_exceptions = True @@ -141,18 +129,8 @@ elif SWH_CONFIG: self.config = SWH_CONFIG.copy() else: - config_keys = ( - "base_filename", - "config_filename", - "additional_configs", - "global_config", - ) - config_args = {k: v for k, v in kw.items() if k in config_keys} - if self.ADDITIONAL_CONFIG: - config_args.setdefault("additional_configs", []).append( - self.ADDITIONAL_CONFIG - ) - self.config = self.parse_config_file(**config_args) + self.config = load_from_envvar() + self.config = merge_configs(DEFAULT_CONFIG, self.config) self.prepare() self.check() self.log.debug("%s: config=%s", self, self.config) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -6,6 +6,7 @@ from copy import deepcopy from typing import Any, Callable, Dict, Iterator, List, Tuple +from swh.core.config import merge_configs from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer @@ -101,6 +102,15 @@ ) +DEFAULT_CONFIG: Dict[str, Any] = { + "tools": { + "name": "swh-metadata-detector", + "version": "0.0.2", + "configuration": {}, + }, +} + + class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer @@ -116,12 +126,9 @@ """ - ADDITIONAL_CONFIG = { - "tools": ( - "dict", - {"name": "swh-metadata-detector", "version": "0.0.2", "configuration": {},}, - ), - } + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.config = merge_configs(DEFAULT_CONFIG, self.config) def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. @@ -272,8 +279,6 @@ class OriginMetadataIndexer(OriginIndexer): - ADDITIONAL_CONFIG = RevisionMetadataIndexer.ADDITIONAL_CONFIG - USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -7,6 +7,7 @@ import magic +from swh.core.config import merge_configs from swh.indexer.storage.interface import PagedResult, Sha1 from swh.model.model import Revision @@ -41,6 +42,16 @@ } +DEFAULT_CONFIG: Dict[str, Any] = { + "tools": { + "name": "file", + "version": "1:5.30-1+deb9u1", + "configuration": {"type": "library", "debian-package": "python3-magic"}, + }, + "write_batch_size": 1000, +} + + class MixinMimetypeIndexer: """Mixin mimetype indexer. @@ -50,19 +61,10 @@ tool: Any idx_storage: Any - ADDITIONAL_CONFIG = { - "tools": ( - "dict", - { - "name": "file", - "version": "1:5.30-1+deb9u1", - "configuration": {"type": "library", "debian-package": "python3-magic"}, - }, - ), - "write_batch_size": ("int", 1000), - } - CONFIG_BASE_FILENAME = "indexer/mimetype" # type: Optional[str] + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.config = merge_configs(DEFAULT_CONFIG, self.config) def index( self, id: Union[bytes, Dict, Revision], data: Optional[bytes] = None, **kwargs diff --git a/swh/indexer/rehash.py b/swh/indexer/rehash.py --- a/swh/indexer/rehash.py +++ b/swh/indexer/rehash.py @@ -9,15 +9,30 @@ from typing import Any, Dict, Generator, List, Optional, Tuple from swh.core import utils -from swh.core.config import SWHConfig +from swh.core.config import load_from_envvar from swh.model import hashutil from swh.model.model import Content from swh.objstorage.exc import ObjNotFoundError from swh.objstorage.factory import get_objstorage from swh.storage import get_storage - -class RecomputeChecksums(SWHConfig): +DEFAULT_CONFIG: Dict[str, Any] = { + "storage": {"cls": "memory"}, + "objstorage": {"cls": "memory"}, + # the set of checksums that should be computed. + # Examples: 'sha1_git', 'blake2b512', 'blake2s256' + "compute_checksums": [], + # whether checksums that already exist in the DB should be + # recomputed/updated or left untouched + "recompute_checksums": False, + # Number of contents to retrieve blobs at the same time + "batch_size_retrieve_content": 10, + # Number of contents to update at the same time + "batch_size_update": 100, +} + + +class RecomputeChecksums: """Class in charge of (re)computing content's hashes. Hashes to compute are defined across 2 configuration options: @@ -35,39 +50,8 @@ """ - DEFAULT_CONFIG = { - # The storage to read from or update metadata to - "storage": ( - "dict", - {"cls": "remote", "args": {"url": "http://localhost:5002/"},}, - ), - # The objstorage to read contents' data from - "objstorage": ( - "dict", - { - "cls": "pathslicing", - "args": { - "root": "/srv/softwareheritage/objects", - "slicing": "0:2/2:4/4:6", - }, - }, - ), - # the set of checksums that should be computed. - # Examples: 'sha1_git', 'blake2b512', 'blake2s256' - "compute_checksums": ("list[str]", []), - # whether checksums that already exist in the DB should be - # recomputed/updated or left untouched - "recompute_checksums": ("bool", False), - # Number of contents to retrieve blobs at the same time - "batch_size_retrieve_content": ("int", 10), - # Number of contents to update at the same time - "batch_size_update": ("int", 100), - } - - CONFIG_BASE_FILENAME = "indexer/rehash" - def __init__(self) -> None: - self.config = self.parse_config_file() + self.config = load_from_envvar(DEFAULT_CONFIG) self.storage = get_storage(**self.config["storage"]) self.objstorage = get_objstorage(**self.config["objstorage"]) self.compute_checksums = self.config["compute_checksums"] diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -3,11 +3,14 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy from datetime import datetime, timezone import unittest +import pytest + from swh.indexer.origin_head import OriginHeadIndexer -from swh.indexer.tests.utils import BASE_TEST_CONFIG, fill_storage +from swh.indexer.tests.utils import fill_storage from swh.model.model import ( Origin, OriginVisit, @@ -18,11 +21,24 @@ ) from swh.storage.utils import now -ORIGIN_HEAD_CONFIG = { - **BASE_TEST_CONFIG, - "tools": {"name": "origin-metadata", "version": "0.0.1", "configuration": {},}, - "tasks": {"revision_intrinsic_metadata": None, "origin_intrinsic_metadata": None,}, -} + +@pytest.fixture +def swh_indexer_config(swh_indexer_config): + config = copy.deepcopy(swh_indexer_config) + config.update( + { + "tools": { + "name": "origin-metadata", + "version": "0.0.1", + "configuration": {}, + }, + "tasks": { + "revision_intrinsic_metadata": None, + "origin_intrinsic_metadata": None, + }, + } + ) + return config class OriginHeadTestIndexer(OriginHeadIndexer): @@ -30,15 +46,14 @@ indexing tests. """ - def parse_config_file(self, *args, **kwargs): - return ORIGIN_HEAD_CONFIG - def persist_index_computations(self, results, policy_update): self.results = results class OriginHead(unittest.TestCase): - def setUp(self): + @pytest.fixture(autouse=True) + def init(self, swh_config): + super().setUp() self.indexer = OriginHeadTestIndexer() self.indexer.catch_exceptions = False fill_storage(self.indexer.storage)