diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -5,12 +5,12 @@ import json import subprocess -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, Dict, Iterator, List, Optional from swh.core.config import merge_configs +from swh.indexer.storage import Sha1 from swh.indexer.storage.model import ContentCtagsRow from swh.model import hashutil -from swh.model.model import Revision from .indexer import ContentIndexer, write_to_temp @@ -92,7 +92,7 @@ ) def index( - self, id: Union[bytes, Dict, Revision], data: Optional[bytes] = None, **kwargs + self, id: Sha1, data: Optional[bytes] = None, **kwargs ) -> List[ContentCtagsRow]: """Index sha1s' content and store result. diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -5,13 +5,12 @@ import logging import subprocess -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from swh.core.config import merge_configs from swh.indexer.storage.interface import IndexerStorageInterface, PagedResult, Sha1 from swh.indexer.storage.model import ContentLicenseRow from swh.model import hashutil -from swh.model.model import Revision from .indexer import ContentIndexer, ContentPartitionIndexer, write_to_temp @@ -83,7 +82,7 @@ self.working_directory = self.config["workdir"] def index( - self, id: Union[bytes, Dict, Revision], data: Optional[bytes] = None, **kwargs + self, id: Sha1, data: Optional[bytes] = None, **kwargs ) -> List[ContentLicenseRow]: """Index sha1s' content and store result. @@ -100,7 +99,6 @@ - indexer_configuration_id (int): tool used to compute the output """ - assert isinstance(id, bytes) assert data is not None with write_to_temp( filename=hashutil.hash_to_hex(id), # use the id as pathname diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -16,6 +16,7 @@ from swh.indexer.storage import INDEXER_CFG_KEY, PagedResult, Sha1, get_indexer_storage from swh.indexer.storage.interface import IndexerStorageInterface from swh.model import hashutil +from swh.model.model import Revision, Sha1Git from swh.objstorage.exc import ObjNotFoundError from swh.objstorage.factory import get_objstorage from swh.scheduler import CONFIG as SWH_CONFIG @@ -57,10 +58,15 @@ } +TId = TypeVar("TId") +"""type of the ids of index()ed objects.""" +TData = TypeVar("TData") +"""type of the objects passed to index().""" TResult = TypeVar("TResult") +"""return type of index()""" -class BaseIndexer(Generic[TResult], metaclass=abc.ABCMeta): +class BaseIndexer(Generic[TId, TData, TResult], metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in @@ -216,7 +222,7 @@ else: return [] - def index(self, id, data: Optional[bytes] = None, **kwargs) -> List[TResult]: + def index(self, id: TId, data: Optional[TData], **kwargs) -> List[TResult]: """Index computation for the id and associated raw data. Args: @@ -231,7 +237,7 @@ """ raise NotImplementedError() - def filter(self, ids: List[bytes]) -> Iterator[bytes]: + def filter(self, ids: List[TId]) -> Iterator[TId]: """Filter missing ids for that particular indexer. Args: @@ -263,7 +269,7 @@ return {} -class ContentIndexer(BaseIndexer[TResult], Generic[TResult]): +class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]): """A content indexer working on a list of ids directly. To work on indexer partition, use the :class:`ContentPartitionIndexer` @@ -275,9 +281,7 @@ """ - def run( - self, ids: Union[List[bytes], bytes, str], policy_update: str, **kwargs - ) -> Dict: + def run(self, ids: List[Sha1], policy_update: str, **kwargs) -> Dict: """Given a list of ids: - retrieve the content from the storage @@ -324,7 +328,7 @@ return summary -class ContentPartitionIndexer(BaseIndexer[TResult], Generic[TResult]): +class ContentPartitionIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]): """A content partition indexer. This expects as input a partition_id and a nb_partitions. This will then index the @@ -493,7 +497,7 @@ return summary -class OriginIndexer(BaseIndexer[TResult], Generic[TResult]): +class OriginIndexer(BaseIndexer[str, None, TResult], Generic[TResult]): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method @@ -549,7 +553,7 @@ return results -class RevisionIndexer(BaseIndexer[TResult], Generic[TResult]): +class RevisionIndexer(BaseIndexer[Sha1Git, Revision, TResult], Generic[TResult]): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Revision indexing using the run method @@ -560,7 +564,7 @@ """ - def run(self, ids: Union[str, bytes], policy_update: str) -> Dict: + def run(self, ids: List[Sha1Git], policy_update: str) -> Dict: """Given a list of sha1_gits: - retrieve revisions from storage @@ -579,15 +583,15 @@ revision_ids = [ hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids ] - for rev in self.storage.revision_get(revision_ids): + for (rev_id, rev) in zip(revision_ids, self.storage.revision_get(revision_ids)): if not rev: + # TODO: call self.index() with rev=None? self.log.warning( - "Revisions %s not found in storage" - % list(map(hashutil.hash_to_hex, ids)) + "Revision %s not found in storage", hashutil.hash_to_hex(rev_id) ) continue try: - results.extend(self.index(rev)) + results.extend(self.index(rev_id, rev)) except Exception: if not self.catch_exceptions: raise diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -23,14 +23,14 @@ from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.origin_head import OriginHeadIndexer -from swh.indexer.storage import INDEXER_CFG_KEY +from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 from swh.indexer.storage.model import ( ContentMetadataRow, OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow, ) from swh.model import hashutil -from swh.model.model import Revision +from swh.model.model import Revision, Sha1Git REVISION_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 @@ -71,7 +71,11 @@ ) def index( - self, id, data: Optional[bytes] = None, log_suffix="unknown revision", **kwargs + self, + id: Sha1, + data: Optional[bytes] = None, + log_suffix="unknown revision", + **kwargs, ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. @@ -162,7 +166,9 @@ ) ) - def index(self, id, data=None, **kwargs) -> List[RevisionIntrinsicMetadataRow]: + def index( + self, id: Sha1Git, data: Optional[Revision], **kwargs + ) -> List[RevisionIntrinsicMetadataRow]: """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames @@ -171,7 +177,8 @@ - if multiple file detected -> translation needed at revision level Args: - rev: revision model object from storage + id: sha1_git of the revision + data: revision model object from storage Returns: dict: dictionary representing a revision_intrinsic_metadata, with @@ -182,9 +189,8 @@ - metadata: dict of retrieved metadata """ - rev = id + rev = data assert isinstance(rev, Revision) - assert data is None try: root_dir = rev.directory @@ -343,7 +349,7 @@ self.log.warning("Missing head revision of origin %r", origin.url) continue - for rev_metadata in self.revision_metadata_indexer.index(rev): + for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev): # There is at most one rev_metadata orig_metadata = OriginIntrinsicMetadataRow( from_revision=rev_metadata.id, diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -3,14 +3,13 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional import magic from swh.core.config import merge_configs from swh.indexer.storage.interface import IndexerStorageInterface, PagedResult, Sha1 from swh.indexer.storage.model import ContentMimetypeRow -from swh.model.model import Revision from .indexer import ContentIndexer, ContentPartitionIndexer @@ -68,7 +67,7 @@ self.config = merge_configs(DEFAULT_CONFIG, self.config) def index( - self, id: Union[bytes, Dict, Revision], data: Optional[bytes] = None, **kwargs + self, id: Sha1, data: Optional[bytes] = None, **kwargs ) -> List[ContentMimetypeRow]: """Index sha1s' content and store result. @@ -84,7 +83,6 @@ - encoding: encoding in bytes """ - assert isinstance(id, bytes) assert data is not None properties = compute_mimetype_encoding(data) return [ diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -5,7 +5,7 @@ import logging import re -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Tuple, Union import click @@ -15,7 +15,7 @@ from swh.storage.algos.snapshot import snapshot_get_all_branches -class OriginHeadIndexer(OriginIndexer[Optional[Dict]]): +class OriginHeadIndexer(OriginIndexer[Dict]): """Origin-level indexer. This indexer is in charge of looking up the revision that acts as the @@ -34,13 +34,15 @@ # Dispatch - def index(self, origin_url): + def index(self, id: str, data: None = None, **kwargs) -> List[Dict]: + origin_url = id visit_and_status = origin_get_latest_visit_status( self.storage, origin_url, allowed_statuses=["full"], require_snapshot=True ) if not visit_and_status: return [] visit, visit_status = visit_and_status + assert visit_status.snapshot is not None snapshot = snapshot_get_all_branches(self.storage, visit_status.snapshot) if snapshot is None: return [] diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py --- a/swh/indexer/tests/test_indexer.py +++ b/swh/indexer/tests/test_indexer.py @@ -3,7 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from unittest.mock import Mock import pytest @@ -15,7 +15,6 @@ RevisionIndexer, ) from swh.indexer.storage import PagedResult, Sha1 -from swh.model.model import Revision from .utils import BASE_TEST_CONFIG @@ -28,7 +27,7 @@ USE_TOOLS = False def index( - self, id: Union[bytes, Dict, Revision], data: Optional[bytes] = None, **kwargs + self, id: Any, data: Optional[Any] = None, **kwargs ) -> List[Dict[str, Any]]: raise _TestException()