# Patch summary (commentary only; everything above the first "diff --git"
# header is outside the hunks).
#
# swh/indexer/indexer.py
#   Removes the `yield from self._index_contents(...)` call that ran on every
#   page of the pagination loop, and instead calls `_index_contents` once,
#   via `return`, after the loop has broken out on `next_page_token is None`.
#   By then `contents` holds the sha1s collected from every page of
#   `indexed_page.results`.
#
# swh/indexer/tests/test_indexer.py
#   Adds `TrivialContentPartitionIndexer`, whose
#   `indexed_contents_in_partition` returns two pages (b"excluded hash", then
#   b"other excluded hash"), and `test_content_partition_indexer`, which feeds
#   five contents through a mocked storage and asserts that exactly one batch
#   containing hash1/hash2/hash3 reaches `persist_index_computations`.
#   Presumably `_index_contents` skips the sha1s passed in `contents`; its
#   body is not part of this patch -- confirm against the target tree.
#
# NOTE(review): this patch had been flattened onto two physical lines. Line
# breaks and blank context lines were restored to match the hunk headers'
# line counts; the indentation of context lines is a reconstruction and
# should be verified against the target files before applying.
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -437,11 +437,12 @@
             )
             for sha1 in indexed_page.results:
                 contents.add(sha1)
-            yield from self._index_contents(partition_id, nb_partitions, contents)
             next_page_token = indexed_page.next_page_token
             if next_page_token is None:
                 break
 
+        return self._index_contents(partition_id, nb_partitions, contents)
+
     def run(
         self,
         partition_id: int,
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -15,6 +15,7 @@
     RevisionIndexer,
 )
 from swh.indexer.storage import PagedResult, Sha1
+from swh.model.model import Content
 
 from .utils import BASE_TEST_CONFIG
 
@@ -56,6 +57,27 @@
     pass
 
 
+class TrivialContentPartitionIndexer(ContentPartitionIndexer[str]):
+    USE_TOOLS = False
+
+    def index(self, id: bytes, data: Optional[bytes], **kwargs) -> List[str]:
+        return ["indexed " + id.decode()]
+
+    def indexed_contents_in_partition(
+        self, partition_id: int, nb_partitions: int, page_token: Optional[str] = None
+    ) -> PagedResult[Sha1]:
+        if page_token is None:
+            return PagedResult(results=[b"excluded hash"], next_page_token="not none")
+        elif page_token == "not none":
+            return PagedResult(results=[b"other excluded hash"], next_page_token=None)
+        else:
+            assert False, page_token
+
+    def persist_index_computations(self, results: List[str]) -> Dict[str, int]:
+        self._results.append(results)  # type: ignore
+        return {"nb_added": len(results)}
+
+
 def test_content_indexer_catch_exceptions():
     indexer = CrashingContentIndexer(config=BASE_TEST_CONFIG)
     indexer.objstorage = Mock()
@@ -104,3 +126,32 @@
 
     with pytest.raises(_TestException):
         indexer.run(0, 42)
+
+
+def test_content_partition_indexer():
+    # TODO: simplify the mocking in this test
+    indexer = TrivialContentPartitionIndexer(
+        config={**BASE_TEST_CONFIG, "write_batch_size": 10,}  # doesn't matter
+    )
+    indexer.catch_exceptions = False
+    indexer._results = []
+    indexer.storage = Mock()
+    indexer.storage.content_get_partition = lambda *args, **kwargs: PagedResult(
+        results=[
+            Content(sha1=c, sha1_git=c, sha256=c, blake2s256=c, length=42)
+            for c in [
+                b"hash1",
+                b"excluded hash",
+                b"hash2",
+                b"other excluded hash",
+                b"hash3",
+            ]
+        ],
+        next_page_token=None,
+    )
+    indexer.objstorage = Mock()
+    indexer.objstorage.get = lambda id: b"foo"
+    nb_partitions = 1
+    partition_id = 0
+    indexer.run(partition_id, nb_partitions)
+    assert indexer._results == [["indexed hash1", "indexed hash2", "indexed hash3"]]