diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -173,14 +173,12 @@ **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier - Yields: - Content identifier (bytes) present in the range [start, end] + Returns: + a dict with keys: + - **ids** [bytes]: iterable of content ids within the range. + - **next** (Optional[bytes]): The next range of sha1 starts at + this sha1 if any """ - while start: - result = self.idx_storage.content_fossology_license_get_range( + return self.idx_storage.content_fossology_license_get_range( start, end, self.tool['id']) - contents = result['ids'] - for _id in contents: - yield _id - start = result['next'] diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -439,6 +439,26 @@ if res: yield res + def _index_with_skipping_already_done(self, start, end): + """Index not already indexed contents in range [start, end]. + + Args: + **start** (Union[bytes, str]): Starting range identifier + **end** (Union[bytes, str]): Ending range identifier + + Yields: + Content identifier (bytes) present in the range [start, + end] which are not already indexed. + + """ + while start: + indexed_page = self.indexed_contents_in_range(start, end) + contents = indexed_page['ids'] + _end = contents[-1] if contents else end + yield from self._index_contents( + start, _end, contents) + start = indexed_page['next'] + def run(self, start, end, skip_existing=True, **kwargs): """Given a range of content ids, compute the indexing computations on the contents within. Either the indexer is incremental @@ -464,20 +484,20 @@ end = hashutil.hash_to_bytes(end) if skip_existing: - indexed = set(self.indexed_contents_in_range(start, end)) + gen = self._index_with_skipping_already_done(start, end) else: - indexed = set() + gen = self._index_contents(start, end, indexed=[]) - index_computations = self._index_contents(start, end, indexed) - for results in utils.grouper(index_computations, + for results in utils.grouper(gen, n=self.config['write_batch_size']): self.persist_index_computations( results, policy_update='update-dups') with_indexed_data = True - return with_indexed_data except Exception: self.log.exception( 'Problem when computing metadata.') + finally: + return with_indexed_data class OriginIndexer(BaseIndexer): diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -142,14 +142,12 @@ **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier - Yields: - Content identifier (bytes) present in the range [start, end] + Returns: + a dict with keys: + - **ids** [bytes]: iterable of content ids within the range. + - **next** (Optional[bytes]): The next range of sha1 starts at + this sha1 if any """ - while start: - result = self.idx_storage.content_mimetype_get_range( - start, end, self.tool['id']) - contents = result['ids'] - for _id in contents: - yield _id - start = result['next'] + return self.idx_storage.content_mimetype_get_range( + start, end, self.tool['id']) diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -173,12 +173,12 @@ class MimetypeIndexerUnknownToolTestStorage( CommonIndexerNoTool, MimetypeTestIndexer): - """Fossology license indexer with wrong configuration""" + """Mimetype indexer with wrong configuration""" class MimetypeRangeIndexerUnknownToolTestStorage( CommonIndexerNoTool, MimetypeRangeIndexerTest): - """Fossology license range indexer with wrong configuration""" + """Mimetype range indexer with wrong configuration""" class TestMimetypeIndexersErrors( diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -685,6 +685,7 @@ """ start, end = [self.contents[0], self.contents[2]] # output hex ids + # given actual_results = self.indexer.run(start, end) @@ -702,7 +703,8 @@ # given actual_results = self.indexer.run( # checks the bytes input this time - start, end, skip_existing=False) # no data so same result + start, end, skip_existing=False) + # no already indexed data so same result as prior test # then self.assertTrue(actual_results)