diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -434,17 +434,17 @@ if res: yield res - def run(self, start, end, policy_update, **kwargs): - """Given a range of content ids, compute the indexing computation on - the contents within. Either only new ones (policy_update to - 'update-dups') or all (policy_update to 'ignore-dups'. + def run(self, start, end, skip_existing=True, **kwargs): + """Given a range of content ids, compute the indexing computations on + the contents within. Either the indexer is incremental + (filter out existing computed data) or not (compute + everything from scratch). Args: - **policy_update** (str): either 'update-dups' to do all - contents, or 'ignore-dups' to - only compute new ones **start** (Union[bytes, str]): Starting range identifier **end** (Union[bytes, str]): Ending range identifier + **skip_existing** (bool): Skip existing indexed data + (default) or not **kwargs: passed to the `index` method Returns: @@ -458,7 +458,7 @@ if isinstance(end, str): end = hashutil.hash_to_bytes(end) - if policy_update == 'update-dups': # incremental + if skip_existing: indexed = set(self.indexed_contents_in_range(start, end)) else: indexed = set() @@ -466,7 +466,8 @@ index_computations = self._index_contents(start, end, indexed) for results in utils.grouper(index_computations, n=self.config['write_batch_size']): - self.persist_index_computations(results, policy_update) + self.persist_index_computations( + results, policy_update='update-dups') with_indexed_data = True return with_indexed_data except Exception: diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -325,8 +325,7 @@ """ start, end = [self.contents[0], self.contents[2]] # output hex ids # given - actual_results = self.indexer.run( - start, end, policy_update='update-dups') + actual_results = 
self.indexer.run(start, end) # then self.assertTrue(actual_results) @@ -342,7 +341,7 @@ # given actual_results = self.indexer.run( # checks the bytes input this time - start, end, policy_update='ignore-dups') # no data so same result + start, end, skip_existing=False) # no data so same result # then self.assertTrue(actual_results) @@ -353,7 +352,7 @@ '0000000000000000000000000000000000000001'] # given actual_results = self.indexer.run( - start, end, policy_update='update-dups') + start, end, skip_existing=True) # then self.assertFalse(actual_results)