diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -145,16 +145,16 @@ def list_origins_by_producer(idx_storage, mappings, tool_ids): - start = '' + last = '' limit = 10000 while True: origins = list( idx_storage.origin_intrinsic_metadata_search_by_producer( - start=start, limit=limit, ids_only=True, + last=last, limit=limit, ids_only=True, mappings=mappings or None, tool_ids=tool_ids or None)) if not origins: break - start = origins[-1] + '\x00' # first possible string after this + last = origins[-1] yield from origins diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -783,14 +783,14 @@ @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer') @db_transaction_generator() def origin_intrinsic_metadata_search_by_producer( - self, start='', end=None, limit=100, ids_only=False, + self, last='', limit=100, ids_only=False, mappings=None, tool_ids=None, db=None, cur=None): """Returns the list of origins whose metadata contain all the terms. Args: - start (str): The minimum origin url to return - end (str): The maximum origin url to return + last (str): The last origin url returned by the previous + (paginated) call. 
limit (int): The maximum number of results to return ids_only (bool): Determines whether only origin urls are returned or the content as well @@ -810,8 +810,9 @@ these metadata """ + assert isinstance(last, str) res = db.origin_intrinsic_metadata_search_by_producer( - start, end, limit, ids_only, mappings, tool_ids, cur) + last, limit, ids_only, mappings, tool_ids, cur) if ids_only: for (origin,) in res: yield origin diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -393,7 +393,7 @@ yield from cur def origin_intrinsic_metadata_search_by_producer( - self, start, end, limit, ids_only, mappings, tool_ids, cur): + self, last, limit, ids_only, mappings, tool_ids, cur): if ids_only: keys = 'oim.id' else: @@ -408,12 +408,9 @@ args = [] where = [] - if start: - where.append('oim.id >= %s') - args.append(start) - if end: - where.append('oim.id <= %s') - args.append(end) + if last: + where.append('oim.id > %s') + args.append(last) if mappings is not None: where.append('oim.mappings && %s') args.append(mappings) diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -706,14 +706,14 @@ yield result def origin_intrinsic_metadata_search_by_producer( - self, start='', end=None, limit=100, ids_only=False, + self, last='', limit=100, ids_only=False, mappings=None, tool_ids=None, db=None, cur=None): """Returns the list of origins whose metadata contain all the terms. Args: - start (str): The minimum origin url to return - end (str): The maximum origin url to return + last (str): The last origin url returned by the previous + (paginated) call. 
limit (int): The maximum number of results to return ids_only (bool): Determines whether only origin ids are returned or the content as well @@ -733,13 +733,14 @@ these metadata """ + assert isinstance(last, str) nb_results = 0 if mappings is not None: mappings = frozenset(mappings) if tool_ids is not None: tool_ids = frozenset(tool_ids) for entry in self._origin_intrinsic_metadata.get_all(): - if entry['id'] < start or (end and entry['id'] > end): + if entry['id'] <= last: continue if nb_results >= limit: return diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1553,22 +1553,29 @@ endpoint = self.storage.origin_intrinsic_metadata_search_by_producer # test pagination + # no 'last' param, return all origins self.assertCountEqual( endpoint(ids_only=True), [self.origin_url_1, self.origin_url_2, self.origin_url_3]) + # 'last' sorts before origin_1, return everything self.assertCountEqual( - endpoint(start=self.origin_url_1, ids_only=True), + endpoint(last=self.origin_url_1[:-1], ids_only=True), [self.origin_url_1, self.origin_url_2, self.origin_url_3]) + # 'last' is origin_3, return nothing self.assertCountEqual( - endpoint(start=self.origin_url_1, limit=2, ids_only=True), + endpoint(last=self.origin_url_3, ids_only=True), + []) + + # test limit argument + self.assertCountEqual( + endpoint(last=self.origin_url_1[:-1], limit=2, ids_only=True), [self.origin_url_1, self.origin_url_2]) self.assertCountEqual( - endpoint(start=self.origin_url_1+'2', ids_only=True), + endpoint(last=self.origin_url_1, limit=2, ids_only=True), [self.origin_url_2, self.origin_url_3]) self.assertCountEqual( - endpoint(start=self.origin_url_1+'2', end=self.origin_url_3[:-1], - ids_only=True), - [self.origin_url_2]) + endpoint(last=self.origin_url_2, limit=2, ids_only=True), + [self.origin_url_3]) # test mappings filtering self.assertCountEqual(