diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -145,17 +145,14 @@ def list_origins_by_producer(idx_storage, mappings, tool_ids): - start = '' + next_page_token = '' limit = 10000 - while True: - origins = list( - idx_storage.origin_intrinsic_metadata_search_by_producer( - start=start, limit=limit, ids_only=True, - mappings=mappings or None, tool_ids=tool_ids or None)) - if not origins: - break - start = origins[-1] + '\x00' # first possible string after this - yield from origins + while next_page_token is not None: + result = idx_storage.origin_intrinsic_metadata_search_by_producer( + page_token=next_page_token, limit=limit, ids_only=True, + mappings=mappings or None, tool_ids=tool_ids or None) + next_page_token = result.get('next_page_token') + yield from result['origins'] @schedule.command('reindex_origin_metadata') diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -781,25 +781,28 @@ dict(zip(db.origin_intrinsic_metadata_cols, c))) @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer') - @db_transaction_generator() + @db_transaction() def origin_intrinsic_metadata_search_by_producer( - self, start='', end=None, limit=100, ids_only=False, + self, page_token='', limit=100, ids_only=False, mappings=None, tool_ids=None, db=None, cur=None): """Returns the list of origins whose metadata contain all the terms. Args: - start (str): The minimum origin url to return - end (str): The maximum origin url to return + page_token (str): Opaque token used for pagination. limit (int): The maximum number of results to return ids_only (bool): Determines whether only origin urls are returned or the content as well mappings (List[str]): Returns origins whose intrinsic metadata were generated using at least one of these mappings. 
- Yields: - list: list of origin ids (int) if `ids_only=True`, else - dictionaries with the following keys: + Returns: + dict: dict with the following keys: + - **next_page_token** (str, optional): opaque token to be used as + `page_token` for retrieving the next page. If absent, there are + no more pages to gather. + - **origins** (list): list of origin url (str) if `ids_only=True` + else dictionaries with the following keys: - **id** (str): origin urls - **from_revision**: sha1 id of the revision used to generate @@ -810,15 +813,24 @@ these metadata """ + assert isinstance(page_token, str) + # we go to limit+1 to check whether we should add next_page_token in + # the response res = db.origin_intrinsic_metadata_search_by_producer( - start, end, limit, ids_only, mappings, tool_ids, cur) + page_token, limit + 1, ids_only, mappings, tool_ids, cur) + result = {} if ids_only: - for (origin,) in res: - yield origin + result['origins'] = [origin for (origin,) in res] + if len(result['origins']) > limit: + result['origins'] = result['origins'][:limit] + result['next_page_token'] = result['origins'][-1] else: - for c in res: - yield converters.db_to_metadata( - dict(zip(db.origin_intrinsic_metadata_cols, c))) + result['origins'] = [converters.db_to_metadata( + dict(zip(db.origin_intrinsic_metadata_cols, c))) for c in res] + if len(result['origins']) > limit: + result['origins'] = result['origins'][:limit] + result['next_page_token'] = result['origins'][-1]['id'] + return result @remote_api_endpoint('origin_intrinsic_metadata/stats') @db_transaction() diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -393,7 +393,7 @@ yield from cur def origin_intrinsic_metadata_search_by_producer( - self, start, end, limit, ids_only, mappings, tool_ids, cur): + self, last, limit, ids_only, mappings, tool_ids, cur): if ids_only: keys = 'oim.id' else: @@ -408,12 +408,9 @@ args = [] where = [] - if start: - 
where.append('oim.id >= %s') - args.append(start) - if end: - where.append('oim.id <= %s') - args.append(end) + if last: + where.append('oim.id > %s') + args.append(last) if mappings is not None: where.append('oim.mappings && %s') args.append(mappings) diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -706,25 +706,27 @@ yield result def origin_intrinsic_metadata_search_by_producer( - self, start='', end=None, limit=100, ids_only=False, + self, page_token='', limit=100, ids_only=False, mappings=None, tool_ids=None, db=None, cur=None): """Returns the list of origins whose metadata contain all the terms. Args: - start (str): The minimum origin url to return - end (str): The maximum origin url to return + page_token (str): Opaque token used for pagination. limit (int): The maximum number of results to return ids_only (bool): Determines whether only origin ids are returned or the content as well mappings (List[str]): Returns origins whose intrinsic metadata were generated using at least one of these mappings. - Yields: - list: list of origin ids (int) if `ids_only=True`, else - dictionaries with the following keys: + Returns: + dict: dict with the following keys: + - **next_page_token** (str, optional): opaque token to be used as + `page_token` for retrieving the next page. + - **origins** (list): list of origin url (str) if `ids_only=True` + else dictionaries with the following keys: - - **id** (str): origin url + - **id** (str): origin urls + - **from_revision**: sha1 id of the revision used to generate these metadata. 
- **metadata** (str): associated metadata @@ -733,26 +735,37 @@ these metadata """ + assert isinstance(page_token, str) nb_results = 0 if mappings is not None: mappings = frozenset(mappings) if tool_ids is not None: tool_ids = frozenset(tool_ids) + origins = [] + + # we go to limit+1 to check whether we should add next_page_token in + # the response for entry in self._origin_intrinsic_metadata.get_all(): - if entry['id'] < start or (end and entry['id'] > end): + if entry['id'] <= page_token: continue - if nb_results >= limit: - return + if nb_results >= (limit + 1): + break if mappings is not None and mappings.isdisjoint(entry['mappings']): continue if tool_ids is not None and entry['tool']['id'] not in tool_ids: continue - if ids_only: - yield entry['id'] - else: - yield entry + origins.append(entry) nb_results += 1 + result = {} + if len(origins) > limit: + origins = origins[:limit] + result['next_page_token'] = origins[-1]['id'] + if ids_only: + origins = [origin['id'] for origin in origins] + result['origins'] = origins + return result + def origin_intrinsic_metadata_stats(self): """Returns statistics on stored intrinsic metadata. 
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1553,58 +1553,109 @@ endpoint = self.storage.origin_intrinsic_metadata_search_by_producer # test pagination + # no 'page_token' param, return all origins + result = endpoint(ids_only=True) self.assertCountEqual( - endpoint(ids_only=True), + result['origins'], [self.origin_url_1, self.origin_url_2, self.origin_url_3]) + assert 'next_page_token' not in result + + # 'page_token' is < than origin_1, return everything + result = endpoint( + page_token=self.origin_url_1[:-1], ids_only=True) self.assertCountEqual( - endpoint(start=self.origin_url_1, ids_only=True), + result['origins'], [self.origin_url_1, self.origin_url_2, self.origin_url_3]) + assert 'next_page_token' not in result + + # 'page_token' is origin_3, return nothing + result = endpoint(page_token=self.origin_url_3, ids_only=True) + self.assertCountEqual( + endpoint(page_token=self.origin_url_3, ids_only=True)['origins'], + []) + assert 'next_page_token' not in result + + # test limit argument + result = endpoint(page_token=self.origin_url_1[:-1], + limit=2, ids_only=True) self.assertCountEqual( - endpoint(start=self.origin_url_1, limit=2, ids_only=True), + result['origins'], [self.origin_url_1, self.origin_url_2]) + assert result['next_page_token'] == result['origins'][-1] + + result = endpoint(page_token=self.origin_url_1, + limit=2, ids_only=True) self.assertCountEqual( - endpoint(start=self.origin_url_1+'2', ids_only=True), + result['origins'], [self.origin_url_2, self.origin_url_3]) + assert 'next_page_token' not in result + + result = endpoint(page_token=self.origin_url_2, + limit=2, ids_only=True) self.assertCountEqual( - endpoint(start=self.origin_url_1+'2', end=self.origin_url_3[:-1], - ids_only=True), - [self.origin_url_2]) + result['origins'], + [self.origin_url_3]) + assert 'next_page_token' not in 
result # test mappings filtering + result = endpoint(mappings=['npm'], ids_only=True) self.assertCountEqual( - endpoint(mappings=['npm'], ids_only=True), + result['origins'], [self.origin_url_1, self.origin_url_2]) + assert 'next_page_token' not in result + + result = endpoint(mappings=['npm', 'gemspec'], ids_only=True) self.assertCountEqual( - endpoint(mappings=['npm', 'gemspec'], ids_only=True), + result['origins'], [self.origin_url_1, self.origin_url_2]) + assert 'next_page_token' not in result + + result = endpoint(mappings=['gemspec'], ids_only=True) self.assertCountEqual( - endpoint(mappings=['gemspec'], ids_only=True), + result['origins'], [self.origin_url_2]) + assert 'next_page_token' not in result + + result = endpoint(mappings=['pkg-info'], ids_only=True) self.assertCountEqual( - endpoint(mappings=['pkg-info'], ids_only=True), + result['origins'], [self.origin_url_3]) + assert 'next_page_token' not in result + + result = endpoint(mappings=['foobar'], ids_only=True) self.assertCountEqual( - endpoint(mappings=['foobar'], ids_only=True), + result['origins'], []) + assert 'next_page_token' not in result # test pagination + mappings + result = endpoint(mappings=['npm'], limit=1, ids_only=True) self.assertCountEqual( - endpoint(mappings=['npm'], limit=1, ids_only=True), + result['origins'], [self.origin_url_1]) + assert result['next_page_token'] == result['origins'][-1] # test tool filtering + result = endpoint(tool_ids=[tool1['id']], ids_only=True) self.assertCountEqual( - endpoint(tool_ids=[tool1['id']], ids_only=True), + result['origins'], [self.origin_url_1]) + assert 'next_page_token' not in result + + result = endpoint(tool_ids=[tool2['id']], ids_only=True) self.assertCountEqual( - endpoint(tool_ids=[tool2['id']], ids_only=True), + result['origins'], [self.origin_url_2, self.origin_url_3]) + assert 'next_page_token' not in result + + result = endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True) self.assertCountEqual( - 
endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True), + result['origins'], [self.origin_url_1, self.origin_url_2, self.origin_url_3]) + assert 'next_page_token' not in result # test ids_only=False - self.assertEqual(list(endpoint(mappings=['gemspec'])), [{ + self.assertEqual(endpoint(mappings=['gemspec'])['origins'], [{ 'id': self.origin_url_2, 'metadata': { '@context': 'foo',