diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -111,7 +111,7 @@
     def origin_search(
             self, *, url_pattern: str = None, metadata_pattern: str = None,
             with_visit: bool = False,
-            scroll_token: str = None, count: int = 50
+            page_token: str = None, count: int = 50
             ) -> Dict[str, object]:
         """Searches for origins matching the `url_pattern`.

@@ -119,12 +119,12 @@
             url_pattern (str): Part of the URL to search for
             with_visit (bool): Whether origins with no visit are to be
                                filtered out
-            scroll_token (str): Opaque value used for pagination.
+            page_token (str): Opaque value used for pagination.
             count (int): number of results to return.

         Returns:
             a dictionary with keys:
-            * `scroll_token`:
+            * `next_page_token`:
               opaque value used for fetching more results. `None` if there
               are no more results.
             * `results`:
@@ -183,13 +183,13 @@
                 {'_id': 'asc'},
             ]
         }
-        if scroll_token:
+        if page_token:
             # TODO: use ElasticSearch's scroll API?
-            scroll_token_content = msgpack.loads(
-                base64.b64decode(scroll_token))
+            page_token_content = msgpack.loads(
+                base64.b64decode(page_token))
             body['search_after'] = \
-                [scroll_token_content[b'score'],
-                 scroll_token_content[b'id'].decode('ascii')]
+                [page_token_content[b'score'],
+                 page_token_content[b'id'].decode('ascii')]

         res = self._backend.search(
             index='origin',
@@ -201,17 +201,17 @@

         if len(hits) == count:
             last_hit = hits[-1]
-            next_scroll_token_content = {
+            next_page_token_content = {
                 b'score': last_hit['_score'],
                 b'id': last_hit['_id'],
             }
-            next_scroll_token = base64.b64encode(msgpack.dumps(
-                next_scroll_token_content))  # type: Optional[bytes]
+            next_page_token = base64.b64encode(msgpack.dumps(
+                next_page_token_content))  # type: Optional[bytes]
         else:
-            next_scroll_token = None
+            next_page_token = None

         return {
-            'scroll_token': next_scroll_token,
+            'next_page_token': next_page_token,
             'results': [
                 {
                     # TODO: also add 'id'?
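Aside from the rename, these two hunks contain the whole pagination mechanism of the Elasticsearch backend: the `search_after` pair (sort score and document id) is msgpack-serialized and base64-wrapped so callers only ever see one opaque value. A minimal sketch of that round trip, not part of the patch (`encode_page_token`/`decode_page_token` are hypothetical names; `raw=True` is spelled out where the code above relies on older msgpack defaults returning bytes):

    import base64

    import msgpack


    def encode_page_token(score, id_):
        # What the @@ -201 hunk does with the last hit of a full page.
        return base64.b64encode(msgpack.dumps({b'score': score, b'id': id_}))


    def decode_page_token(page_token):
        # What the @@ -183 hunk does to rebuild the `search_after` list.
        content = msgpack.loads(base64.b64decode(page_token), raw=True)
        return [content[b'score'], content[b'id'].decode('ascii')]


    token = encode_page_token(7.5, 'some-document-id')
    assert decode_page_token(token) == [7.5, 'some-document-id']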
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -62,7 +62,7 @@
     def origin_search(
             self, *, url_pattern: str = None, metadata_pattern: str = None,
             with_visit: bool = False,
-            scroll_token: str = None, count: int = 50
+            page_token: str = None, count: int = 50
             ) -> Dict[str, object]:
         matches = \
             (self._origins[id_]
@@ -97,10 +97,10 @@
         if with_visit:
             matches = filter(lambda o: o.get('has_visits'), matches)

-        if scroll_token:
-            scroll_token_content = msgpack.loads(
-                base64.b64decode(scroll_token))
-            start_at_index = scroll_token_content[b'start_at_index']
+        if page_token:
+            page_token_content = msgpack.loads(
+                base64.b64decode(page_token))
+            start_at_index = page_token_content[b'start_at_index']
         else:
             start_at_index = 0

@@ -108,16 +108,16 @@
             matches, start_at_index, start_at_index+count))

         if len(hits) == count:
-            next_scroll_token_content = {
+            next_page_token_content = {
                 b'start_at_index': start_at_index+count,
             }
-            next_scroll_token = base64.b64encode(msgpack.dumps(
-                next_scroll_token_content))  # type: Optional[bytes]
+            next_page_token = base64.b64encode(msgpack.dumps(
+                next_page_token_content))  # type: Optional[bytes]
         else:
-            next_scroll_token = None
+            next_page_token = None

         return {
-            'scroll_token': next_scroll_token,
+            'next_page_token': next_page_token,
             'results': [
                 {'url': hit['url']}
                 for hit in hits
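Unlike the Elasticsearch token, the in-memory token packs nothing but an offset into the filtered match list, so resuming a search is plain slicing. A quick look inside one of these tokens (a sketch, not part of the patch; `raw=True` keeps the keys as bytes, matching the `b'start_at_index'` access above):

    import base64

    import msgpack

    token = base64.b64encode(msgpack.dumps({b'start_at_index': 50}))
    assert msgpack.loads(base64.b64decode(token), raw=True) == \
        {b'start_at_index': 50}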
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -86,12 +86,12 @@
         assert result.output == expected_output

         results = self.search.origin_search(url_pattern='foobar')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://foobar.baz'}]}

         results = self.search.origin_search(url_pattern='foobar',
                                             with_visit=True)
-        assert results == {'scroll_token': None, 'results': []}
+        assert results == {'next_page_token': None, 'results': []}

     def test__journal_client__origin_visit(self):
         """Tests the re-indexing when origin_batch_size*task_batch_size is a
@@ -125,5 +125,5 @@

         results = self.search.origin_search(url_pattern='foobar',
                                             with_visit=True)
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://foobar.baz'}]}
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -17,20 +17,20 @@
         ])

         results = self.search.origin_search(url_pattern='foobar')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://foobar.baz'}]}

         results = self.search.origin_search(url_pattern='barb')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://barbaz.qux'}]}

         # 'bar' is part of 'foobar', but is not the beginning of it
         results = self.search.origin_search(url_pattern='bar')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://barbaz.qux'}]}

         results = self.search.origin_search(url_pattern='barbaz')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://barbaz.qux'}]}

     def test_origin_url_unique_word_prefix_multiple_results(self):
@@ -41,14 +41,14 @@
         ])

         results = self.search.origin_search(url_pattern='qu')
-        assert results['scroll_token'] is None
+        assert results['next_page_token'] is None
         results = [res['url'] for res in results['results']]
         expected_results = ['http://qux.quux', 'http://barbaz.qux']
         assert sorted(results) == sorted(expected_results)

         results = self.search.origin_search(url_pattern='qux')
-        assert results['scroll_token'] is None
+        assert results['next_page_token'] is None
         results = [res['url'] for res in results['results']]
         expected_results = ['http://barbaz.qux', 'http://qux.quux']
         assert sorted(results) == sorted(expected_results)

@@ -61,7 +61,7 @@

         results = self.search.origin_search(
             url_pattern='foobar', with_visit=True)
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://foobar.baz'}]}

     def test_origin_with_visit_added(self):
@@ -71,7 +71,7 @@

         results = self.search.origin_search(
             url_pattern='foobar', with_visit=True)
-        assert results == {'scroll_token': None, 'results': []}
+        assert results == {'next_page_token': None, 'results': []}

         self.search.origin_update([
             {'url': 'http://foobar.baz', 'has_visits': True},
@@ -79,7 +79,7 @@

         results = self.search.origin_search(
             url_pattern='foobar', with_visit=True)
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://foobar.baz'}]}

     def test_origin_intrinsic_metadata_description(self):
@@ -105,16 +105,16 @@
         ])

         results = self.search.origin_search(metadata_pattern='foo')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://origin2'}]}

         # ES returns both results, because blahblah
         results = self.search.origin_search(metadata_pattern='foo bar')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://origin2'}, {'url': 'http://origin3'}]}

         results = self.search.origin_search(metadata_pattern='bar baz')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://origin3'}, {'url': 'http://origin2'}]}

     def test_origin_intrinsic_metadata_nested(self):
@@ -140,15 +140,15 @@
         ])

         results = self.search.origin_search(metadata_pattern='foo')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://origin2'}]}

         results = self.search.origin_search(metadata_pattern='foo bar')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://origin2'}, {'url': 'http://origin3'}]}

         results = self.search.origin_search(metadata_pattern='bar baz')
-        assert results == {'scroll_token': None, 'results': [
+        assert results == {'next_page_token': None, 'results': [
             {'url': 'http://origin3'}, {'url': 'http://origin2'}]}

     # TODO: add more tests with more codemeta terms
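Every assertion in these tests exercises a single-page result, so `next_page_token` is always `None`. A hypothetical multi-page walk-through of the renamed contract (a sketch only: it assumes the in-memory backend is exposed as `InMemorySearch` with an `initialize()` setup hook, and relies on its word-prefix URL matching):

    from swh.search.in_memory import InMemorySearch

    search = InMemorySearch()
    search.initialize()
    search.origin_update([
        {'url': 'http://origin1'},
        {'url': 'http://origin2'},
        {'url': 'http://origin3'},
    ])

    # The first page is full, so a token is handed out.
    page1 = search.origin_search(url_pattern='origin', count=2)
    assert len(page1['results']) == 2
    assert page1['next_page_token'] is not None

    # The second page comes back short, which ends the iteration.
    page2 = search.origin_search(url_pattern='origin', count=2,
                                 page_token=page1['next_page_token'])
    assert len(page2['results']) == 1
    assert page2['next_page_token'] is None

Note that both backends decide with `len(hits) == count`: when the number of matches is an exact multiple of `count`, the last useful page still carries a token and the final page comes back empty, since the backend cannot know it is done until a page runs short.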
diff --git a/swh/search/utils.py b/swh/search/utils.py
--- a/swh/search/utils.py
+++ b/swh/search/utils.py
@@ -5,12 +5,12 @@


 def stream_results(f, *args, **kwargs):
-    if 'scroll_token' in kwargs:
-        raise TypeError('stream_results has no argument "scroll_token".')
-    scroll_token = None
+    if 'page_token' in kwargs:
+        raise TypeError('stream_results has no argument "page_token".')
+    page_token = None
     while True:
-        results = f(*args, scroll_token=scroll_token, **kwargs)
+        results = f(*args, page_token=page_token, **kwargs)
         yield from results['results']
-        scroll_token = results['scroll_token']
-        if scroll_token is None:
+        page_token = results['next_page_token']
+        if page_token is None:
             break
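`stream_results` is what lets callers ignore tokens entirely: it re-calls the paginated function with each `next_page_token` and splices the pages into one iterator. A short usage sketch, continuing the in-memory example above (hypothetical data):

    from swh.search.utils import stream_results

    # Yields every matching origin dict, fetching two results per
    # origin_search call and following next_page_token internally.
    for origin in stream_results(search.origin_search,
                                 url_pattern='origin', count=2):
        print(origin['url'])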