def origin_list(self, page_token: Optional[str] = None, limit: int = 100
                ) -> dict:
    """Return one page of the known origins, ordered by URL.

    Args:
        page_token: opaque token used for pagination. Pass the
            `next_page_token` of the previous page; None (or an empty
            string) starts from the first page.
        limit: the maximum number of results to return; must be
            strictly positive.

    Returns:
        dict: dict with the following keys:
          - **next_page_token** (str, optional): opaque token to be used as
            `page_token` for retrieving the next page. If absent, there
            are no more pages to gather.
          - **origins** (List[dict]): list of origins, each one a dict
            with a single `url` key.

    Raises:
        TypeError: if `page_token` is neither None nor a string.
        ValueError: if `limit` is not strictly positive.
    """
    # Same check (and message) as the PostgreSQL backend, so both
    # implementations reject the same inputs.
    if page_token is not None and not isinstance(page_token, str):
        raise TypeError('page_token must be a string.')
    # A page of zero items would hand back a token pointing at the very
    # same position (callers would loop forever); a negative limit would
    # slice from the wrong end of the list.
    if limit <= 0:
        raise ValueError('limit must be a positive integer.')

    origin_urls = sorted(self._origins)

    # The token is the URL the requested page starts at; bisecting (rather
    # than looking it up) keeps pagination working if that exact URL is
    # not in the list.
    from_ = bisect.bisect_left(origin_urls, page_token) if page_token else 0
    until = from_ + limit

    result = {
        'origins': [{'url': origin_url}
                    for origin_url in origin_urls[from_:until]]
    }

    # Only advertise a next page when something is left after this one.
    if until < len(origin_urls):
        result['next_page_token'] = origin_urls[until]

    return result
@remote_api_endpoint('origin/list')
@timed
@db_transaction()
def origin_list(self, page_token: Optional[str] = None, limit: int = 100,
                *, db=None, cur=None) -> dict:
    """Return one page of the list of origins.

    Args:
        page_token: opaque token used for pagination. Pass the
            `next_page_token` of the previous page; None (or an empty
            string) starts from the first page.
        limit: the maximum number of results to return; must be
            strictly positive.

    Returns:
        dict: dict with the following keys:
          - **next_page_token** (str, optional): opaque token to be used as
            `page_token` for retrieving the next page. If absent, there
            are no more pages to gather.
          - **origins** (List[dict]): list of origins, built from the
            `db.origin_get_range_cols` columns with the `id` column
            removed.

    Raises:
        TypeError: if `page_token` is neither None nor a string.
        ValueError: if `limit` is not strictly positive, or if
            `page_token` is not the decimal representation of an integer.
    """
    # Check the type before applying the default: checking afterwards let
    # falsy non-strings (0, b'', []) through while rejecting 1.
    if page_token is not None and not isinstance(page_token, str):
        raise TypeError('page_token must be a string.')
    # With limit == 0 the "page is full" test below would be true for an
    # empty page and then index into an empty list.
    if limit <= 0:
        raise ValueError('limit must be a positive integer.')

    # Tokens are the stringified id at which the page starts.
    origin_from = int(page_token or '0')

    # NOTE(review): presumably origin_get_range yields at most `limit`
    # rows with id >= origin_from, ordered by id -- confirm against the
    # db layer.
    origins = [
        dict(zip(db.origin_get_range_cols, origin))
        for origin in db.origin_get_range(origin_from, limit, cur)
    ]
    assert len(origins) <= limit

    result: Dict[str, Any] = {'origins': origins}

    # A full page means there may be more: resume right after the last id
    # that was returned.
    if len(origins) == limit:
        result['next_page_token'] = str(origins[-1]['id'] + 1)

    # The id is only needed to build the token; it is not part of the
    # returned origins.
    for origin in origins:
        del origin['id']

    return result


@pytest.mark.parametrize('limit', [1, 7, 10, 100, 1000])
def test_origin_list(self, swh_storage, swh_origins, limit):
    """Page through all origins and check that each one is returned.

    Every page but the last must be full, and the number of pages must be
    large enough to hold all the expected origins.
    """
    returned_origins = []

    page_token = None
    page_count = 0
    while True:
        result = swh_storage.origin_list(
            page_token=page_token, limit=limit)
        assert len(result['origins']) <= limit

        returned_origins.extend(
            origin['url'] for origin in result['origins'])

        page_count += 1
        page_token = result.get('next_page_token')

        if page_token is None:
            # Last page: enough pages were fetched to hold every origin.
            assert page_count*limit >= len(swh_origins)
            break
        else:
            # A page that advertises a successor must be full.
            assert len(result['origins']) == limit

    expected_origins = [origin['url'] for origin in swh_origins]
    assert sorted(returned_origins) == sorted(expected_origins)