diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -156,6 +156,10 @@
         return self.post('origin/get_range', {'origin_from': origin_from,
                                               'origin_count': origin_count})
 
+    def origin_list(self, page_token=None, limit=100):
+        return self.post('origin/list', {'page_token': page_token,
+                                         'limit': limit})
+
     def origin_add(self, origins):
         return self.post('origin/add_multi', {'origins': origins})
 
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -358,6 +358,13 @@
         **decode_request(request)))
 
 
+@app.route('/origin/list', methods=['POST'])
+@timed
+def origin_list():
+    return encode_data(get_storage().origin_list(
+        **decode_request(request)))
+
+
 @app.route('/origin/search', methods=['POST'])
 @timed
 def origin_search():
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -1096,6 +1096,37 @@
                 self._origins[self._origins_by_id[idx]])
             yield {'id': idx+1, **origin}
 
+    def origin_list(self, page_token=None, limit=100):
+        """Returns the list of origins
+
+        Args:
+            page_token (Optional[str]): opaque token used for pagination.
+            limit (int): the maximum number of results to return
+
+        Returns:
+            dict: dict with the following keys:
+              - **next_page_token** (str, optional): opaque token to be used as
+                `page_token` for retrieving the next page. if absent, there are
+                no more pages to gather.
+              - **origins** (List[dict]): list of origins, as returned by
+                `origin_get`.
+        """
+        origin_urls = sorted(self._origins)
+        if page_token:
+            from_ = bisect.bisect_left(origin_urls, page_token)
+        else:
+            from_ = 0
+
+        result = {
+            'origins': [{'url': origin_url}
+                        for origin_url in origin_urls[from_:from_+limit]]
+        }
+
+        if from_+limit < len(origin_urls):
+            result['next_page_token'] = origin_urls[from_+limit]
+
+        return result
+
     def origin_search(self, url_pattern, offset=0, limit=50,
                       regexp=False, with_visit=False, db=None, cur=None):
         """Search for origins whose urls contain a provided string pattern
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1507,6 +1507,48 @@
         for origin in db.origin_get_range(origin_from, origin_count, cur):
             yield dict(zip(db.origin_get_range_cols, origin))
 
+    @db_transaction()
+    def origin_list(self, page_token=None, limit=100, *, db, cur):
+        """Returns the list of origins
+
+        Args:
+            page_token (Optional[str]): opaque token used for pagination.
+            limit (int): the maximum number of results to return
+
+        Returns:
+            dict: dict with the following keys:
+              - **next_page_token** (str, optional): opaque token to be used as
+                `page_token` for retrieving the next page. if absent, there are
+                no more pages to gather.
+              - **origins** (List[dict]): list of origins, as returned by
+                `origin_get`.
+
+        Raises:
+            TypeError: if `page_token` is truthy but not a string.
+            ValueError: if `page_token` cannot be parsed as an integer.
+        """
+        page_token = page_token or '0'
+        if not isinstance(page_token, str):
+            raise TypeError('page_token must be a string.')
+        origin_from = int(page_token)
+        result = {
+            'origins': [
+                dict(zip(db.origin_get_range_cols, origin))
+                for origin in db.origin_get_range(origin_from, limit, cur)
+            ],
+        }
+
+        assert len(result['origins']) <= limit
+        # Only a non-empty full page can be followed by more results; the
+        # emptiness check also avoids indexing into [] when limit is 0.
+        if result['origins'] and len(result['origins']) == limit:
+            result['next_page_token'] = str(result['origins'][-1]['id']+1)
+
+        for origin in result['origins']:
+            del origin['id']
+
+        return result
+
     @db_transaction_generator()
     def origin_search(self, url_pattern, offset=0, limit=50,
                       regexp=False, with_visit=False, db=None, cur=None):
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3127,6 +3127,32 @@
 
         assert actual_origins == expected_origins
 
+    @pytest.mark.parametrize('limit', [1, 7, 10, 100, 1000])
+    def test_origin_list(self, swh_storage, swh_origins, limit):
+        returned_origins = []
+
+        page_token = None
+        i = 0
+        while True:
+            result = swh_storage.origin_list(
+                page_token=page_token, limit=limit)
+            assert len(result['origins']) <= limit
+
+            returned_origins.extend(
+                origin['url'] for origin in result['origins'])
+
+            i += 1
+            page_token = result.get('next_page_token')
+
+            if page_token is None:
+                assert i*limit >= len(swh_origins)
+                break
+            else:
+                assert len(result['origins']) == limit
+
+        expected_origins = [origin['url'] for origin in swh_origins]
+        assert sorted(returned_origins) == sorted(expected_origins)
+
     ORIGINS = [
         'https://github.com/user1/repo1',
         'https://github.com/user2/repo1',