diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -38,6 +38,11 @@
     def content_get_metadata(self, content):
         return self.post('content/metadata', {'content': content})
 
+    def content_get_range(self, start, end, limit=1000):
+        return self.post('content/range', {'start': start,
+                                           'end': end,
+                                           'limit': limit})
+
     def content_find(self, content):
         return self.post('content/present', {'content': content})
 
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -107,6 +107,12 @@
         **decode_request(request)))
 
 
+@app.route('/content/range', methods=['POST'])
+def content_get_range():
+    return encode_data(get_storage().content_get_range(
+        **decode_request(request)))
+
+
 @app.route('/directory/missing', methods=['POST'])
 def directory_missing():
     return encode_data(get_storage().directory_missing(
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2015-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -289,6 +289,29 @@
             ((sha1,) for sha1 in sha1s),
         )
 
+    def content_get_range(self, start, end, limit=None, cur=None):
+        """Retrieve contents within range [start, end].
+
+        Both bounds are inclusive and rows come back ordered by sha1.
+
+        Args:
+            start: lowest sha1 of the range
+            end: highest sha1 of the range
+            limit: maximum number of rows, handed to the query as is
+            cur: cursor to use, resolved through self._cursor
+
+        Yields:
+            one row per content, columns as in content_get_metadata_keys
+
+        """
+        cur = self._cursor(cur)
+        query = """select %s from content
+                   where %%s <= sha1 and sha1 <= %%s
+                   order by sha1
+                   limit %%s""" % ', '.join(self.content_get_metadata_keys)
+        cur.execute(query, (start, end, limit))
+        yield from cursor_to_bytes(cur)
+
     content_hash_keys = ['sha1', 'sha1_git', 'sha256', 'blake2s256']
 
     def content_missing_from_list(self, contents, cur=None):
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -236,6 +236,46 @@
 
             yield {'sha1': obj_id, 'data': data}
 
+    @db_transaction()
+    def content_get_range(self, start, end, limit=1000, db=None, cur=None):
+        """Retrieve contents within range [start, end] bound by limit.
+
+        Args:
+            **start** (bytes): Starting identifier range (expected smaller
+                               than end)
+            **end** (bytes): Ending identifier range (expected larger
+                             than start)
+            **limit** (int): Limit result (default to 1000)
+
+        Returns:
+            a dict with keys:
+            - contents [dict]: iterable of contents in between the range.
+            - next (bytes): sha1 to use as start for the next page,
+              None when no content remains in the range
+
+        Raises:
+            ValueError: if limit is None
+
+        """
+        if limit is None:
+            raise ValueError('Development error: limit should not be None')
+        contents = []
+        next_content = None
+        # ask for one extra row: its presence means another page exists
+        for counter, content_row in enumerate(
+                db.content_get_range(start, end, limit + 1, cur)):
+            content = dict(zip(db.content_get_metadata_keys, content_row))
+            if counter >= limit:
+                # the extra row is not returned; its sha1 is where the
+                # next page starts
+                next_content = content['sha1']
+                break
+            contents.append(content)
+        return {
+            'contents': contents,
+            'next': next_content,
+        }
+
     @db_transaction_generator(statement_timeout=500)
     def content_get_metadata(self, content, db=None, cur=None):
         """Retrieve content metadata in bulk
diff --git a/swh/storage/tests/__init__.py b/swh/storage/tests/__init__.py
--- a/swh/storage/tests/__init__.py
+++ b/swh/storage/tests/__init__.py
@@ -6,7 +6,7 @@
 from os import path
 import swh.storage
 
-from hypothesis.strategies import (binary, composite, lists)
+from hypothesis.strategies import (binary, composite, sets)
 
 from swh.model.hashutil import MultiHash
 
@@ -39,8 +39,9 @@
         [min_size:max_size].
 
     """
-    raw_contents = draw(lists(
-        gen_raw_content(), min_size=min_size, max_size=max_size))
+    raw_contents = draw(sets(
+        gen_raw_content(),
+        min_size=min_size, max_size=max_size))
 
     contents = []
     for raw_content in raw_contents:
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -1882,7 +1882,7 @@
         # retrieve contents
         actual_contents = list(self.storage.content_get_metadata(get_sha1s))
 
-        self.assertEquals(len(actual_contents), len(contents))
+        self.assertEqual(len(actual_contents), len(contents))
 
         # will check that all contents are retrieved correctly
         one_content = contents[0]
@@ -1891,6 +1891,70 @@
         self.assert_contents_ok(contents, actual_contents,
                                 keys_to_check=keys_to_check)
 
+    def test_generate_content_get_range_limit_none(self):
+        """content_get_range should raise when limit is None"""
+        with self.assertRaises(ValueError) as e:
+            self.storage.content_get_range(start=None, end=None, limit=None)
+
+        self.assertEqual(e.exception.args, (
+            'Development error: limit should not be None',))
+
+    @given(gen_contents(min_size=1, max_size=4))
+    def test_generate_content_get_range_no_limit(self, contents):
+        """content_get_range returns everything when limit is not reached"""
+        self.reset_storage_tables()
+        # add contents to storage
+        self.storage.content_add(contents)
+
+        # the smallest and largest sha1 delimit the range to request
+        get_sha1s = sorted(c['sha1'] for c in contents)
+        start = get_sha1s[0]
+        end = get_sha1s[-1]
+
+        # retrieve contents
+        actual_result = self.storage.content_get_range(start, end)
+
+        actual_contents = actual_result['contents']
+        actual_next = actual_result['next']
+
+        self.assertEqual(len(contents), len(actual_contents))
+        self.assertIsNone(actual_next)
+
+        one_content = contents[0]
+        keys_to_check = set(one_content.keys()) - {'data'}
+        self.assert_contents_ok(contents, actual_contents, keys_to_check)
+
+    @given(gen_contents(min_size=4, max_size=4))
+    def test_generate_content_get_range_limit(self, contents):
+        """content_get_range paginates when limit is reached"""
+        self.reset_storage_tables()
+        contents_map = {c['sha1']: c for c in contents}
+
+        # add contents to storage
+        self.storage.content_add(contents)
+
+        # the smallest and largest sha1 delimit the range to request
+        get_sha1s = sorted(c['sha1'] for c in contents)
+        start = get_sha1s[0]
+        end = get_sha1s[-1]
+
+        # retrieve contents limited to 3 results
+        limited_results = len(contents) - 1
+        actual_result = self.storage.content_get_range(start, end,
+                                                       limit=limited_results)
+
+        actual_contents = actual_result['contents']
+        actual_next = actual_result['next']
+
+        self.assertEqual(limited_results, len(actual_contents))
+        self.assertIsNotNone(actual_next)
+        self.assertEqual(actual_next, get_sha1s[-1])
+
+        expected_contents = [contents_map[sha1] for sha1 in get_sha1s[:-1]]
+        keys_to_check = set(contents[0].keys()) - {'data'}
+        self.assert_contents_ok(expected_contents, actual_contents,
+                                keys_to_check)
+
 
 class TestLocalStorage(CommonTestStorage, unittest.TestCase):
     """Test the local storage"""