# Patch summary: adds a `<object>_get_random` method for content, directory,
# revision, release and snapshot objects to swh/storage/db.py,
# swh/storage/in_memory.py and swh/storage/storage.py, plus one test per
# object type in swh/storage/tests/test_storage.py.
#
# NOTE(review): the patch text below has lost its line breaks (a unified diff
# carries one record per line); it is kept byte-for-byte as received.
#
# swh/storage/db.py
#   * Each Db.<object>_get_random(cur) delegates to the new private helper
#     Db._get_random_row_from_table(table_name, cols, id_col, cur).
#   * The helper builds SHA1_SIZE random bytes with random.randint(0, 255)
#     and runs a UNION of two single-row subqueries: the smallest `id_col`
#     value >= the random bytes, and the largest `id_col` value < the random
#     bytes. The outer LIMIT 1 keeps a single row.
#   * It returns the first column of the fetched row; when fetchone() yields
#     no row the function falls through and implicitly returns None.
#   * Table and column names are interpolated with str.format(); only the
#     random bytes are passed as bound query parameters. Every caller in this
#     patch passes literal names.
#   * NOTE(review): only row[0] is returned although `cols` is a list; every
#     caller in this patch passes exactly one column.
#   * NOTE(review): the outer query has no ORDER BY, so when both subqueries
#     return a row, which one survives LIMIT 1 is not fixed by the query text.
#   * NOTE(review): a row is picked according to where the random bytes fall
#     between existing ids, so the pick is presumably only as uniform as the
#     ids are evenly spread -- confirm this is acceptable for the callers.
#
# swh/storage/in_memory.py
#   * Each <object>_get_random() returns random.choice() over a list built
#     from the matching container (self._content_indexes['sha1_git'],
#     self._directories, self._revisions, self._releases, self._snapshots).
#   * NOTE(review): random.choice() raises IndexError on an empty sequence,
#     whereas the db.py helper returns None when no row is found -- confirm
#     whether the two backends are expected to agree on empty storage.
#   * NOTE(review): no hunk shown here adds `import random` to this module;
#     verify it is already imported there.
#
# swh/storage/storage.py
#   * Each <object>_get_random(db, cur) is registered with
#     @remote_api_endpoint('<object>/get_random'), wrapped with @timed and
#     @db_transaction(), and returns db.<object>_get_random(cur) unchanged
#     (so it yields None whenever the db.py helper does).
#
# swh/storage/tests/test_storage.py
#   * Each test adds three objects and asserts that the id returned by
#     <object>_get_random() is one of the three ids just added.
#   * NOTE(review): no test in this patch exercises an empty storage.
diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -3,11 +3,13 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import random import select from swh.core.db import BaseDb from swh.core.db.db_utils import stored_procedure, jsonize from swh.core.db.db_utils import execute_values_generator +from swh.model.model import SHA1_SIZE class Db(BaseDb): @@ -199,6 +201,10 @@ if ret: return ret[0] + def snapshot_get_random(self, cur=None): + return self._get_random_row_from_table( + 'snapshot', ['id'], 'id', cur) + content_find_cols = ['sha1', 'sha1_git', 'sha256', 'blake2s256', 'length', 'ctime', 'status'] @@ -237,6 +243,10 @@ content = cur.fetchall() return content + def content_get_random(self, cur=None): + return self._get_random_row_from_table( + 'content', ['sha1_git'], 'sha1_git', cur) + def directory_missing_from_list(self, directories, cur=None): cur = self._cursor(cur) yield from execute_values_generator( @@ -280,6 +290,10 @@ return None return data + def directory_get_random(self, cur=None): + return self._get_random_row_from_table( + 'directory', ['id'], 'id', cur) + def revision_missing_from_list(self, revisions, cur=None): cur = self._cursor(cur) @@ -577,6 +591,10 @@ cur.execute(query, (root_revisions, limit)) yield from cur + def revision_get_random(self, cur=None): + return self._get_random_row_from_table( + 'revision', ['id'], 'id', cur) + def release_missing_from_list(self, releases, cur=None): cur = self._cursor(cur) yield from execute_values_generator( @@ -805,6 +823,10 @@ """ % query_keys, ((id,) for id in releases)) + def release_get_random(self, cur=None): + return self._get_random_row_from_table( + 'release', ['id'], 'id', cur) + def origin_metadata_add(self, origin, ts, provider, tool, metadata, cur=None): """ Add an origin_metadata for the origin at ts with provider, tool and @@ -915,3 +937,19 @@ (provider_name, provider_url)) 
return cur.fetchone() + + def _get_random_row_from_table(self, table_name, cols, id_col, cur=None): + random_sha1 = bytes(random.randint(0, 255) for _ in range(SHA1_SIZE)) + cur = self._cursor(cur) + query = ''' + (SELECT {cols} FROM {table} WHERE {id_col} >= %s + ORDER BY {id_col} LIMIT 1) + UNION + (SELECT {cols} FROM {table} WHERE {id_col} < %s + ORDER BY {id_col} DESC LIMIT 1) + LIMIT 1 + '''.format(cols=', '.join(cols), table=table_name, id_col=id_col) + cur.execute(query, (random_sha1, random_sha1)) + row = cur.fetchone() + if row: + return row[0] diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -420,6 +420,14 @@ yield content break + def content_get_random(self): + """Finds a random content id. + + Returns: + a sha1_git + """ + return random.choice(list(self._content_indexes['sha1_git'])) + def directory_add(self, directories): """Add directories to the storage @@ -535,6 +543,14 @@ """ return self._directory_entry_get_by_path(directory, paths, b'') + def directory_get_random(self): + """Finds a random directory id. + + Returns: + a sha1_git + """ + return random.choice(list(self._directories)) + def _directory_entry_get_by_path(self, directory, paths, prefix): if not paths: return @@ -681,6 +697,14 @@ yield from ((rev['id'], rev['parents']) for rev in self.revision_log(revisions, limit)) + def revision_get_random(self): + """Finds a random revision id. + + Returns: + a sha1_git + """ + return random.choice(list(self._revisions)) + def release_add(self, releases): """Add releases to the storage @@ -756,6 +780,14 @@ else: yield None + def release_get_random(self): + """Finds a random release id. + + Returns: + a sha1_git + """ + return random.choice(list(self._releases)) + def snapshot_add(self, snapshots): """Add a snapshot to the storage @@ -983,6 +1015,14 @@ 'next_branch': next_branch, } + def snapshot_get_random(self): + """Finds a random snapshot id. 
+ + Returns: + a sha1_git + """ + return random.choice(list(self._snapshots)) + def object_find_by_sha1_git(self, ids, db=None, cur=None): """Return the objects found with the given ids. diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -611,6 +611,17 @@ return [dict(zip(db.content_find_cols, content)) for content in contents] + @remote_api_endpoint('content/get_random') + @timed + @db_transaction() + def content_get_random(self, db=None, cur=None): + """Finds a random content id. + + Returns: + a sha1_git + """ + return db.content_get_random(cur) + @remote_api_endpoint('directory/add') @timed @process_metrics @@ -759,6 +770,17 @@ if res: return dict(zip(db.directory_ls_cols, res)) + @remote_api_endpoint('directory/get_random') + @timed + @db_transaction() + def directory_get_random(self, db=None, cur=None): + """Finds a random directory id. + + Returns: + a sha1_git + """ + return db.directory_get_random(cur) + @remote_api_endpoint('revision/add') @timed @process_metrics @@ -916,6 +938,17 @@ yield from db.revision_shortlog(revisions, limit, cur) + @remote_api_endpoint('revision/get_random') + @timed + @db_transaction() + def revision_get_random(self, db=None, cur=None): + """Finds a random revision id. + + Returns: + a sha1_git + """ + return db.revision_get_random(cur) + @remote_api_endpoint('release/add') @timed @process_metrics @@ -1015,6 +1048,17 @@ ) yield data if data['target_type'] else None + @remote_api_endpoint('release/get_random') + @timed + @db_transaction() + def release_get_random(self, db=None, cur=None): + """Finds a random release id. + + Returns: + a sha1_git + """ + return db.release_get_random(cur) + @remote_api_endpoint('snapshot/add') @timed @process_metrics @@ -1276,6 +1320,17 @@ return None + @remote_api_endpoint('snapshot/get_random') + @timed + @db_transaction() + def snapshot_get_random(self, db=None, cur=None): + """Finds a random snapshot id. 
+ + Returns: + a sha1_git + """ + return db.snapshot_get_random(cur) + @remote_api_endpoint('origin/visit/add') @timed @db_transaction() diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -389,6 +389,13 @@ assert list(gen) == [missing_cont] + def test_content_get_random(self, swh_storage): + swh_storage.content_add([data.cont, data.cont2, data.cont3]) + + assert swh_storage.content_get_random() in { + data.cont['sha1_git'], data.cont2['sha1_git'], + data.cont3['sha1_git']} + def test_directory_add(self, swh_storage): init_missing = list(swh_storage.directory_missing([data.dir['id']])) assert [data.dir['id']] == init_missing @@ -579,6 +586,12 @@ [entry['name']]) assert actual_entry is None + def test_directory_get_random(self, swh_storage): + swh_storage.directory_add([data.dir, data.dir2, data.dir3]) + + assert swh_storage.directory_get_random() in \ + {data.dir['id'], data.dir2['id'], data.dir3['id']} + def test_revision_add(self, swh_storage): init_missing = swh_storage.revision_missing([data.revision['id']]) assert list(init_missing) == [data.revision['id']] @@ -758,6 +771,13 @@ assert len(get) == 1 assert get[0]['parents'] == [] # no parents on this one + def test_revision_get_random(self, swh_storage): + swh_storage.revision_add( + [data.revision, data.revision2, data.revision3]) + + assert swh_storage.revision_get_random() in \ + {data.revision['id'], data.revision2['id'], data.revision3['id']} + def test_release_add(self, swh_storage): init_missing = swh_storage.release_missing([data.release['id'], data.release2['id']]) @@ -868,6 +888,12 @@ assert unknown_releases[0] is None + def test_release_get_random(self, swh_storage): + swh_storage.release_add([data.release, data.release2, data.release3]) + + assert swh_storage.release_get_random() in \ + {data.release['id'], data.release2['id'], data.release3['id']} + def test_origin_add_one(self, 
swh_storage): origin0 = swh_storage.origin_get(data.origin) assert origin0 is None @@ -2362,6 +2388,14 @@ assert{**data.snapshot, 'next_branch': None} \ == swh_storage.snapshot_get_latest(origin_url) + def test_snapshot_get_random(self, swh_storage): + swh_storage.snapshot_add( + [data.snapshot, data.empty_snapshot, data.complete_snapshot]) + + assert swh_storage.snapshot_get_random() in { + data.snapshot['id'], data.empty_snapshot['id'], + data.complete_snapshot['id']} + def test_stat_counters(self, swh_storage): expected_keys = ['content', 'directory', 'origin', 'revision']