diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -826,7 +826,9 @@
   target_type snapshot_target
 );
 
-create or replace function swh_snapshot_get_by_id(id snapshot.id%type)
+create or replace function swh_snapshot_get_by_id(id snapshot.id%type,
+    branches_offset bigint default NULL, branches_limit bigint default NULL,
+    branches_targets snapshot_target[] default NULL)
   returns setof snapshot_result
   language sql
   stable
@@ -836,6 +838,26 @@
   from snapshot_branches
   inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
   where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
+    -- keep the whole target filter in parentheses: "and" binds tighter than
+    -- "or", otherwise branches of other snapshots match on target_type alone
+    and (branches_targets is null or target_type = any(branches_targets)
+         or (target_type is null and (select bool_or(a is null) from unnest(branches_targets) s(a))))
+  order by name offset branches_offset limit branches_limit
+$$;
+
+create type snapshot_size as (
+  target_type snapshot_target,
+  count bigint
+);
+
+create or replace function swh_snapshot_branches_count(id snapshot.id%type)
+  returns setof snapshot_size
+  language sql
+  stable
+as $$
+  SELECT target_type, count(name)
+  from swh_snapshot_get_by_id(swh_snapshot_branches_count.id)
+  group by target_type;
 $$;
 
 create or replace function swh_snapshot_get_by_origin_visit(origin_id bigint, visit_id bigint)
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -12,7 +12,7 @@
 
 -- latest schema version
 insert into dbversion(version, release, description)
-      values(123, now(), 'Work In Progress');
+      values(124, now(), 'Work In Progress');
 
 -- a SHA1 checksum
 create domain sha1 as bytea check (length(value) = 20);
diff --git a/sql/upgrades/124.sql b/sql/upgrades/124.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/124.sql
@@ -0,0 +1,39 @@
+-- SWH DB schema upgrade
+-- from_version: 123
+-- to_version: 124
+-- description: Enable to paginate, filter and count snapshot content
+
+insert into dbversion(version, release, description)
+      values(124, now(), 'Work In Progress');
+
+DROP FUNCTION swh_snapshot_get_by_id(id public.sha1_git);
+
+CREATE TYPE snapshot_size AS (
+    target_type public.snapshot_target,
+    "count" bigint
+);
+
+-- swh_snapshot_get_by_id is recreated first: the SQL body of
+-- swh_snapshot_branches_count below calls it and is validated at creation time
+CREATE OR REPLACE FUNCTION swh_snapshot_get_by_id(id public.sha1_git, branches_offset bigint = NULL::bigint, branches_limit bigint = NULL::bigint, branches_targets public.snapshot_target[] = NULL::public.snapshot_target[]) RETURNS SETOF public.snapshot_result
+    LANGUAGE sql STABLE
+    AS $$
+  select
+    swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
+  from snapshot_branches
+  inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
+  where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
+    -- keep the whole target filter in parentheses: "and" binds tighter than
+    -- "or", otherwise branches of other snapshots match on target_type alone
+    and (branches_targets is null or target_type = any(branches_targets)
+         or (target_type is null and (select bool_or(a is null) from unnest(branches_targets) s(a))))
+  order by name offset branches_offset limit branches_limit
+$$;
+
+CREATE OR REPLACE FUNCTION swh_snapshot_branches_count(id public.sha1_git) RETURNS SETOF public.snapshot_size
+    LANGUAGE sql STABLE
+    AS $$
+  SELECT target_type, count(name)
+  from swh_snapshot_get_by_id(swh_snapshot_branches_count.id)
+  group by target_type;
+$$;
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -100,17 +100,40 @@
             'origin': origin, 'visit': visit, 'snapshot': snapshot,
         })
 
-    def snapshot_get(self, snapshot_id):
-        return self.post('snapshot', {'snapshot_id': snapshot_id})
+    def snapshot_branches_count(self, snapshot_id):
+        return self.post('snapshot/branches_count', {
+            'snapshot_id': snapshot_id
+        })
+
+    def snapshot_get(self, snapshot_id, branches_offset=None,
+                     branches_limit=None, branches_targets=None):
+        return self.post('snapshot', {
+            'snapshot_id': snapshot_id,
+            'branches_offset': branches_offset,
+            'branches_limit': branches_limit,
+            'branches_targets': branches_targets
+        })
 
-    def snapshot_get_by_origin_visit(self, origin, visit):
-        return self.post('snapshot/by_origin_visit', {'origin': origin,
-                                                      'visit': visit})
+    def snapshot_get_by_origin_visit(self, origin, visit,
+                                     branches_offset=None, branches_limit=None,
+                                     branches_targets=None):
+        return self.post('snapshot/by_origin_visit', {
+            'origin': origin,
+            'visit': visit,
+            'branches_offset': branches_offset,
+            'branches_limit': branches_limit,
+            'branches_targets': branches_targets
+        })
 
-    def snapshot_get_latest(self, origin, allowed_statuses=None):
+    def snapshot_get_latest(self, origin, allowed_statuses=None,
+                            branches_offset=None, branches_limit=None,
+                            branches_targets=None):
         return self.post('snapshot/latest', {
             'origin': origin,
-            'allowed_statuses': allowed_statuses
+            'allowed_statuses': allowed_statuses,
+            'branches_offset': branches_offset,
+            'branches_limit': branches_limit,
+            'branches_targets': branches_targets
         })
 
     def origin_get(self, origin):
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -202,6 +202,12 @@
     return encode_data(get_storage().snapshot_add(**decode_request(request)))
 
 
+@app.route('/snapshot/branches_count', methods=['POST'])
+def snapshot_branches_count():
+    return encode_data(get_storage().snapshot_branches_count(
+        **decode_request(request)))
+
+
 @app.route('/snapshot', methods=['POST'])
 def snapshot_get():
     return encode_data(get_storage().snapshot_get(**decode_request(request)))
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -359,15 +359,31 @@
         cur.execute("""SELECT swh_snapshot_add(%s, %s, %s)""",
                     (origin, visit, snapshot_id))
 
+    snapshot_count_cols = ['target_type', 'count']
+
+    def snapshot_branches_count(self, snapshot_id, cur=None):
+        cur = self._cursor(cur)
+        query = """\
+           SELECT %s FROM swh_snapshot_branches_count(%%s)
+        """ % ', '.join(self.snapshot_count_cols)
+
+        cur.execute(query, (snapshot_id,))
+
+        yield from cursor_to_bytes(cur)
+
     snapshot_get_cols = ['snapshot_id', 'name', 'target', 'target_type']
 
-    def snapshot_get_by_id(self, snapshot_id, cur=None):
+    def snapshot_get_by_id(self, snapshot_id, branches_offset=None,
+                           branches_limit=None, branches_targets=None,
+                           cur=None):
         cur = self._cursor(cur)
         query = """\
-           SELECT %s FROM swh_snapshot_get_by_id(%%s)
+           SELECT %s
+           FROM swh_snapshot_get_by_id(%%s, %%s, %%s, %%s :: snapshot_target[])
         """ % ', '.join(self.snapshot_get_cols)
 
-        cur.execute(query, (snapshot_id,))
+        cur.execute(query, (snapshot_id, branches_offset, branches_limit,
+                            branches_targets))
 
         yield from cursor_to_bytes(cur)
 
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -756,19 +756,46 @@
         db.snapshot_add(origin, visit, snapshot['id'], cur)
 
     @db_transaction(statement_timeout=2000)
-    def snapshot_get(self, snapshot_id, db=None, cur=None):
+    def snapshot_branches_count(self, snapshot_id, db=None, cur=None):
+        """Count the number of branches in the snapshot with the given id
+
+        Args:
+            snapshot_id (bytes): id of the snapshot
+
+        Returns:
+            dict: A dict whose keys are the target types of branches and
+            values their corresponding amount
+        """
+        return dict(
+            db.snapshot_branches_count(snapshot_id, cur))
+
+    @db_transaction(statement_timeout=2000)
+    def snapshot_get(self, snapshot_id, branches_offset=None,
+                     branches_limit=None, branches_targets=None,
+                     db=None, cur=None):
         """Get the snapshot with the given id
 
         Args:
-           snapshot_id (bytes): id of the snapshot
+            snapshot_id (bytes): id of the snapshot
+            branches_offset (int): optional parameter used to skip a
+                given amount of branches before returning them
+            branches_limit (int): optional parameter used to restrain
+                the amount of returned branches
+            branches_targets (list): optional parameter used to filter the
+                types of branch to return (possible values that can be
+                contained in that list are 'content', 'directory', 'revision',
+                'release', 'snapshot', 'alias' or None for dangling branches)
 
         Returns:
-           dict: a snapshot with two keys:
-             id:: identifier for the snapshot
-             branches:: a list of branches contained by the snapshot
+            dict: a snapshot with two keys:
+                id:: identifier for the snapshot
+                branches:: a list of branches contained by the snapshot
+                    sorted by their names in ascending order
         """
         branches = {}
-        for branch in db.snapshot_get_by_id(snapshot_id, cur):
+        for branch in db.snapshot_get_by_id(snapshot_id, branches_offset,
+                                            branches_limit, branches_targets,
+                                            cur):
             branch = dict(zip(db.snapshot_get_cols, branch))
             del branch['snapshot_id']
             name = branch.pop('name')
@@ -786,22 +813,35 @@
             return None
 
     @db_transaction(statement_timeout=2000)
-    def snapshot_get_by_origin_visit(self, origin, visit, db=None, cur=None):
+    def snapshot_get_by_origin_visit(self, origin, visit, branches_offset=None,
+                                     branches_limit=None, branches_targets=None,  # noqa
+                                     db=None, cur=None):
         """Get the snapshot for the given origin visit
 
         Args:
-           origin (int): the origin identifier
-           visit (int): the visit identifier
+            origin (int): the origin identifier
+            visit (int): the visit identifier
+            branches_offset (int): optional parameter used to skip a
+                given amount of branches before returning them
+            branches_limit (int): optional parameter used to restrain
+                the amount of returned branches
+            branches_targets (list): optional parameter used to filter the
+                types of branch to return (possible values that can be
+                contained in that list are 'content', 'directory', 'revision',
+                'release', 'snapshot', 'alias' or None for dangling branches)
 
         Returns:
-           dict: a snapshot with two keys:
-             id:: identifier for the snapshot
-             branches:: a dictionary containing the snapshot branch information
+            dict: a snapshot with two keys:
+                id:: identifier for the snapshot
+                branches:: a dictionary containing the snapshot branch
+                    information
         """
         snapshot_id = db.snapshot_get_by_origin_visit(origin, visit, cur)
 
         if snapshot_id:
-            return self.snapshot_get(snapshot_id, db=db, cur=cur)
+            return self.snapshot_get(snapshot_id, branches_offset,
+                                     branches_limit, branches_targets,
+                                     db=db, cur=cur)
         else:
             # compatibility code during the snapshot migration
             origin_visit_info = self.origin_visit_get_by(origin, visit,
@@ -815,28 +855,40 @@
                 return None
 
     @db_transaction(statement_timeout=2000)
-    def snapshot_get_latest(self, origin, allowed_statuses=None, db=None,
-                            cur=None):
+    def snapshot_get_latest(self, origin, allowed_statuses=None,
+                            branches_offset=None, branches_limit=None,
+                            branches_targets=None, db=None, cur=None):
         """Get the latest snapshot for the given origin, optionally only from visits
         that have one of the given allowed_statuses.
 
         Args:
             origin (int): the origin identifier
             allowed_statuses (list of str): list of visit statuses considered
-              to find the latest snapshot for the visit. For instance,
-              ``allowed_statuses=['full']`` will only consider visits that
-              have successfully run to completion.
+                to find the latest snapshot for the visit. For instance,
+                ``allowed_statuses=['full']`` will only consider visits that
+                have successfully run to completion.
+            branches_offset (int): optional parameter used to skip a
+                given amount of branches before returning them
+            branches_limit (int): optional parameter used to restrain
+                the amount of returned branches
+            branches_targets (list): optional parameter used to filter the
+                types of branch to return (possible values that can be
+                contained in that list are 'content', 'directory', 'revision',
+                'release', 'snapshot', 'alias' or None for dangling branches)
 
         Returns:
-           dict: a snapshot with two keys:
-             id:: identifier for the snapshot
-             branches:: a dictionary containing the snapshot branch information
+            dict: a snapshot with two keys:
+                id:: identifier for the snapshot
+                branches:: a dictionary containing the snapshot branch
+                    information
         """
         origin_visit = db.origin_visit_get_latest_snapshot(
             origin, allowed_statuses=allowed_statuses, cur=cur)
         if origin_visit:
             origin_visit = dict(zip(db.origin_visit_get_cols, origin_visit))
-            return self.snapshot_get(origin_visit['snapshot'], db=db, cur=cur)
+            return self.snapshot_get(origin_visit['snapshot'], branches_offset,
+                                     branches_limit, branches_targets, db=db,
+                                     cur=cur)
 
     @db_transaction()
     def occurrence_add(self, occurrences, db=None, cur=None):
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -1613,6 +1613,102 @@
         self.assertEqual(by_ov, self.complete_snapshot)
 
     @istest
+    def snapshot_add_count_branches(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        self.storage.snapshot_add(origin_id, visit_id, self.complete_snapshot)
+
+        snp_id = self.complete_snapshot['id']
+        snp_size = self.storage.snapshot_branches_count(snp_id)
+
+        expected_snp_size = {
+            'alias': 1,
+            'content': 1,
+            'directory': 1,
+            'release': 1,
+            'revision': 1,
+            'snapshot': 1,
+            None: 1
+        }
+
+        self.assertEqual(snp_size, expected_snp_size)
+
+    @istest
+    def snapshot_add_get_paginated(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        self.storage.snapshot_add(origin_id, visit_id, self.complete_snapshot)
+
+        snp_id = self.complete_snapshot['id']
+
+        snapshot = self.storage.snapshot_get(snp_id,
+                                             branches_offset=4)
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        for name in [b'alias', b'content', b'dangling', b'directory']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+        snapshot = self.storage.snapshot_get(snp_id,
+                                             branches_limit=1)
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        for name in [b'content', b'dangling', b'directory',
+                     b'release', b'revision', b'snapshot']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+        snapshot = self.storage.snapshot_get(snp_id,
+                                             branches_offset=2,
+                                             branches_limit=3)
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        for name in [b'alias', b'content', b'revision', b'snapshot']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+    @istest
+    def snapshot_add_get_filtered(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        self.storage.snapshot_add(origin_id, visit_id, self.complete_snapshot)
+
+        snp_id = self.complete_snapshot['id']
+
+        snapshot = self.storage.snapshot_get(snp_id,
+                                             branches_targets=['release',
+                                                               'revision'])
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        for name in [b'alias', b'content', b'dangling', b'directory',
+                     b'snapshot']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+        snapshot = self.storage.snapshot_get(snp_id,
+                                             branches_targets=['alias', None])
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        for name in [b'content', b'directory', b'release', b'revision',
+                     b'snapshot']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+    @istest
     def snapshot_add_get(self):
         origin_id = self.storage.origin_add_one(self.origin)
         origin_visit1 = self.storage.origin_visit_add(origin_id,