diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -826,7 +826,9 @@
   target_type snapshot_target
 );
 
-create or replace function swh_snapshot_get_by_id(id snapshot.id%type)
+create or replace function swh_snapshot_get_by_id(id snapshot.id%type,
+    branches_from bytea default '', branches_count bigint default null,
+    target_types snapshot_target[] default NULL)
   returns setof snapshot_result
   language sql
   stable
@@ -836,6 +838,24 @@
   from snapshot_branches
   inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
   where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
+    and (target_types is null or target_type = any(target_types))
+    and name >= branches_from
+  order by name limit branches_count
+$$;
+
+create type snapshot_size as (
+  target_type snapshot_target,
+  count bigint
+);
+
+create or replace function swh_snapshot_count_branches(id snapshot.id%type)
+  returns setof snapshot_size
+  language sql
+  stable
+as $$
+  SELECT target_type, count(name)
+  from swh_snapshot_get_by_id(swh_snapshot_count_branches.id)
+  group by target_type;
 $$;
 
 create or replace function swh_snapshot_get_by_origin_visit(origin_id bigint, visit_id bigint)
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -12,7 +12,7 @@
 
 -- latest schema version
 insert into dbversion(version, release, description)
-      values(123, now(), 'Work In Progress');
+      values(124, now(), 'Work In Progress');
 
 -- a SHA1 checksum
 create domain sha1 as bytea check (length(value) = 20);
diff --git a/sql/upgrades/124.sql b/sql/upgrades/124.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/124.sql
@@ -0,0 +1,37 @@
+-- SWH DB schema upgrade
+-- from_version: 123
+-- to_version: 124
+-- description: Enable to paginate, filter and count snapshot content
+
+insert into dbversion(version, release, description)
+      values(124, now(), 'Work In Progress');
+
+DROP FUNCTION swh_snapshot_get_by_id(id public.sha1_git);
+
+CREATE TYPE snapshot_size AS (
+    target_type public.snapshot_target,
+    "count" bigint
+);
+
+-- swh_snapshot_get_by_id must be recreated before swh_snapshot_count_branches:
+-- SQL-language function bodies are validated when the function is created.
+CREATE OR REPLACE FUNCTION swh_snapshot_get_by_id(id public.sha1_git, branches_from bytea = '\x'::bytea, branches_count bigint = NULL::bigint, target_types public.snapshot_target[] = NULL::public.snapshot_target[]) RETURNS SETOF public.snapshot_result
+    LANGUAGE sql STABLE
+    AS $$
+  select
+    swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
+  from snapshot_branches
+  inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
+  where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
+    and (target_types is null or target_type = any(target_types))
+    and name >= branches_from
+  order by name limit branches_count
+$$;
+
+CREATE OR REPLACE FUNCTION swh_snapshot_count_branches(id public.sha1_git) RETURNS SETOF public.snapshot_size
+    LANGUAGE sql STABLE
+    AS $$
+  SELECT target_type, count(name)
+  from swh_snapshot_get_by_id(swh_snapshot_count_branches.id)
+  group by target_type;
+$$;
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -101,11 +101,15 @@
         })
 
     def snapshot_get(self, snapshot_id):
-        return self.post('snapshot', {'snapshot_id': snapshot_id})
+        return self.post('snapshot', {
+            'snapshot_id': snapshot_id
+        })
 
     def snapshot_get_by_origin_visit(self, origin, visit):
-        return self.post('snapshot/by_origin_visit', {'origin': origin,
-                                                      'visit': visit})
+        return self.post('snapshot/by_origin_visit', {
+            'origin': origin,
+            'visit': visit
+        })
 
     def snapshot_get_latest(self, origin, allowed_statuses=None):
         return self.post('snapshot/latest', {
@@ -113,6 +117,20 @@
             'allowed_statuses': allowed_statuses
         })
 
+    def snapshot_count_branches(self, snapshot_id):
+        return self.post('snapshot/count_branches', {
+            'snapshot_id': snapshot_id
+        })
+
+    def snapshot_get_branches(self, snapshot_id, branches_from=b'',
+                              branches_count=None, target_types=None):
+        return self.post('snapshot/get_branches', {
+            'snapshot_id': snapshot_id,
+            'branches_from': branches_from,
+            'branches_count': branches_count,
+            'target_types': target_types
+        })
+
     def origin_get(self, origin):
         return self.post('origin/get', {'origin': origin})
 
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -219,6 +219,18 @@
         **decode_request(request)))
 
 
+@app.route('/snapshot/count_branches', methods=['POST'])
+def snapshot_count_branches():
+    return encode_data(get_storage().snapshot_count_branches(
+        **decode_request(request)))
+
+
+@app.route('/snapshot/get_branches', methods=['POST'])
+def snapshot_get_branches():
+    return encode_data(get_storage().snapshot_get_branches(
+        **decode_request(request)))
+
+
 @app.route('/origin/get', methods=['POST'])
 def origin_get():
     return encode_data(get_storage().origin_get(**decode_request(request)))
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -359,15 +359,31 @@
         cur.execute("""SELECT swh_snapshot_add(%s, %s, %s)""",
                     (origin, visit, snapshot_id))
 
+    snapshot_count_cols = ['target_type', 'count']
+
+    def snapshot_count_branches(self, snapshot_id, cur=None):
+        cur = self._cursor(cur)
+        query = """\
+           SELECT %s FROM swh_snapshot_count_branches(%%s)
+        """ % ', '.join(self.snapshot_count_cols)
+
+        cur.execute(query, (snapshot_id,))
+
+        yield from cursor_to_bytes(cur)
+
     snapshot_get_cols = ['snapshot_id', 'name', 'target', 'target_type']
 
-    def snapshot_get_by_id(self, snapshot_id, cur=None):
+    def snapshot_get_by_id(self, snapshot_id, branches_from=b'',
+                           branches_count=None, target_types=None,
+                           cur=None):
         cur = self._cursor(cur)
         query = """\
-           SELECT %s FROM swh_snapshot_get_by_id(%%s)
+           SELECT %s
+           FROM swh_snapshot_get_by_id(%%s, %%s, %%s, %%s :: snapshot_target[])
         """ % ', '.join(self.snapshot_get_cols)
 
-        cur.execute(query, (snapshot_id,))
+        cur.execute(query, (snapshot_id, branches_from, branches_count,
+                            target_types))
 
         yield from cursor_to_bytes(cur)
 
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -757,18 +757,33 @@
     @db_transaction(statement_timeout=2000)
     def snapshot_get(self, snapshot_id, db=None, cur=None):
-        """Get the snapshot with the given id
+        """Get the content, possibly partial, of a snapshot with the given id
+
+        The branches of the snapshot are iterated in the lexicographical
+        order of their names.
+
+        .. warning:: At most 1000 branches contained in the snapshot will be
+            returned for performance reasons. In order to browse the whole
+            set of branches, the method :meth:`snapshot_get_branches`
+            should be used instead.
 
         Args:
-            snapshot_id (bytes): id of the snapshot
+            snapshot_id (bytes): identifier of the snapshot
 
         Returns:
-            dict: a snapshot with two keys:
-                id:: identifier for the snapshot
-                branches:: a list of branches contained by the snapshot
-
+            dict: a dict with three keys:
+                * **id**: identifier of the snapshot
+                * **branches**: a dict of branches contained in the snapshot
+                  whose keys are the branches' names.
+                * **next_branch**: the name of the first branch not returned
+                  or :const:`None` if the snapshot has at most 1000
+                  branches.
         """
+        max_branches = 1000
         branches = {}
-        for branch in db.snapshot_get_by_id(snapshot_id, cur):
+        next_branch = None
+        fetched_branches = list(db.snapshot_get_by_id(
+            snapshot_id, branches_count=max_branches+1, cur=cur))
+        for branch in fetched_branches[:max_branches]:
             branch = dict(zip(db.snapshot_get_cols, branch))
             del branch['snapshot_id']
             name = branch.pop('name')
@@ -776,26 +791,50 @@
                 branch = None
             branches[name] = branch
 
+        if len(fetched_branches) > max_branches:
+            branch = dict(zip(db.snapshot_get_cols, fetched_branches[-1]))
+            next_branch = branch['name']
+
         if branches:
-            return {'id': snapshot_id, 'branches': branches}
+            return {
+                'id': snapshot_id,
+                'branches': branches,
+                'next_branch': next_branch
+            }
 
         if db.snapshot_exists(snapshot_id, cur):
             # empty snapshot
-            return {'id': snapshot_id, 'branches': {}}
+            return {
+                'id': snapshot_id,
+                'branches': {},
+                'next_branch': None
+            }
 
         return None
 
     @db_transaction(statement_timeout=2000)
     def snapshot_get_by_origin_visit(self, origin, visit, db=None, cur=None):
-        """Get the snapshot for the given origin visit
+        """Get the content, possibly partial, of a snapshot for the given origin visit
+
+        The branches of the snapshot are iterated in the lexicographical
+        order of their names.
+
+        .. warning:: At most 1000 branches contained in the snapshot will be
+            returned for performance reasons. In order to browse the whole
+            set of branches, the method :meth:`snapshot_get_branches`
+            should be used instead.
 
         Args:
-                origin (int): the origin identifier
-                visit (int): the visit identifier
+            origin (int): the origin identifier
+            visit (int): the visit identifier
 
         Returns:
-            dict: a snapshot with two keys:
-                id:: identifier for the snapshot
-                branches:: a dictionary containing the snapshot branch information
+            dict: a dict with three keys:
+                * **id**: identifier of the snapshot
+                * **branches**: a dict of branches contained in the snapshot
+                  whose keys are the branches' names.
+                * **next_branch**: the name of the first branch not returned
+                  or :const:`None` if the snapshot has at most 1000
+                  branches.
         """
         snapshot_id = db.snapshot_get_by_origin_visit(origin, visit, cur)
@@ -817,20 +856,32 @@
     @db_transaction(statement_timeout=2000)
     def snapshot_get_latest(self, origin, allowed_statuses=None, db=None,
                             cur=None):
-        """Get the latest snapshot for the given origin, optionally only from visits
-        that have one of the given allowed_statuses.
+        """Get the content, possibly partial, of the latest snapshot for the
+        given origin, optionally only from visits that have one of the given
+        allowed_statuses
+
+        The branches of the snapshot are iterated in the lexicographical
+        order of their names.
+
+        .. warning:: At most 1000 branches contained in the snapshot will be
+            returned for performance reasons. In order to browse the whole
+            set of branches, the method :meth:`snapshot_get_branches`
+            should be used instead.
 
         Args:
             origin (int): the origin identifier
             allowed_statuses (list of str): list of visit statuses considered
-                    to find the latest snapshot for the visit. For instance,
-                    ``allowed_statuses=['full']`` will only consider visits that
-                    have successfully run to completion.
-
+                to find the latest snapshot for the visit. For instance,
+                ``allowed_statuses=['full']`` will only consider visits that
+                have successfully run to completion.
         Returns:
-            dict: a snapshot with two keys:
-                id:: identifier for the snapshot
-                branches:: a dictionary containing the snapshot branch information
+            dict: a dict with three keys:
+                * **id**: identifier of the snapshot
+                * **branches**: a dict of branches contained in the snapshot
+                  whose keys are the branches' names.
+                * **next_branch**: the name of the first branch not returned
+                  or :const:`None` if the snapshot has at most 1000
+                  branches.
         """
         origin_visit = db.origin_visit_get_latest_snapshot(
             origin, allowed_statuses=allowed_statuses, cur=cur)
@@ -838,6 +889,62 @@
             origin_visit = dict(zip(db.origin_visit_get_cols, origin_visit))
             return self.snapshot_get(origin_visit['snapshot'], db=db, cur=cur)
 
+    @db_transaction(statement_timeout=2000)
+    def snapshot_count_branches(self, snapshot_id, db=None, cur=None):
+        """Count the number of branches in the snapshot with the given id
+
+        Args:
+            snapshot_id (bytes): identifier of the snapshot
+
+        Returns:
+            dict: A dict whose keys are the target types of branches and
+                values their corresponding amount
+        """
+        return dict(db.snapshot_count_branches(snapshot_id, cur))
+
+    @db_transaction(statement_timeout=2000)
+    def snapshot_get_branches(self, snapshot_id, branches_from=b'',
+                              branches_count=None, target_types=None,
+                              db=None, cur=None):
+        """Get the content, possibly partial, of a snapshot with the given id
+
+        The branches of the snapshot are iterated in the lexicographical
+        order of their names.
+
+        Args:
+            snapshot_id (bytes): identifier of the snapshot
+            branches_from (bytes): optional parameter used to skip branches
+                whose name is lesser than it before returning them
+            branches_count (int): optional parameter used to restrain
+                the amount of returned branches
+            target_types (list): optional parameter used to filter the
+                target types of branch to return (possible values that can be
+                contained in that list are `'content', 'directory',
+                'revision', 'release', 'snapshot', 'alias'`)
+        Returns:
+            dict: a dict with two keys:
+                * **id**: identifier of the snapshot
+                * **branches**: a dict of branches contained in the snapshot
+                  whose keys are the branches' names.
+        """
+        branches = {}
+        for branch in db.snapshot_get_by_id(snapshot_id, branches_from,
+                                            branches_count, target_types, cur):
+            branch = dict(zip(db.snapshot_get_cols, branch))
+            del branch['snapshot_id']
+            name = branch.pop('name')
+            if branch == {'target': None, 'target_type': None}:
+                branch = None
+            branches[name] = branch
+
+        if branches:
+            return {'id': snapshot_id, 'branches': branches}
+
+        if db.snapshot_exists(snapshot_id, cur):
+            return {'id': snapshot_id, 'branches': {}}
+
+        return None
+
     @db_transaction()
     def occurrence_add(self, occurrences, db=None, cur=None):
         """Add occurrences to the storage
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -593,11 +593,13 @@
                     'target_type': self.occurrence['target_type'],
                 },
             },
+            'next_branch': None
         }
 
         self.empty_snapshot = {
             'id': hash_to_bytes('1a8893e6a86f444e8be8e7bda6cb34fb1735a00e'),
             'branches': {},
+            'next_branch': None
         }
 
         self.complete_snapshot = {
@@ -633,7 +635,8 @@
                     'target_type': 'snapshot',
                 },
                 b'dangling': None,
-            }
+            },
+            'next_branch': None
         }
 
     def tearDown(self):
@@ -1613,6 +1616,105 @@
         self.assertEqual(by_ov, self.complete_snapshot)
 
     @istest
+    def snapshot_add_count_branches(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        self.storage.snapshot_add(origin_id, visit_id, self.complete_snapshot)
+
+        snp_id = self.complete_snapshot['id']
+        snp_size = self.storage.snapshot_count_branches(snp_id)
+
+        expected_snp_size = {
+            'alias': 1,
+            'content': 1,
+            'directory': 1,
+            'release': 1,
+            'revision': 1,
+            'snapshot': 1,
+            None: 1
+        }
+
+        self.assertEqual(snp_size, expected_snp_size)
+
+    @istest
+    def snapshot_add_get_paginated(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        self.storage.snapshot_add(origin_id, visit_id, self.complete_snapshot)
+
+        snp_id = self.complete_snapshot['id']
+
+        snapshot = self.storage.snapshot_get_branches(snp_id,
+                                                      branches_from=b'release')
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        del expected_snapshot['next_branch']
+        for name in [b'alias', b'content', b'dangling', b'directory']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+        snapshot = self.storage.snapshot_get_branches(snp_id,
+                                                      branches_count=1)
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        del expected_snapshot['next_branch']
+        for name in [b'content', b'dangling', b'directory',
+                     b'release', b'revision', b'snapshot']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+        snapshot = self.storage.snapshot_get_branches(
+            snp_id, branches_from=b'directory', branches_count=3)
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        del expected_snapshot['next_branch']
+        for name in [b'alias', b'content', b'dangling', b'snapshot']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+    @istest
+    def snapshot_add_get_filtered(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        self.storage.snapshot_add(origin_id, visit_id, self.complete_snapshot)
+
+        snp_id = self.complete_snapshot['id']
+
+        snapshot = self.storage.snapshot_get_branches(
+            snp_id, target_types=['release', 'revision'])
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        del expected_snapshot['next_branch']
+        for name in [b'alias', b'content', b'dangling', b'directory',
+                     b'snapshot']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+        snapshot = self.storage.snapshot_get_branches(snp_id,
+                                                      target_types=['alias'])
+
+        expected_snapshot = copy.deepcopy(self.complete_snapshot)
+        del expected_snapshot['next_branch']
+        for name in [b'content', b'dangling', b'directory', b'release',
+                     b'revision', b'snapshot']:
+            del expected_snapshot['branches'][name]
+
+        self.assertEqual(snapshot, expected_snapshot)
+
+    @istest
     def snapshot_add_get(self):
         origin_id = self.storage.origin_add_one(self.origin)
         origin_visit1 = self.storage.origin_visit_add(origin_id,