Add snapshots to the storage (database schema version 115)

A snapshot is a set of named branches. Each branch targets a content, a
directory, a revision, a release, another snapshot or an alias (the name of
another branch); a branch without target is a dangling branch.

This patch adds:

 * sql: the snapshot_target enum, the snapshot, snapshot_branch and
   snapshot_branches tables with their indexes and constraints, and the
   swh_mktemp_snapshot_branch / swh_snapshot_* functions;
 * storage and api: snapshot_add, snapshot_get and
   snapshot_get_by_origin_visit, plus compatibility code that still fills
   occurrences from the branches of an added snapshot;
 * tests for the new storage methods.

diff --git a/sql/swh-enums.sql b/sql/swh-enums.sql
--- a/sql/swh-enums.sql
+++ b/sql/swh-enums.sql
@@ -40,9 +40,12 @@
 create type revision_type as enum ('git', 'tar', 'dsc', 'svn');
 comment on type revision_type is 'Possible revision types';
 
-create type object_type as enum ('content', 'directory', 'revision', 'release');
+create type object_type as enum ('content', 'directory', 'revision', 'release', 'snapshot');
 comment on type object_type is 'Data object types stored in data model';
 
+create type snapshot_target as enum ('content', 'directory', 'revision', 'release', 'snapshot', 'alias');
+comment on type snapshot_target is 'Types of targets for snapshot branches';
+
 create type origin_visit_status as enum (
   'ongoing',
   'full',
diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -130,6 +130,17 @@
     ) on commit drop;
 $$;
 
+-- create a temporary table for the branches of a snapshot
+create or replace function swh_mktemp_snapshot_branch()
+    returns void
+    language sql
+as $$
+    create temporary table tmp_snapshot_branch (
+        name bytea not null,
+        target bytea,
+        target_type snapshot_target
+    ) on commit drop;
+$$;
 
 create or replace function swh_mktemp_tool()
     returns void
@@ -937,6 +948,76 @@
 end
 $$;
 
+-- add the snapshot described by tmp_snapshot_branch (unless its id is already
+-- known) and attach it to the given origin visit
+create or replace function swh_snapshot_add(origin bigint, visit bigint, snapshot_id snapshot.id%type)
+  returns void
+  language plpgsql
+as $$
+declare
+  snapshot_object_id snapshot.object_id%type;
+begin
+  select object_id from snapshot where id = snapshot_id into snapshot_object_id;
+  if snapshot_object_id is null then
+     insert into snapshot (id) values (snapshot_id) returning object_id into snapshot_object_id;
+     -- the statements of a WITH query share one snapshot of the tables: rows
+     -- created by "inserted" are invisible to the join, so the union is disjoint
+     with all_branches(name, target_type, target) as (
+       select name, target_type, target from tmp_snapshot_branch
+     ), inserted as (
+       insert into snapshot_branch (name, target_type, target)
+       select name, target_type, target from all_branches
+       on conflict do nothing
+       returning object_id
+     )
+     insert into snapshot_branches (snapshot_id, branch_id)
+     select snapshot_object_id, object_id as branch_id from inserted
+     union all
+     select snapshot_object_id, object_id as branch_id
+     from all_branches ab
+     join snapshot_branch sb
+     on sb.name = ab.name
+     and sb.target_type is not distinct from ab.target_type
+     and sb.target is not distinct from ab.target;
+  end if;
+  update origin_visit ov
+  set snapshot_id = snapshot_object_id
+  where ov.origin=swh_snapshot_add.origin and ov.visit=swh_snapshot_add.visit;
+end;
+$$;
+
+create type snapshot_result as (
+  snapshot_id sha1_git,
+  name bytea,
+  target bytea,
+  target_type snapshot_target
+);
+
+-- list the branches of the snapshot with the given id
+create or replace function swh_snapshot_get_by_id(id snapshot.id%type)
+  returns setof snapshot_result
+  language sql
+  stable
+as $$
+  select
+    swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
+  from snapshot_branches
+  inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
+  where snapshot_branches.snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
+$$;
+
+-- get the id of the snapshot attached to the given origin visit (null if none)
+create or replace function swh_snapshot_get_by_origin_visit(origin_id bigint, visit_id bigint)
+  returns snapshot.id%type
+  language sql
+  stable
+as $$
+  select snapshot.id
+  from origin_visit
+  left join snapshot
+  on snapshot.object_id = origin_visit.snapshot_id
+  where origin_visit.origin=origin_id and origin_visit.visit=visit_id;
+$$;
 
 -- Absolute path: directory reference + complete path relative to it
 create type content_dir as (
diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql
--- a/sql/swh-indexes.sql
+++ b/sql/swh-indexes.sql
@@ -153,6 +153,37 @@
 alter table revision_history add constraint revision_history_id_fkey foreign key (id) references revision(id) not valid;
 alter table revision_history validate constraint revision_history_id_fkey;
 
+-- snapshot
+create unique index concurrently snapshot_pkey on snapshot(object_id);
+alter table snapshot add primary key using index snapshot_pkey;
+
+create unique index concurrently on snapshot(id);
+
+-- snapshot_branch
+create unique index concurrently snapshot_branch_pkey on snapshot_branch(object_id);
+alter table snapshot_branch add primary key using index snapshot_branch_pkey;
+
+create unique index concurrently on snapshot_branch (target_type, target, name);
+-- a branch has either both a target and a target type, or neither (dangling branch)
+alter table snapshot_branch add constraint snapshot_branch_target_check check ((target_type is null) = (target is null)) not valid;
+alter table snapshot_branch validate constraint snapshot_branch_target_check;
+-- every target but an alias must be 20 bytes long; a null target_type passes the check
+alter table snapshot_branch add constraint snapshot_target_check check (target_type not in ('content', 'directory', 'revision', 'release', 'snapshot') or length(target) = 20) not valid;
+alter table snapshot_branch validate constraint snapshot_target_check;
+
+-- nulls are distinct in the unique index above: dangling branches are made unique by name here
+create unique index concurrently on snapshot_branch (name) where target_type is null and target is null;
+
+-- snapshot_branches
+create unique index concurrently snapshot_branches_pkey on snapshot_branches(snapshot_id, branch_id);
+alter table snapshot_branches add primary key using index snapshot_branches_pkey;
+
+alter table snapshot_branches add constraint snapshot_branches_snapshot_id_fkey foreign key (snapshot_id) references snapshot(object_id) not valid;
+alter table snapshot_branches validate constraint snapshot_branches_snapshot_id_fkey;
+
+alter table snapshot_branches add constraint snapshot_branches_branch_id_fkey foreign key (branch_id) references snapshot_branch(object_id) not valid;
+alter table snapshot_branches validate constraint snapshot_branches_branch_id_fkey;
+
 -- origin_visit
 create unique index concurrently origin_visit_pkey on origin_visit(origin, visit);
 alter table origin_visit add primary key using index origin_visit_pkey;
@@ -162,6 +193,9 @@
 alter table origin_visit add constraint origin_visit_origin_fkey foreign key (origin) references origin(id) not valid;
 alter table origin_visit validate constraint origin_visit_origin_fkey;
 
+alter table origin_visit add constraint origin_visit_snapshot_id_fkey foreign key (snapshot_id) references snapshot(object_id) not valid;
+alter table origin_visit validate constraint origin_visit_snapshot_id_fkey;
+
 -- occurrence_history
 create unique index concurrently occurrence_history_pkey on occurrence_history(object_id);
 alter table occurrence_history add primary key using index occurrence_history_pkey;
@@ -180,7 +214,6 @@
 alter table occurrence add constraint occurrence_origin_fkey foreign key (origin) references origin(id) not valid;
 alter table occurrence validate constraint occurrence_origin_fkey;
 
-
 -- release
 create unique index concurrently release_pkey on release(id);
 alter table release add primary key using index release_pkey;
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -14,7 +14,7 @@
 );
 
 insert into dbversion(version, release, description)
-      values(114, now(), 'Work In Progress');
+      values(115, now(), 'Work In Progress');
 
 -- a SHA1 checksum (not necessarily originating from Git)
 create domain sha1 as bytea check (length(value) = 20);
@@ -293,11 +293,12 @@
 
 -- The timestamps at which Software Heritage has made a visit of the given origin.
 create table origin_visit (
-  origin   bigint not null,
-  visit    bigint not null,
-  date     timestamptz not null,
-  status   origin_visit_status not null,
-  metadata jsonb
+  origin       bigint not null,
+  visit        bigint not null,
+  date         timestamptz not null,
+  status       origin_visit_status not null,
+  metadata     jsonb,
+  snapshot_id  bigint
 );
 
 comment on column origin_visit.origin is 'Visited origin';
@@ -305,6 +306,7 @@
 comment on column origin_visit.date is 'Visit date for that origin';
 comment on column origin_visit.status is 'Visit status for that origin';
 comment on column origin_visit.metadata is 'Metadata associated with the visit';
+comment on column origin_visit.snapshot_id is 'id of the snapshot associated with the visit';
 
 
 -- The content of software origins is indexed starting from top-level pointers
@@ -321,7 +323,8 @@
   target_type object_type not null,  -- ref target type
   visits      bigint[] not null,     -- the visits where that occurrence was valid. References
                                      -- origin_visit(visit), where o_h.origin = origin_visit.origin.
-  object_id   bigserial not null     -- short object identifier
+  object_id   bigserial not null,    -- short object identifier
+  snapshot_branch_id bigint
 );
 
 -- Materialized view of occurrence_history, storing the *current* value of each
@@ -334,6 +337,28 @@
   target_type object_type not null
 );
 
+
+-- A snapshot, looked up by its sha1_git id; object_id is its short identifier.
+create table snapshot (
+  object_id  bigserial not null,
+  id         sha1_git
+);
+
+-- A named branch and its target; target and target_type are both null for a dangling branch.
+create table snapshot_branch (
+  object_id    bigserial not null,
+  name         bytea not null,
+  target       bytea,
+  target_type  snapshot_target
+);
+
+-- Association between snapshots and the branches they contain.
+create table snapshot_branches (
+  snapshot_id  bigint not null,
+  branch_id    bigint not null
+);
+
+
 -- A "memorable" point in the development history of a project.
 --
 -- Synonyms/mappings:
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -104,6 +104,17 @@
     def occurrence_add(self, occurrences):
         return self.post('occurrence/add', {'occurrences': occurrences})
 
+    def snapshot_add(self, origin, visit, snapshot):
+        return self.post('snapshot/add', {'origin': origin, 'visit': visit,
+                                          'snapshot': snapshot})
+
+    def snapshot_get(self, snapshot_id):
+        return self.post('snapshot', {'snapshot_id': snapshot_id})
+
+    def snapshot_get_by_origin_visit(self, origin, visit):
+        return self.post('snapshot/by_origin_visit', {'origin': origin,
+                                                      'visit': visit})
+
     def origin_get(self, origin):
         return self.post('origin/get', {'origin': origin})
 
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -192,6 +192,22 @@
     return encode_data(g.storage.occurrence_add(**decode_request(request)))
 
 
+@app.route('/snapshot/add', methods=['POST'])
+def snapshot_add():
+    return encode_data(g.storage.snapshot_add(**decode_request(request)))
+
+
+@app.route('/snapshot', methods=['POST'])
+def snapshot_get():
+    return encode_data(g.storage.snapshot_get(**decode_request(request)))
+
+
+@app.route('/snapshot/by_origin_visit', methods=['POST'])
+def snapshot_get_by_origin_visit():
+    return encode_data(g.storage.snapshot_get_by_origin_visit(
+        **decode_request(request)))
+
+
 @app.route('/origin/get', methods=['POST'])
 def origin_get():
     return encode_data(g.storage.origin_get(**decode_request(request)))
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -205,6 +205,9 @@
     @stored_procedure('swh_mktemp_occurrence_history')
     def mktemp_occurrence_history(self, cur=None): pass
 
+    @stored_procedure('swh_mktemp_snapshot_branch')
+    def mktemp_snapshot_branch(self, cur=None): pass
+
     @stored_procedure('swh_mktemp_entity_lister')
     def mktemp_entity_lister(self, cur=None): pass
 
@@ -316,6 +319,46 @@
 
         yield from cursor_to_bytes(cur)
 
+    def snapshot_exists(self, snapshot_id, cur=None):
+        """Check whether a snapshot with the given id exists"""
+        cur = self._cursor(cur)
+
+        cur.execute("""SELECT 1 FROM snapshot where id=%s""", (snapshot_id,))
+
+        return bool(cur.fetchone())
+
+    def snapshot_add(self, origin, visit, snapshot_id, cur=None):
+        """Add a snapshot for origin/visit from the temporary table"""
+        cur = self._cursor(cur)
+
+        cur.execute("""SELECT swh_snapshot_add(%s, %s, %s)""",
+                    (origin, visit, snapshot_id))
+
+    snapshot_get_cols = ['snapshot_id', 'name', 'target', 'target_type']
+
+    def snapshot_get_by_id(self, snapshot_id, cur=None):
+        """Yield the branches of the snapshot, as snapshot_get_cols rows"""
+        cur = self._cursor(cur)
+        query = """\
+           SELECT %s FROM swh_snapshot_get_by_id(%%s)
+        """ % ', '.join(self.snapshot_get_cols)
+
+        cur.execute(query, (snapshot_id,))
+
+        yield from cursor_to_bytes(cur)
+
+    def snapshot_get_by_origin_visit(self, origin_id, visit_id, cur=None):
+        """Get the id of the snapshot for the given origin visit"""
+        cur = self._cursor(cur)
+        query = """\
+           SELECT swh_snapshot_get_by_origin_visit(%s, %s)
+        """
+
+        cur.execute(query, (origin_id, visit_id))
+        ret = cur.fetchone()
+        if ret:
+            return line_to_bytes(ret)[0]
+
     content_find_cols = ['sha1', 'sha1_git', 'sha256', 'blake2s256',
                          'length', 'ctime', 'status']
 
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -760,6 +760,132 @@
         )
 
     @db_transaction
+    def snapshot_add(self, origin, visit, snapshot, cur=None):
+        """Add a snapshot for the given origin/visit couple
+
+        Args:
+            origin (int): id of the origin
+            visit (int): id of the visit
+            snapshot (dict): the snapshot to add to the visit, containing the
+              following keys:
+
+              - **id** (:class:`bytes`): id of the snapshot
+              - **branches** (:class:`dict`): branches the snapshot contains,
+                mapping the branch name (:class:`bytes`) to the branch target,
+                itself a :class:`dict` (or ``None`` if the branch points to an
+                unknown object)
+
+                - **target_type** (:class:`str`): one of ``content``,
+                  ``directory``, ``revision``, ``release``,
+                  ``snapshot``, ``alias``
+                - **target** (:class:`bytes`): identifier of the target
+                  (currently a ``sha1_git`` for all object kinds, or the name
+                  of the target branch for aliases)
+        """
+        db = self.db
+
+        if not db.snapshot_exists(snapshot['id'], cur):
+            db.mktemp_snapshot_branch(cur)
+            db.copy_to(
+                (
+                    {
+                        'name': name,
+                        'target': info['target'] if info else None,
+                        'target_type': info['target_type'] if info else None,
+                    }
+                    for name, info in snapshot['branches'].items()
+                ),
+                'tmp_snapshot_branch',
+                ['name', 'target', 'target_type'],
+                cur,
+            )
+
+        db.snapshot_add(origin, visit, snapshot['id'], cur)
+
+        # TODO: drop this compat feature
+        occurrences = []
+        for name, info in snapshot['branches'].items():
+            if not info:
+                target = b'\x00' * 20
+                target_type = 'revision'
+            elif info['target_type'] == 'alias':
+                continue
+            else:
+                target = info['target']
+                target_type = info['target_type']
+
+            occurrences.append({
+                'origin': origin,
+                'visit': visit,
+                'branch': name,
+                'target': target,
+                'target_type': target_type,
+            })
+
+        self.occurrence_add(occurrences, cur=cur)
+
+    @db_transaction
+    def snapshot_get(self, snapshot_id, cur=None):
+        """Get the snapshot with the given id
+
+        Args:
+            snapshot_id (bytes): id of the snapshot
+        Returns:
+            dict: a snapshot with two keys:
+                id:: identifier for the snapshot
+                branches:: a dict of the branches contained by the snapshot
+
+        """
+        db = self.db
+
+        branches = {}
+        for branch in db.snapshot_get_by_id(snapshot_id, cur):
+            branch = dict(zip(db.snapshot_get_cols, branch))
+            del branch['snapshot_id']
+            name = branch.pop('name')
+            if branch == {'target': None, 'target_type': None}:
+                branch = None
+            branches[name] = branch
+
+        if branches:
+            return {'id': snapshot_id, 'branches': branches}
+
+        if db.snapshot_exists(snapshot_id, cur):
+            # empty snapshot
+            return {'id': snapshot_id, 'branches': {}}
+
+        return None
+
+    @db_transaction
+    def snapshot_get_by_origin_visit(self, origin, visit, cur=None):
+        """Get the snapshot for the given origin visit
+
+        Args:
+            origin (int): the origin identifier
+            visit (int): the visit identifier
+        Returns:
+            dict: a snapshot with two keys:
+                id:: identifier for the snapshot
+                branches:: a dictionary containing the snapshot branch information
+
+        """
+        db = self.db
+
+        snapshot_id = db.snapshot_get_by_origin_visit(origin, visit, cur)
+
+        if snapshot_id:
+            return self.snapshot_get(snapshot_id, cur=cur)
+        else:
+            # compatibility code during the snapshot migration
+            origin_visit_info = self.origin_visit_get_by(origin, visit,
+                                                         cur=cur)
+            if origin_visit_info is None:
+                return None
+            ret = {'id': None}
+            ret['branches'] = origin_visit_info['occurrences']
+            return ret
+
+    @db_transaction
     def occurrence_add(self, occurrences, cur=None):
         """Add occurrences to the storage
 
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -380,7 +380,7 @@
 
         self.occurrence = {
             'branch': b'master',
-            'target': b'67890123456789012345',
+            'target': self.revision['id'],
             'target_type': 'revision',
         }
 
@@ -584,6 +584,57 @@
             },
         }
 
+        self.snapshot = {
+            'id': hash_to_bytes('2498dbf535f882bc7f9a18fb16c9ad27fda7bab7'),
+            'branches': {
+                self.occurrence['branch']: {
+                    'target': self.occurrence['target'],
+                    'target_type': self.occurrence['target_type'],
+                },
+            },
+        }
+
+        self.empty_snapshot = {
+            'id': hash_to_bytes('1a8893e6a86f444e8be8e7bda6cb34fb1735a00e'),
+            'branches': {},
+        }
+
+        self.complete_snapshot = {
+            'id': hash_to_bytes('6e65b86363953b780d92b0a928f3e8fcdd10db36'),
+            'branches': {
+                b'directory': {
+                    'target': hash_to_bytes(
+                        '1bd0e65f7d2ff14ae994de17a1e7fe65111dcad8'),
+                    'target_type': 'directory',
+                },
+                b'content': {
+                    'target': hash_to_bytes(
+                        'fe95a46679d128ff167b7c55df5d02356c5a1ae1'),
+                    'target_type': 'content',
+                },
+                b'alias': {
+                    'target': b'revision',
+                    'target_type': 'alias',
+                },
+                b'revision': {
+                    'target': hash_to_bytes(
+                        'aafb16d69fd30ff58afdd69036a26047f3aebdc6'),
+                    'target_type': 'revision',
+                },
+                b'release': {
+                    'target': hash_to_bytes(
+                        '7045404f3d1c54e6473c71bbb716529fbad4be24'),
+                    'target_type': 'release',
+                },
+                b'snapshot': {
+                    'target': hash_to_bytes(
+                        '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e'),
+                    'target_type': 'snapshot',
+                },
+                b'dangling': None,
+            }
+        }
+
     def tearDown(self):
         self.reset_storage_tables()
         super().tearDown()
@@ -1604,6 +1655,129 @@
         self.assertEquals(actual_occurrence[0], expected_occurrence)
 
     @istest
+    def snapshot_add_get_empty(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        self.storage.snapshot_add(origin_id, visit_id, self.empty_snapshot)
+
+        by_id = self.storage.snapshot_get(self.empty_snapshot['id'])
+        self.assertEqual(by_id, self.empty_snapshot)
+
+        by_ov = self.storage.snapshot_get_by_origin_visit(origin_id, visit_id)
+        self.assertEqual(by_ov, self.empty_snapshot)
+
+    @istest
+    def snapshot_add_get_complete(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        self.storage.snapshot_add(origin_id, visit_id, self.complete_snapshot)
+
+        by_id = self.storage.snapshot_get(self.complete_snapshot['id'])
+        self.assertEqual(by_id, self.complete_snapshot)
+
+        by_ov = self.storage.snapshot_get_by_origin_visit(origin_id, visit_id)
+        self.assertEqual(by_ov, self.complete_snapshot)
+
+    @istest
+    def snapshot_add_get(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        self.storage.snapshot_add(origin_id, visit_id, self.snapshot)
+
+        by_id = self.storage.snapshot_get(self.snapshot['id'])
+        self.assertEqual(by_id, self.snapshot)
+
+        by_ov = self.storage.snapshot_get_by_origin_visit(origin_id, visit_id)
+        self.assertEqual(by_ov, self.snapshot)
+
+        # retrocompat test
+        origin_visit_info = self.storage.origin_visit_get_by(origin_id,
+                                                             visit_id)
+        self.assertEqual(origin_visit_info['occurrences'],
+                         self.snapshot['branches'])
+
+    @istest
+    def snapshot_add_twice(self):
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit1_id = origin_visit1['visit']
+        self.storage.snapshot_add(origin_id, visit1_id, self.snapshot)
+
+        by_ov1 = self.storage.snapshot_get_by_origin_visit(origin_id,
+                                                           visit1_id)
+        self.assertEqual(by_ov1, self.snapshot)
+
+        origin_visit2 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit2)
+        visit2_id = origin_visit2['visit']
+
+        self.storage.snapshot_add(origin_id, visit2_id, self.snapshot)
+
+        by_ov2 = self.storage.snapshot_get_by_origin_visit(origin_id,
+                                                           visit2_id)
+        self.assertEqual(by_ov2, self.snapshot)
+
+    @istest
+    def snapshot_get_nonexistent(self):
+        bogus_snapshot_id = b'bogus snapshot id 00'
+        bogus_origin_id = 1
+        bogus_visit_id = 1
+
+        by_id = self.storage.snapshot_get(bogus_snapshot_id)
+        self.assertIsNone(by_id)
+
+        by_ov = self.storage.snapshot_get_by_origin_visit(bogus_origin_id,
+                                                          bogus_visit_id)
+        self.assertIsNone(by_ov)
+
+    @istest
+    def snapshot_get_retrocompat(self):
+        empty_retro_snapshot = {
+            'id': None,
+            'branches': {},
+        }
+        origin_id = self.storage.origin_add_one(self.origin)
+        origin_visit1 = self.storage.origin_visit_add(origin_id,
+                                                      self.date_visit1)
+        visit_id = origin_visit1['visit']
+
+        by_ov = self.storage.snapshot_get_by_origin_visit(origin_id, visit_id)
+
+        self.assertEqual(by_ov, empty_retro_snapshot)
+
+        self.storage.revision_add([self.revision])
+        self.storage.occurrence_add([{
+            'origin': origin_id,
+            'visit': visit_id,
+            'branch': self.occurrence['branch'],
+            'target': self.occurrence['target'],
+            'target_type': self.occurrence['target_type'],
+        }])
+
+        one_branch_retro_snapshot = {
+            'id': None,
+            'branches': {
+                self.occurrence['branch']: {
+                    'target': self.occurrence['target'],
+                    'target_type': self.occurrence['target_type'],
+                },
+            },
+        }
+
+        by_ov = self.storage.snapshot_get_by_origin_visit(origin_id, visit_id)
+        self.assertEqual(by_ov, one_branch_retro_snapshot)
+
+    @istest
     def entity_get_from_lister_metadata(self):
         self.storage.entity_add([self.entity1])
 