diff --git a/sql/upgrades/132.sql b/sql/upgrades/132.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/132.sql @@ -0,0 +1,12 @@ +-- SWH DB schema upgrade +-- from_version: 131 +-- to_version: 132 +-- description: Use sha1 instead of bigint as FK from origin_visit to snapshot (part 2: backfill) + +insert into dbversion(version, release, description) + values(132, now(), 'Work In Progress'); + +update origin_visit + set snapshot=snapshot.id + from snapshot + where snapshot.object_id=origin_visit.snapshot_id; diff --git a/sql/upgrades/133.sql b/sql/upgrades/133.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/133.sql @@ -0,0 +1,12 @@ +-- SWH DB schema upgrade +-- from_version: 132 +-- to_version: 133 +-- description: Use sha1 instead of bigint as FK from origin_visit to snapshot (part 3: remove old column) + +insert into dbversion(version, release, description) + values(133, now(), 'Work In Progress'); + +drop index constraint origin_visit_snapshot_id_fkey; +alter table origin_visit drop column snapshot_id; + +drop function swh_snapshot_get_by_origin_visit; diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -175,7 +175,8 @@ def snapshot_get_by_origin_visit(self, origin_id, visit_id, cur=None): cur = self._cursor(cur) query = """\ - SELECT swh_snapshot_get_by_origin_visit(%s, %s) + SELECT snapshot from origin_visit where + origin_visit.origin=%s and origin_visit.visit=%s; """ cur.execute(query, (origin_id, visit_id)) @@ -311,15 +312,8 @@ update_cols.append('metadata=%s') values.append(jsonize(updates.pop('metadata'))) if 'snapshot' in updates: - # New 'snapshot' column update_cols.append('snapshot=%s') - values.append(updates['snapshot']) - - # Old 'snapshot_id' column - update_cols.append('snapshot_id=snapshot.object_id') - from_ = 'FROM snapshot' - where.append('snapshot.id=%s') - where_values.append(updates.pop('snapshot')) + values.append(updates.pop('snapshot')) assert not updates, 'Unknown fields: %r' % updates query = """UPDATE origin_visit SET {update_cols} @@ -355,13 +349,12 @@ args = (origin_id, limit) query = """\ - SELECT %s, - (select id from snapshot where object_id = snapshot_id) as snapshot + SELECT %s FROM origin_visit WHERE origin=%%s %s order by visit asc limit %%s""" % ( - ', '.join(self.origin_visit_get_cols[:-1]), extra_condition + ', '.join(self.origin_visit_get_cols), extra_condition ) cur.execute(query, args) @@ -382,12 +375,10 @@ cur = self._cursor(cur) query = """\ - SELECT %s, - (select id from snapshot where object_id = snapshot_id) - as snapshot + SELECT %s FROM origin_visit WHERE origin = %%s AND visit = %%s - """ % (', '.join(self.origin_visit_get_cols[:-1])) + """ % (', '.join(self.origin_visit_get_cols)) cur.execute(query, (origin_id, visit_id)) r = cur.fetchall() @@ -425,15 +416,14 @@ (tuple(allowed_statuses),)).decode() query = """\ - SELECT %s, - (select id from snapshot where object_id = snapshot_id) - as snapshot + SELECT %s FROM origin_visit + INNER JOIN snapshot ON (origin_visit.snapshot=snapshot.id) WHERE - origin = %%s AND snapshot_id is not null %s + origin = %%s AND snapshot is not null %s ORDER BY date DESC, visit DESC LIMIT 1 - """ % (', '.join(self.origin_visit_get_cols[:-1]), extra_clause) + """ % (', '.join(self.origin_visit_get_cols), extra_clause) cur.execute(query, (origin_id,)) r = cur.fetchone() diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -12,7 +12,7 @@ -- latest schema version insert into dbversion(version, release, description) - values(131, now(), 'Work In Progress'); + values(133, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); @@ -211,7 +211,6 @@ date timestamptz not null, status origin_visit_status not null, metadata jsonb, - snapshot_id bigint, snapshot sha1_git ); @@ -220,7 +219,6 @@ comment on column origin_visit.date is 'Visit timestamp'; comment on column origin_visit.status is 'Visit result'; comment on column origin_visit.metadata is 'Origin metadata at visit time'; -comment on column origin_visit.snapshot_id is 'Origin snapshot at visit time'; comment on column origin_visit.snapshot is 'Origin snapshot at visit time'; diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql --- a/swh/storage/sql/40-swh-func.sql +++ b/swh/storage/sql/40-swh-func.sql @@ -782,18 +782,6 @@ group by target_type; $$; -create or replace function swh_snapshot_get_by_origin_visit(origin_id bigint, visit_id bigint) - returns snapshot.id%type - language sql - stable -as $$ - select snapshot.id - from origin_visit - left join snapshot - on snapshot.object_id = origin_visit.snapshot_id - where origin_visit.origin=origin_id and origin_visit.visit=visit_id; -$$; - -- Absolute path: directory reference + complete path relative to it create type content_dir as ( directory sha1_git, diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql --- a/swh/storage/sql/60-swh-indexes.sql +++ b/swh/storage/sql/60-swh-indexes.sql @@ -135,9 +135,6 @@ alter table origin_visit add constraint origin_visit_origin_fkey foreign key (origin) references origin(id) not valid; alter table origin_visit validate constraint origin_visit_origin_fkey; -alter table origin_visit add constraint origin_visit_snapshot_id_fkey foreign key (snapshot_id) references snapshot(object_id) not valid; -alter table origin_visit validate constraint origin_visit_snapshot_id_fkey; - -- release create unique index concurrently release_pkey on release(id); alter table release add primary key using index release_pkey; diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -1293,6 +1293,28 @@ ('origin_visit', data4), ('origin_visit', data5)]) + def test_origin_visit_update_missing_snapshot(self): + # given + origin_id = self.storage.origin_add_one(self.origin) + + origin_visit = self.storage.origin_visit_add( + origin_id, + date=self.date_visit1) + + # when + self.storage.origin_visit_update( + origin_id, origin_visit['visit'], + snapshot=self.snapshot['id']) + + # then + actual_origin_visit = self.storage.origin_visit_get_by( + origin_visit['origin'], origin_visit['visit']) + self.assertEqual(actual_origin_visit['snapshot'], self.snapshot['id']) + + # when + self.storage.snapshot_add(self.snapshot) + self.assertEqual(actual_origin_visit['snapshot'], self.snapshot['id']) + def test_origin_visit_get_by(self): origin_id = self.storage.origin_add_one(self.origin2) origin_id2 = self.storage.origin_add_one(self.origin) @@ -1867,6 +1889,59 @@ self.assertEqual(self.complete_snapshot, self.storage.snapshot_get_latest(origin_id)) + def test_snapshot_get_latest__missing_snapshot(self): + origin_id = self.storage.origin_add_one(self.origin) + origin_visit1 = self.storage.origin_visit_add(origin_id, + self.date_visit1) + visit1_id = origin_visit1['visit'] + origin_visit2 = self.storage.origin_visit_add(origin_id, + self.date_visit2) + visit2_id = origin_visit2['visit'] + + # Two visits, both with no snapshot: latest snapshot is None + self.assertIsNone(self.storage.snapshot_get_latest(origin_id)) + + # Add unknown snapshot to visit1, latest snapshot = None + self.storage.origin_visit_update( + origin_id, visit1_id, snapshot=self.complete_snapshot['id']) + self.assertIsNone(self.storage.snapshot_get_latest(origin_id)) + + # Status filter: both visits are status=ongoing, so no snapshot + # returned + self.assertIsNone( + self.storage.snapshot_get_latest(origin_id, + allowed_statuses=['full']) + ) + + # Mark the first visit as completed and check status filter again + self.storage.origin_visit_update(origin_id, visit1_id, status='full') + self.assertIsNone( + self.storage.snapshot_get_latest(origin_id, + allowed_statuses=['full']), + ) + + # Actually add the snapshot and check status filter again + self.storage.snapshot_add(self.complete_snapshot) + self.assertEqual( + self.complete_snapshot, + self.storage.snapshot_get_latest(origin_id) + ) + + # Add unknown snapshot to visit2 and check that the old snapshot + # is still returned + self.storage.origin_visit_update( + origin_id, visit2_id, snapshot=self.empty_snapshot['id']) + self.assertEqual( + self.complete_snapshot, + self.storage.snapshot_get_latest(origin_id)) + + # Actually add that snapshot and check that the new one is returned + self.storage.snapshot_add(self.empty_snapshot) + self.assertEqual( + self.empty_snapshot, + self.storage.snapshot_get_latest(origin_id) + ) + def test_snapshot_get_latest__legacy_add(self): origin_id = self.storage.origin_add_one(self.origin) origin_visit1 = self.storage.origin_visit_add(origin_id,