Page MenuHomeSoftware Heritage

D3320.diff
No OneTemporary

D3320.diff

diff --git a/sql/upgrades/154.sql b/sql/upgrades/154.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/154.sql
@@ -0,0 +1,35 @@
+-- SWH DB schema upgrade
+-- from_version: 153
+-- to_version: 154
+-- description: make sure swh_snapshot_get_by_id doesn't degenerate into a very large index scan
+
+insert into dbversion(version, release, description)
+ values(154, now(), 'Work In Progress');
+
+create or replace function swh_snapshot_get_by_id(id sha1_git,
+ branches_from bytea default '', branches_count bigint default null,
+ target_types snapshot_target[] default NULL)
+ returns setof snapshot_result
+ language sql
+ stable
+as $$
+ -- with small limits, the "naive" version of this query can degenerate into
+ -- using the deduplication index on snapshot_branch (name, target,
+ -- target_type); The planner happily scans several hundred million rows.
+
+ -- Do the query in two steps: first pull the relevant branches for the given
+ -- snapshot (filtering them by type), then do the limiting. This two-step
+ -- process guides the planner into using the proper index.
+ with filtered_snapshot_branches as (
+ select swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
+ from snapshot_branches
+ inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
+ where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
+ and (target_types is null or target_type = any(target_types))
+ order by name
+ )
+ select snapshot_id, name, target, target_type
+ from filtered_snapshot_branches
+ where name >= branches_from
+ order by name limit branches_count;
+$$;
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(152, now(), 'Work In Progress');
+ values(154, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -650,14 +650,25 @@
language sql
stable
as $$
- select
- swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
- from snapshot_branches
- inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
- where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
- and (target_types is null or target_type = any(target_types))
- and name >= branches_from
- order by name limit branches_count
+ -- with small limits, the "naive" version of this query can degenerate into
+ -- using the deduplication index on snapshot_branch (name, target,
+ -- target_type); The planner happily scans several hundred million rows.
+
+ -- Do the query in two steps: first pull the relevant branches for the given
+ -- snapshot (filtering them by type), then do the limiting. This two-step
+ -- process guides the planner into using the proper index.
+ with filtered_snapshot_branches as (
+ select swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
+ from snapshot_branches
+ inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
+ where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
+ and (target_types is null or target_type = any(target_types))
+ order by name
+ )
+ select snapshot_id, name, target, target_type
+ from filtered_snapshot_branches
+ where name >= branches_from
+ order by name limit branches_count;
$$;
create type snapshot_size as (

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 9:11 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217399

Event Timeline