diff --git a/sql/archiver/swh-archiver-data.sql b/sql/archiver/swh-archiver-data.sql
index 38364e09..816a40aa 100644
--- a/sql/archiver/swh-archiver-data.sql
+++ b/sql/archiver/swh-archiver-data.sql
@@ -1,2 +1,3 @@
+INSERT INTO archive(id) VALUES('uffizi');
INSERT INTO archive(id) VALUES('banco');
INSERT INTO archive(id) VALUES('azure');
diff --git a/sql/archiver/swh-archiver-schema.sql b/sql/archiver/swh-archiver-schema.sql
index fffd27be..82285f01 100644
--- a/sql/archiver/swh-archiver-schema.sql
+++ b/sql/archiver/swh-archiver-schema.sql
@@ -1,59 +1,151 @@
-- In order to archive the content of the object storage, add
-- some tables to keep track of what has already been archived.
create table dbversion
(
version int primary key,
release timestamptz,
description text
);
comment on table dbversion is 'Schema update tracking';
INSERT INTO dbversion(version, release, description)
-VALUES(5, now(), 'Work In Progress');
+VALUES(6, now(), 'Work In Progress');
CREATE TABLE archive (
id text PRIMARY KEY
);
comment on table archive is 'Possible archives';
comment on column archive.id is 'Short identifier for the archive';
CREATE TYPE archive_status AS ENUM (
'missing',
'ongoing',
'present',
'corrupted'
);
comment on type archive_status is 'Status of a given archive';
-- a SHA1 checksum (not necessarily originating from Git)
CREATE DOMAIN sha1 AS bytea CHECK (LENGTH(VALUE) = 20);
+-- a bucket for which we count items
+CREATE DOMAIN bucket AS bytea CHECK (LENGTH(VALUE) = 2);
+
CREATE TABLE content_archive (
content_id sha1 primary key,
copies jsonb,
num_present int default null
);
create index on content_archive(num_present);
comment on table content_archive is 'Referencing the status and whereabouts of a content';
comment on column content_archive.content_id is 'content identifier';
comment on column content_archive.copies is 'map archive_id -> { "status": archive_status, "mtime": epoch timestamp }';
comment on column content_archive.num_present is 'Number of copies marked as present (cache updated via trigger)';
+CREATE TABLE content_archive_counts (
+ archive text not null references archive(id),
+ bucket bucket not null,
+ count bigint,
+ primary key (archive, bucket)
+);
+
+comment on table content_archive_counts is 'Bucketed count of archive contents';
+comment on column content_archive_counts.archive is 'the archive for which we''re counting';
+comment on column content_archive_counts.bucket is 'the bucket of items we''re counting';
+comment on column content_archive_counts.count is 'the number of items counted in the given bucket';
+
-- Keep the num_copies cache updated
CREATE FUNCTION update_num_present() RETURNS TRIGGER AS $$
BEGIN
NEW.num_present := (select count(*) from jsonb_each(NEW.copies) where value->>'status' = 'present');
RETURN new;
END;
$$ LANGUAGE PLPGSQL;
CREATE TRIGGER update_num_present
BEFORE INSERT OR UPDATE OF copies ON content_archive
FOR EACH ROW
EXECUTE PROCEDURE update_num_present();
+
+CREATE FUNCTION count_copies(from_id bytea, to_id bytea) returns void language sql as $$
+ with sample as (
+ select content_id, copies from content_archive
+ where content_id > from_id and content_id <= to_id
+ ), data as (
+ select substring(content_id from 19) as bucket, jbe.key as archive
+ from sample
+ join lateral jsonb_each(copies) jbe on true
+ where jbe.value->>'status' = 'present'
+ ), bucketed as (
+ select bucket, archive, count(*) as count
+ from data
+ group by bucket, archive
+ ) update content_archive_counts cac set
+ count = cac.count + bucketed.count
+ from bucketed
+ where cac.archive = bucketed.archive and cac.bucket = bucketed.bucket;
+$$;
+
+comment on function count_copies(bytea, bytea) is 'Count the objects between from_id and to_id, add the results to content_archive_counts';
+
+CREATE FUNCTION init_content_archive_counts() returns void language sql as $$
+ insert into content_archive_counts (
+ select id, decode(lpad(to_hex(bucket), 4, '0'), 'hex')::bucket as bucket, 0 as count
+ from archive join lateral generate_series(0, 65535) bucket on true
+ ) on conflict (archive, bucket) do nothing;
+$$;
+
+comment on function init_content_archive_counts() is 'Initialize the content archive counts for the registered archives';
+
+-- keep the content_archive_counts updated
+CREATE FUNCTION update_content_archive_counts() RETURNS TRIGGER LANGUAGE PLPGSQL AS $$
+ DECLARE
+ content_id sha1;
+ content_bucket bucket;
+ copies record;
+ old_row content_archive;
+ new_row content_archive;
+ BEGIN
+ -- default values for old or new row depending on trigger type
+ if tg_op = 'INSERT' then
+ old_row := (null::sha1, '{}'::jsonb, 0);
+ else
+ old_row := old;
+ end if;
+ if tg_op = 'DELETE' then
+ new_row := (null::sha1, '{}'::jsonb, 0);
+ else
+ new_row := new;
+ end if;
+
+ -- get the content bucket
+ content_id := coalesce(old_row.content_id, new_row.content_id);
+ content_bucket := substring(content_id from 19)::bucket;
+
+ -- compare copies present in old and new row for each archive type
+ FOR copies IN
+ select coalesce(o.key, n.key) as archive, o.value->>'status' as old_status, n.value->>'status' as new_status
+ from jsonb_each(old_row.copies) o full outer join lateral jsonb_each(new_row.copies) n on o.key = n.key
+ LOOP
+ -- the count didn't change
+ CONTINUE WHEN copies.old_status is not distinct from copies.new_status OR
+ (copies.old_status != 'present' AND copies.new_status != 'present');
+
+ update content_archive_counts cac
+ set count = count + (case when copies.old_status = 'present' then -1 else 1 end)
+ where archive = copies.archive and bucket = content_bucket;
+ END LOOP;
+ return null;
+ END;
+$$;
+
+create trigger update_content_archive_counts
+ AFTER INSERT OR UPDATE OR DELETE ON content_archive
+ FOR EACH ROW
+ EXECUTE PROCEDURE update_content_archive_counts();
diff --git a/sql/archiver/upgrades/006.sql b/sql/archiver/upgrades/006.sql
new file mode 100644
index 00000000..d9d1b24c
--- /dev/null
+++ b/sql/archiver/upgrades/006.sql
@@ -0,0 +1,100 @@
+-- SWH DB schema upgrade
+-- from_version: 5
+-- to_version: 6
+-- description: Create a bucketed count of contents in the archive.
+
+INSERT INTO dbversion(version, release, description)
+VALUES(6, now(), 'Work In Progress');
+
+-- a bucket for which we count items
+CREATE DOMAIN bucket AS bytea CHECK (LENGTH(VALUE) = 2);
+
+CREATE TABLE content_archive_counts (
+ archive text not null references archive(id),
+ bucket bucket not null,
+ count bigint,
+ primary key (archive, bucket)
+);
+
+comment on table content_archive_counts is 'Bucketed count of archive contents';
+comment on column content_archive_counts.archive is 'the archive for which we''re counting';
+comment on column content_archive_counts.bucket is 'the bucket of items we''re counting';
+comment on column content_archive_counts.count is 'the number of items counted in the given bucket';
+
+
+CREATE FUNCTION count_copies(from_id bytea, to_id bytea) returns void language sql as $$
+ with sample as (
+ select content_id, copies from content_archive
+ where content_id > from_id and content_id <= to_id
+ ), data as (
+ select substring(content_id from 19) as bucket, jbe.key as archive
+ from sample
+ join lateral jsonb_each(copies) jbe on true
+ where jbe.value->>'status' = 'present'
+ ), bucketed as (
+ select bucket, archive, count(*) as count
+ from data
+ group by bucket, archive
+ ) update content_archive_counts cac set
+ count = cac.count + bucketed.count
+ from bucketed
+ where cac.archive = bucketed.archive and cac.bucket = bucketed.bucket;
+$$;
+
+comment on function count_copies(bytea, bytea) is 'Count the objects between from_id and to_id, add the results to content_archive_counts';
+
+CREATE FUNCTION init_content_archive_counts() returns void language sql as $$
+ insert into content_archive_counts (
+ select id, decode(lpad(to_hex(bucket), 4, '0'), 'hex')::bucket as bucket, 0 as count
+ from archive join lateral generate_series(0, 65535) bucket on true
+ ) on conflict (archive, bucket) do nothing;
+$$;
+
+comment on function init_content_archive_counts() is 'Initialize the content archive counts for the registered archives';
+
+-- keep the content_archive_counts updated
+CREATE FUNCTION update_content_archive_counts() RETURNS TRIGGER LANGUAGE PLPGSQL AS $$
+ DECLARE
+ content_id sha1;
+ content_bucket bucket;
+ copies record;
+ old_row content_archive;
+ new_row content_archive;
+ BEGIN
+ -- default values for old or new row depending on trigger type
+ if tg_op = 'INSERT' then
+ old_row := (null::sha1, '{}'::jsonb, 0);
+ else
+ old_row := old;
+ end if;
+ if tg_op = 'DELETE' then
+ new_row := (null::sha1, '{}'::jsonb, 0);
+ else
+ new_row := new;
+ end if;
+
+ -- get the content bucket
+ content_id := coalesce(old_row.content_id, new_row.content_id);
+ content_bucket := substring(content_id from 19)::bucket;
+
+ -- compare copies present in old and new row for each archive type
+ FOR copies IN
+ select coalesce(o.key, n.key) as archive, o.value->>'status' as old_status, n.value->>'status' as new_status
+ from jsonb_each(old_row.copies) o full outer join lateral jsonb_each(new_row.copies) n on o.key = n.key
+ LOOP
+ -- the count didn't change
+ CONTINUE WHEN copies.old_status is not distinct from copies.new_status OR
+ (copies.old_status != 'present' AND copies.new_status != 'present');
+
+ update content_archive_counts cac
+ set count = count + (case when copies.old_status = 'present' then -1 else 1 end)
+ where archive = copies.archive and bucket = content_bucket;
+ END LOOP;
+ return null;
+ END;
+$$;
+
+create trigger update_content_archive_counts
+ AFTER INSERT OR UPDATE OR DELETE ON content_archive
+ FOR EACH ROW
+ EXECUTE PROCEDURE update_content_archive_counts();
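
Below is a minimal usage sketch for the new counting machinery, assuming the schema and data files above have been loaded; the sha1 value and the mtime in the example are made-up placeholders, not real archive contents.

-- Seed one zeroed counter row per (archive, bucket) pair for every
-- registered archive.
SELECT init_content_archive_counts();

-- Backfill the counters from an existing content_archive table, one id
-- range at a time (from_id is exclusive, to_id is inclusive); the empty
-- bytea sorts before every sha1, so this single call covers the whole
-- keyspace.
SELECT count_copies('\x'::bytea,
                    '\xffffffffffffffffffffffffffffffffffffffff'::bytea);

-- From here on, the update_content_archive_counts trigger keeps the
-- counters in sync with inserts, updates and deletes on content_archive.
INSERT INTO content_archive (content_id, copies)
VALUES ('\x0000000000000000000000000000000000000001',  -- example id only
        '{"uffizi": {"status": "present", "mtime": 1468000000}}');

-- Per-archive totals are the sum of the per-bucket counters.
SELECT cac.archive, sum(cac.count) AS present_copies
FROM content_archive_counts cac
GROUP BY cac.archive;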