diff --git a/sql/schema.sql b/sql/schema.sql index 98a2107..1f9808d 100644 --- a/sql/schema.sql +++ b/sql/schema.sql @@ -1,141 +1,195 @@ --- --- Software Heritage - Antelink Data Model --- -- scan of sesi-pv-lc2 data, antelink's backup fresh from 2 years alter table content rename to content_sesi_ante_drama; -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); -- a SHA256 checksum create domain sha256 as bytea check (length(value) = 32); create type content_status as enum ('absent', 'visible', 'hidden'); -- scan of antelink's data from s3 (fresher than content_sesi_ante_drama) create table content_s3 ( sha1 sha1 primary key, path text not null, length bigint not null ); create table content_s3_downloaded ( path text not null, length bigint not null ); create unique index on content_s3_downloaded(path); -- alter table content_s3 alter column sha1 set data type sha1; create table content_sesi ( origin_sha1 sha1 not null, sha1 sha1 not null, sha1_git sha1_git not null, sha256 sha256 not null, length bigint not null, path text not null, corrupted boolean not null, primary key (origin_sha1, path) ); -- create unique index on content_sesi (sha1); create index on content_sesi (sha1) where not corrupted; -- alter table content_sesi_all drop constraint content_sesi_all_pkey; -- alter table content_sesi_all add primary key(origin_sha1, path); -- consider all s3 sha1 that are not already in sesi (if not corrupted) create materialized view content_s3_not_in_sesi as select sha1 from content_s3 as s3 where not exists (select 1 from content_sesi as sesi where s3.sha1 = sesi.sha1 and not corrupted); -- 16609944 create unique index on content_s3_not_in_sesi(sha1); -- consider from that all sha1 not already in swh create materialized view content_s3_not_in_sesi_nor_in_swh as select sha1 from content_s3_not_in_sesi as s3 where not exists (select 1 from content as swh where s3.sha1 = swh.sha1); -- 12637533 -- consider only not corrupted sha1 in sesi not already present in swh create materialized view content_sesi_not_in_swh as select sha1 from content_sesi as sesi where not corrupted and not exists (select 1 from content as swh where sesi.sha1 = swh.sha1); -- 190139056 SELECT sum(s3.length) FROM content_s3_not_in_sesi_nor_in_swh s3_not INNER JOIN content_s3 s3 on s3_not.sha1 = s3.sha1; -- 13594449732809 -> ~12.364080005508185 Tib -- estimates of files with size >= 100Mib select sum(length) from content_sesi where not corrupted and length >= 104857600; -- 12490817718209 b = 11.360332535522502 Tib + +select sum(length) +from content_sesi_not_in_swh notinswh +inner join content_sesi sesi +on notinswh.sha1=sesi.sha1 +where not sesi.corrupted; +-- 44300076550806 b = 40.290684911093194 Tib --> way too much. Turns out that there are ~4M duplicates in content_sesi_not_in_swh view + +-- +create view content_sesi_unique_and_not_corrupted +as select distinct sha1 from content_sesi where not corrupted; + +create unique index content_s3_not_in_sesi_nor_in_swh (path); + + +-- the ones to one download again +-- select path from content_s3_downloaded + +-- the ones to filter with `content_s3_downloaded` +-- select sha1 from content_s3_not_in_sesi_nor_in_swh + +with recursive sha1_from_path(p) as ( + select sha1 + from content_s3_not_in_sesi_nor_in_swh + join content_s3 s3 using (sha1) + where s3.path = p +) +select sha1 from sha1_from_path('0/0/0/0/0/2/c/0/000002c0442fe8271ad253171839d86053e6801b.gz'); + +-- take 1 +create materialized view content_s3_not_in_sesi_nor_in_swh_final +as select sha1 + from content_s3_not_in_sesi_nor_in_swh + join content_s3 s3 using (sha1) + where s3.path not in (select path from content_s3_downloaded s3d where s3d.path = s3.path); + +explain(select sha1 + from content_s3_not_in_sesi_nor_in_swh + join content_s3 s3 using (sha1) + where s3.path not in (select path from content_s3_downloaded s3d where s3d.path = s3.path)); + +-- take 2 +create materialized view content_s3_not_in_sesi_nor_in_swh_final +as select sha1 + from content_s3_not_in_sesi_nor_in_swh + join content_s3 s3 using (sha1) + where not exists (select 1 from content_s3_downloaded s3d where s3d.path = s3.path); + + +explain(select sha1 +from content_s3_not_in_sesi_nor_in_swh +join content_s3 s3 using (sha1) +where not exists (select 1 from content_s3_downloaded s3d where s3d.path = s3.path)); + -- Find the maximal length on content_sesi select max(length) from content_sesi; -- 5659069799 b = 5.2704194551333785 Gib -- -- First computation -- create materialized view content_sesi_old_not_in_s3 as select sha1 from content_sesi_ante_drama sesi where not exists (select 1 from content_s3 s3 where sesi.sha1 = s3.sha1); -- -- Create content present on s3 and not on sesi (could be present in -- swh though)... create materialized view content_s3_not_in_sesi_old as select sha1, path from content_s3 as s3 where not exists (select 1 from content_sesi_ante_drama as sesi where s3.sha1 = sesi.sha1); -- 741797 create materialized view content_s3_not_in_sesi_nor_in_swh_old as select sha1, path from content_s3_not_in_sesi_old as s3 where not exists (select 1 from content as swh where s3.sha1 = swh.sha1); -- 46 create materialized view content_sesi_not_in_swh_old as select sha1 from content_sesi_ante_drama as sesi where not exists (select 1 from content as swh where sesi.sha1 = swh.sha1); -- 207095510