diff --git a/sql/clusters.dot b/sql/clusters.dot index d22b50f4..dec8dc4f 100644 --- a/sql/clusters.dot +++ b/sql/clusters.dot @@ -1,91 +1,84 @@ subgraph "logical_grouping" { style = rounded; bgcolor = gray95; color = gray; subgraph cluster_meta { label = <meta>; dbversion; } subgraph cluster_content { label = <content>; content; } subgraph cluster_directory { label = <directories>; directory; directory_entry_dir; directory_entry_file; directory_entry_rev; - directory_list_dir; - directory_list_file; - directory_list_rev; - {rank=same; - directory_list_dir - directory_list_file - directory_list_rev} {rank=same; directory_entry_dir directory_entry_file directory_entry_rev} } subgraph cluster_revision { label = <revisions>; revision; revision_history; person; } subgraph cluster_release { label = <releases>; release; } subgraph cluster_occurrences { label = <occurrences>; occurrence; occurrence_history; } subgraph cluster_origins { label = <origins>; origin; fetch_history; } subgraph cluster_projects { label = <projects>; project; project_history; } subgraph cluster_organization { label = <organizations>; organization; list_history; } { edge [style = dashed]; # "rtcolN" identifies the N-th row in a table, as a source # "ltcolN" identifies the N-th row in a table, as a destination "directory_entry_dir":rtcol2 -> "directory":ltcol1; "directory_entry_file":rtcol2 -> "content":ltcol2; "directory_entry_rev":rtcol2 -> "revision":ltcol1; - "directory_list_dir":rtcol2 -> "directory_entry_dir":ltcol1; - "directory_list_file":rtcol2 -> "directory_entry_file":ltcol1; - "directory_list_rev":rtcol2 -> "directory_entry_rev":ltcol1; + "directory":rtcol2 -> "directory_entry_dir":ltcol1; + "directory":rtcol3 -> "directory_entry_file":ltcol1; + "directory":rtcol4 -> "directory_entry_rev":ltcol1; "occurrence":rtcol3 -> "revision":ltcol1; "occurrence_history":rtcol3 -> "revision":ltcol1; "release":rtcol2 -> "revision":ltcol1; "revision":rtcol7 -> "directory":ltcol1; "revision_history":rtcol2 -> "revision":ltcol1; } } diff --git a/sql/swh-func.sql b/sql/swh-func.sql index d615b7db..9335ac41 100644 --- a/sql/swh-func.sql +++ b/sql/swh-func.sql @@ -1,698 +1,724 @@ -- create a temporary table called tmp_TBLNAME, mimicking existing table -- TBLNAME -- -- Args: -- tblname: name of the table to mimick create or replace function swh_mktemp(tblname regclass) returns void language plpgsql as $$ begin execute format(' create temporary table tmp_%I (like %I including defaults) on commit drop ', tblname, tblname); return; end $$; -- create a temporary table for directory entries called tmp_TBLNAME, -- mimicking existing table TBLNAME with an extra dir_id (sha1_git) -- column, and dropping the id column. -- -- This is used to create the tmp_directory_entry_ tables. 
-- -- Args: -- tblname: name of the table to mimick create or replace function swh_mktemp_dir_entry(tblname regclass) returns void language plpgsql as $$ begin execute format(' create temporary table tmp_%I (like %I including defaults, dir_id sha1_git) on commit drop; alter table tmp_%I drop column id; ', tblname, tblname, tblname, tblname); return; end $$; -- create a temporary table for revisions called tmp_revisions, -- mimicking existing table revision, replacing the foreign keys to -- people with an email and name field -- create or replace function swh_mktemp_revision() returns void language plpgsql as $$ begin create temporary table tmp_revision ( like revision including defaults, author_name text not null default '', author_email text not null default '', committer_name text not null default '', committer_email text not null default '' ) on commit drop; alter table tmp_revision drop column author; alter table tmp_revision drop column committer; return; end $$; -- create a temporary table for releases called tmp_release, -- mimicking existing table release, replacing the foreign keys to -- people with an email and name field -- create or replace function swh_mktemp_release() returns void language plpgsql as $$ begin create temporary table tmp_release ( like release including defaults, author_name text not null default '', author_email text not null default '' ) on commit drop; alter table tmp_release drop column author; return; end $$; -- a content signature is a set of cryptographic checksums that we use to -- uniquely identify content, for the purpose of verifying if we already have -- some content or not during content injection create type content_signature as ( sha1 sha1, sha1_git sha1_git, sha256 sha256 ); -- check which entries of tmp_content are missing from content -- -- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content, -- 2. call this function create or replace function swh_content_missing() returns setof content_signature language plpgsql as $$ begin return query select sha1, sha1_git, sha256 from tmp_content except select sha1, sha1_git, sha256 from content; return; end $$; -- check which entries of tmp_skipped_content are missing from skipped_content -- -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content, -- 2. call this function create or replace function swh_skipped_content_missing() returns setof content_signature language plpgsql as $$ begin return query select sha1, sha1_git, sha256 from tmp_skipped_content where not exists (select 1 from skipped_content s where sha1 is not distinct from s.sha1 and sha1_git is not distinct from s.sha1_git and sha256 is not distinct from s.sha256); return; end $$; -- Look up content based on one or several different checksums. Return all -- content information if the content is found; a NULL row otherwise. -- -- At least one checksum should be not NULL. If several are not NULL, they will -- be AND-ed together in the lookup query. 
-- -- Note: this function is meant to be used to look up individual contents -- (e.g., for the web app), for batch lookup of missing content (e.g., to be -- added) see swh_content_missing create or replace function swh_content_find( sha1 sha1 default NULL, sha1_git sha1_git default NULL, sha256 sha256 default NULL ) returns content language plpgsql as $$ declare con content; filters text[] := array[] :: text[]; -- AND-clauses used to filter content q text; begin if sha1 is not null then filters := filters || format('sha1 = %L', sha1); end if; if sha1_git is not null then filters := filters || format('sha1_git = %L', sha1_git); end if; if sha256 is not null then filters := filters || format('sha256 = %L', sha256); end if; if cardinality(filters) = 0 then return null; else q = format('select * from content where %s', array_to_string(filters, ' and ')); execute q into con; return con; end if; end $$; -- add tmp_content entries to content, skipping duplicates -- -- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content, -- 2. call this function create or replace function swh_content_add() returns void language plpgsql as $$ begin insert into content (sha1, sha1_git, sha256, length, status) select distinct sha1, sha1_git, sha256, length, status from tmp_content where (sha1, sha1_git, sha256) in (select * from swh_content_missing()); -- TODO XXX use postgres 9.5 "UPSERT" support here, when available. -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid -- the extra swh_content_missing() query here. return; end $$; -- add tmp_skipped_content entries to skipped_content, skipping duplicates -- -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content, -- 2. call this function create or replace function swh_skipped_content_add() returns void language plpgsql as $$ begin insert into skipped_content (sha1, sha1_git, sha256, length, status, reason, origin) select distinct sha1, sha1_git, sha256, length, status, reason, origin from tmp_skipped_content where (coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')) in (select coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '') from swh_skipped_content_missing()); -- TODO XXX use postgres 9.5 "UPSERT" support here, when available. -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid -- the extra swh_content_missing() query here. return; end $$; -- check which entries of tmp_directory are missing from directory -- -- operates in bulk: 0. swh_mktemp(directory), 1. COPY to tmp_directory, -- 2. call this function create or replace function swh_directory_missing() returns setof directory language plpgsql as $$ begin return query select id from tmp_directory except select id from directory; return; end $$; -- Add tmp_directory_entry_dir entries to directory_entry_dir and -- directory_list_dir, skipping duplicates in directory_entry_dir. -- -- operates in bulk: 0. swh_mktemp_dir_entry('directory_entry_dir'), 1 COPY to -- tmp_directory_entry_dir, 2. call this function +-- +-- Assumption: this function is used in the same transaction that inserts the +-- context directory in table "directory". 
+--
+-- TODO: refactor with other swh_directory_entry_*_add functions
 create or replace function swh_directory_entry_dir_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into directory_entry_dir (target, name, perms, atime, mtime, ctime)
     select distinct t.target, t.name, t.perms, t.atime, t.mtime, t.ctime
     from tmp_directory_entry_dir t
     where not exists (
         select 1
         from directory_entry_dir i
         where t.target = i.target and t.name = i.name and t.perms = i.perms and
               t.atime is not distinct from i.atime and
               t.mtime is not distinct from i.mtime and
               t.ctime is not distinct from i.ctime);

-    insert into directory_list_dir (dir_id, entry_ids)
-    select t.dir_id, array_agg(i.id)
-    from tmp_directory_entry_dir t
-    inner join directory_entry_dir i
-    on t.target = i.target and t.name = i.name and t.perms = i.perms and
-       t.atime is not distinct from i.atime and
-       t.mtime is not distinct from i.mtime and
-       t.ctime is not distinct from i.ctime
-    group by t.dir_id;
+    with new_entries as (
+        select t.dir_id, array_agg(i.id) as entries
+        from tmp_directory_entry_dir t
+        inner join directory_entry_dir i
+        on t.target = i.target and t.name = i.name and t.perms = i.perms and
+           t.atime is not distinct from i.atime and
+           t.mtime is not distinct from i.mtime and
+           t.ctime is not distinct from i.ctime
+        group by t.dir_id
+    )
+    update directory as d
+    set dir_entries = new_entries.entries
+    from new_entries
+    where d.id = new_entries.dir_id;
+
     return;
 end
 $$;

 -- Add tmp_directory_entry_file entries to directory_entry_file and
 -- directory_list_file, skipping duplicates in directory_entry_file.
 --
 -- operates in bulk: 0. swh_mktemp_dir_entry('directory_entry_file'), 1 COPY to
 -- tmp_directory_entry_file, 2. call this function
+--
+-- Assumption: this function is used in the same transaction that inserts the
+-- context directory in table "directory".
+--
+-- TODO: refactor with other swh_directory_entry_*_add functions
 create or replace function swh_directory_entry_file_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into directory_entry_file (target, name, perms, atime, mtime, ctime)
     select distinct t.target, t.name, t.perms, t.atime, t.mtime, t.ctime
     from tmp_directory_entry_file t
     where not exists (
         select 1
         from directory_entry_file i
         where t.target = i.target and t.name = i.name and t.perms = i.perms and
               t.atime is not distinct from i.atime and
               t.mtime is not distinct from i.mtime and
               t.ctime is not distinct from i.ctime);

-    insert into directory_list_file (dir_id, entry_ids)
-    select t.dir_id, array_agg(i.id)
-    from tmp_directory_entry_file t
-    inner join directory_entry_file i
-    on t.target = i.target and t.name = i.name and t.perms = i.perms and
-       t.atime is not distinct from i.atime and
-       t.mtime is not distinct from i.mtime and
-       t.ctime is not distinct from i.ctime
-    group by t.dir_id;
+    with new_entries as (
+        select t.dir_id, array_agg(i.id) as entries
+        from tmp_directory_entry_file t
+        inner join directory_entry_file i
+        on t.target = i.target and t.name = i.name and t.perms = i.perms and
+           t.atime is not distinct from i.atime and
+           t.mtime is not distinct from i.mtime and
+           t.ctime is not distinct from i.ctime
+        group by t.dir_id
+    )
+    update directory as d
+    set file_entries = new_entries.entries
+    from new_entries
+    where d.id = new_entries.dir_id;
+
     return;
 end
 $$;

 -- Add tmp_directory_entry_rev entries to directory_entry_rev and
 -- directory_list_rev, skipping duplicates in directory_entry_rev.
 --
 -- operates in bulk: 0. swh_mktemp_dir_entry('directory_entry_rev'), 1 COPY to
 -- tmp_directory_entry_rev, 2. call this function
+--
+-- Assumption: this function is used in the same transaction that inserts the
+-- context directory in table "directory".
+--
+-- TODO: refactor with other swh_directory_entry_*_add functions
 create or replace function swh_directory_entry_rev_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into directory_entry_rev (target, name, perms, atime, mtime, ctime)
     select distinct t.target, t.name, t.perms, t.atime, t.mtime, t.ctime
     from tmp_directory_entry_rev t
     where not exists (
         select 1
         from directory_entry_rev i
         where t.target = i.target and t.name = i.name and t.perms = i.perms and
               t.atime is not distinct from i.atime and
               t.mtime is not distinct from i.mtime and
               t.ctime is not distinct from i.ctime);

-    insert into directory_list_rev (dir_id, entry_ids)
-    select t.dir_id, array_agg(i.id)
-    from tmp_directory_entry_rev t
-    inner join directory_entry_rev i
-    on t.target = i.target and t.name = i.name and t.perms = i.perms and
-       t.atime is not distinct from i.atime and
-       t.mtime is not distinct from i.mtime and
-       t.ctime is not distinct from i.ctime
-    group by t.dir_id;
+    with new_entries as (
+        select t.dir_id, array_agg(i.id) as entries
+        from tmp_directory_entry_rev t
+        inner join directory_entry_rev i
+        on t.target = i.target and t.name = i.name and t.perms = i.perms and
+           t.atime is not distinct from i.atime and
+           t.mtime is not distinct from i.mtime and
+           t.ctime is not distinct from i.ctime
+        group by t.dir_id
+    )
+    update directory as d
+    set rev_entries = new_entries.entries
+    from new_entries
+    where d.id = new_entries.dir_id;
+
     return;
 end
 $$;

 create type directory_entry_type as enum('file', 'dir', 'rev');

 -- a directory listing entry with all the metadata
 --
 -- can be used to list a directory, and retrieve all the data in one go.
create type directory_entry as ( dir_id sha1_git, -- id of the parent directory type directory_entry_type, -- type of entry target sha1_git, -- id of target name unix_path, -- path name, relative to containing dir perms file_perms, -- unix-like permissions atime timestamptz, -- time of last access mtime timestamptz, -- time of last modification ctime timestamptz -- time of last status change ); -- List a single level of directory walked_dir_id create or replace function swh_directory_walk_one(walked_dir_id sha1_git) returns setof directory_entry language plpgsql as $$ begin - return query ( - (with l as - (select dir_id, unnest(entry_ids) as entry_id - from directory_list_dir - where dir_id = walked_dir_id) - select dir_id, 'dir'::directory_entry_type as type, - target, name, perms, atime, mtime, ctime - from l - left join directory_entry_dir d on l.entry_id = d.id) - union - (with l as - (select dir_id, unnest(entry_ids) as entry_id - from directory_list_file - where dir_id = walked_dir_id) - select dir_id, 'file'::directory_entry_type as type, - target, name, perms, atime, mtime, ctime - from l - left join directory_entry_file d on l.entry_id = d.id) - union - (with l as - (select dir_id, unnest(entry_ids) as entry_id - from directory_list_rev - where dir_id = walked_dir_id) - select dir_id, 'rev'::directory_entry_type as type, - target, name, perms, atime, mtime, ctime - from l - left join directory_entry_rev d on l.entry_id = d.id) - ) order by name; + return query + with dir as ( + select id as dir_id, dir_entries, file_entries, rev_entries + from directory + where id = walked_dir_id), + ls_d as (select dir_id, unnest(dir_entries) as entry_id from dir), + ls_f as (select dir_id, unnest(file_entries) as entry_id from dir), + ls_r as (select dir_id, unnest(rev_entries) as entry_id from dir) + (select dir_id, 'dir'::directory_entry_type as type, + target, name, perms, atime, mtime, ctime + from ls_d + left join directory_entry_dir d on ls_d.entry_id = d.id) + union + (select dir_id, 'file'::directory_entry_type as type, + target, name, perms, atime, mtime, ctime + from ls_f + left join directory_entry_file d on ls_f.entry_id = d.id) + union + (select dir_id, 'rev'::directory_entry_type as type, + target, name, perms, atime, mtime, ctime + from ls_r + left join directory_entry_rev d on ls_r.entry_id = d.id) + order by name; return; end $$; -- List all revision IDs starting from a given revision, going back in time -- -- TODO ordering: should be breadth-first right now (what do we want?) -- TODO ordering: ORDER BY parent_rank somewhere? create or replace function swh_revision_list(root_revision sha1_git) returns setof sha1_git language plpgsql as $$ begin return query with recursive rev_list(id) as ( (select id from revision where id = root_revision) union (select parent_id from revision_history as h join rev_list on h.id = rev_list.id) ) select * from rev_list; return; end $$; -- Detailed entry in a revision log create type revision_log_entry as ( id sha1_git, date timestamptz, date_offset smallint, committer_date timestamptz, committer_date_offset smallint, type revision_type, directory sha1_git, message bytea, author_name text, author_email text, committer_name text, committer_email text ); -- "git style" revision log. 
Similar to swh_revision_list(), but returning all -- information associated to each revision, and expanding authors/committers create or replace function swh_revision_log(root_revision sha1_git) returns setof revision_log_entry language plpgsql as $$ begin return query select revision.id, date, date_offset, committer_date, committer_date_offset, type, directory, message, author.name as author_name, author.email as author_email, committer.name as committer_name, committer.email as committer_email from swh_revision_list(root_revision) as rev_list join revision on revision.id = rev_list join person as author on revision.author = author.id join person as committer on revision.committer = committer.id; return; end $$; -- List missing revisions from tmp_revision create or replace function swh_revision_missing() returns setof sha1_git language plpgsql as $$ begin return query select id from tmp_revision except select id from revision; return; end $$; -- Create entries in person from tmp_revision create or replace function swh_person_add_from_revision() returns void language plpgsql as $$ begin with t as ( select author_name as name, author_email as email from tmp_revision union select committer_name as name, committer_email as email from tmp_revision ) insert into person (name, email) select distinct name, email from t where not exists ( select 1 from person p where t.name = p.name and t.email = p.email ); return; end $$; -- Create entries in revision from tmp_revision create or replace function swh_revision_add() returns void language plpgsql as $$ begin perform swh_person_add_from_revision(); insert into revision (id, date, date_offset, committer_date, committer_date_offset, type, directory, message, author, committer) select t.id, t.date, t.date_offset, t.committer_date, t.committer_date_offset, t.type, t.directory, t.message, a.id, c.id from tmp_revision t left join person a on a.name = t.author_name and a.email = t.author_email left join person c on c.name = t.committer_name and c.email = t.committer_email; return; end $$; -- List missing releases from tmp_release create or replace function swh_release_missing() returns setof sha1_git language plpgsql as $$ begin return query select id from tmp_release except select id from release; return; end $$; -- Create entries in person from tmp_release create or replace function swh_person_add_from_release() returns void language plpgsql as $$ begin with t as ( select distinct author_name as name, author_email as email from tmp_release ) insert into person (name, email) select name, email from t where not exists ( select 1 from person p where t.name = p.name and t.email = p.email ); return; end $$; -- Create entries in release from tmp_release create or replace function swh_release_add() returns void language plpgsql as $$ begin perform swh_person_add_from_release(); insert into release (id, revision, date, date_offset, name, comment, author) select t.id, t.revision, t.date, t.date_offset, t.name, t.comment, a.id from tmp_release t left join person a on a.name = t.author_name and a.email = t.author_email; return; end $$; -- Absolute path: directory reference + complete path relative to it create type content_dir as ( directory sha1_git, path unix_path ); -- Find the containing directory of a given content, specified by sha1 -- (note: *not* sha1_git). -- -- Return a pair (dir_it, path) where path is a UNIX path that, from the -- directory root, reach down to a file with the desired content. 
-- -- In case of multiple paths (i.e., pretty much always), an arbitrary one is -- chosen. create or replace function swh_content_find_directory(content_id sha1) returns content_dir language plpgsql as $$ declare d content_dir; begin with recursive path as ( -- Recursively build a path from the requested content to a root -- directory. Each iteration returns a pair (dir_id, filename) where -- filename is relative to dir_id. Stops when no parent directory can -- be found. (select dir.id as dir_id, dir_entry_f.name as name, 0 as depth from directory_entry_file as dir_entry_f join content on content.sha1_git = dir_entry_f.target - join directory_list_file as ls_f on ls_f.entry_ids @> array[dir_entry_f.id] - join directory as dir on ls_f.dir_id = dir.id + join directory as dir on dir.file_entries @> array[dir_entry_f.id] where content.sha1 = content_id limit 1) union all (select dir.id as dir_id, (dir_entry_d.name || '/' || path.name)::unix_path as name, path.depth + 1 from path join directory_entry_dir as dir_entry_d on dir_entry_d.target = path.dir_id - join directory_list_dir as ls_d on ls_d.entry_ids @> array[dir_entry_d.id] - join directory as dir on ls_d.dir_id = dir.id + join directory as dir on dir.dir_entries @> array[dir_entry_d.id] limit 1) ) select dir_id, name from path order by depth desc limit 1 into strict d; return d; end $$; -- Walk the revision history starting from a given revision, until a matching -- occurrence is found. Return all occurrence information. create or replace function swh_revision_find_occurrence(revision_id sha1_git) returns occurrence language plpgsql as $$ declare occ occurrence%ROWTYPE; rev sha1_git; begin -- first check to see if revision_id is already pointed by an occurrence select origin, branch, revision from occurrence_history as occ_hist where occ_hist.revision = revision_id order by upper(occ_hist.validity) -- TODO filter by authority? limit 1 into occ; -- no occurrence point to revision_id, walk up the history if not found then -- recursively walk the history, stopping immediately before a revision -- pointed to by an occurrence. -- TODO find a nicer way to stop at, but *including*, that revision with recursive revlog as ( (select revision_id as rev_id, 0 as depth) union all (select hist.parent_id as rev_id, revlog.depth + 1 from revlog join revision_history as hist on hist.id = revlog.rev_id and not exists(select 1 from occurrence_history where revision = hist.parent_id) limit 1) ) select rev_id from revlog order by depth desc limit 1 into strict rev; -- as we stopped before a pointed by revision, look it up again and -- return its data select origin, branch, revision from revision_history as rev_hist, occurrence_history as occ_hist where rev_hist.id = rev and occ_hist.revision = rev_hist.parent_id order by upper(occ_hist.validity) -- TODO filter by authority? limit 1 into strict occ; -- will fail if no occurrence is found, and that's OK end if; return occ; end $$; -- Occurrence of some content in a given context create type content_occurrence as ( origin_type text, origin_url text, branch text, revision_id sha1_git, path unix_path ); -- Given the sha1 of some content, look up an occurrence that points to a -- revision, which in turns reference (transitively) a tree containing the -- content. Answer the question: "where/when did SWH see a given content"? -- Return information about an arbitrary occurrence/revision/tree, with no -- ordering guarantee whatsoever. 
create or replace function swh_content_find_occurrence(content_id sha1) returns content_occurrence language plpgsql as $$ declare dir content_dir; rev sha1_git; occ occurrence%ROWTYPE; coc content_occurrence; begin -- each step could fail if no results are found, and that's OK select * from swh_content_find_directory(content_id) -- look up directory into strict dir; select id from revision where directory = dir.directory -- look up revision limit 1 into strict rev; select * from swh_revision_find_occurrence(rev) -- look up occurrence into strict occ; select origin.type, origin.url, occ.branch, rev, dir.path from origin where origin.id = occ.origin into strict coc; return coc; end $$; diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql index 3112656b..9b4ab86d 100644 --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -1,364 +1,341 @@ --- --- Software Heritage Data Model --- -- drop schema if exists swh cascade; -- create schema swh; -- set search_path to swh; create table dbversion ( version int primary key, release timestamptz, description text ); insert into dbversion(version, release, description) - values(14, now(), 'Work In Progress'); + values(15, now(), 'Work In Progress'); -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); -- a SHA256 checksum create domain sha256 as bytea check (length(value) = 32); -- UNIX path (absolute, relative, individual path component, etc.) -- TODO should this be bytea or similar to avoid encoding/decoding issues? create domain unix_path as text; -- a set of UNIX-like access permissions, as manipulated by, e.g., chmod create domain file_perms as int; create type content_status as enum ('absent', 'visible', 'hidden'); -- An origin is a place, identified by an URL, where software can be found. We -- support different kinds of origins, e.g., git and other VCS repositories, -- web pages that list tarballs URLs (e.g., http://www.kernel.org), indirect -- tarball URLs (e.g., http://www.example.org/latest.tar.gz), etc. The key -- feature of an origin is that it can be *fetched* (wget, git clone, svn -- checkout, etc.) to retrieve all the contained software. create table origin ( id bigserial primary key, type text, -- TODO use an enum here (?) url text not null ); -- Checksums about actual file content. Note that the content itself is not -- stored in the DB, but on external (key-value) storage. A single checksum is -- used as key there, but the other can be used to verify that we do not inject -- content collisions not knowingly. create table content ( sha1 sha1 primary key, sha1_git sha1_git not null, sha256 sha256 not null, length bigint not null, ctime timestamptz not null default now(), -- creation time, i.e. time of (first) injection into the storage status content_status not null default 'visible' ); create unique index on content(sha1_git); create unique index on content(sha256); -- Content we have seen but skipped for some reason. This table is -- separate from the content table as we might not have the sha1 -- checksum of that data (for instance when we inject git -- repositories, objects that are too big will be skipped here, and we -- will only know their sha1_git). 'reason' contains the reason the -- content was skipped. origin is a nullable column allowing to find -- out which origin contains that skipped content. 
create table skipped_content ( sha1 sha1, sha1_git sha1_git, sha256 sha256, length bigint not null, ctime timestamptz not null default now(), status content_status not null default 'absent', reason text not null, origin bigint references origin(id), unique (sha1, sha1_git, sha256) ); -- those indexes support multiple NULL values. create unique index on skipped_content(sha1); create unique index on skipped_content(sha1_git); create unique index on skipped_content(sha256); -- An organization (or part thereof) that might be in charge of running -- software projects. Examples: Debian, GNU, GitHub, Apache, The Linux -- Foundation. The data model is hierarchical (via parent_id) and might store -- sub-branches of existing organizations. The key feature of an organization -- is that it can be *listed* to retrieve information about its content, i.e: -- sub-organizations, projects, origins. create table organization ( id bigserial primary key, parent_id bigint references organization(id), name text not null, description text, homepage text, list_engine text, -- crawler to be used to org's content list_url text, -- root URL to start the listing list_params json, -- org-specific listing parameter latest_list timestamptz -- last time the org's content has been listed ); -- Log of all organization listings (i.e., organization crawling) that have -- been done in the past, or are still ongoing. Similar to fetch_history, but -- for organizations. create table list_history ( id bigserial primary key, organization bigint references organization(id), date timestamptz not null, status boolean, -- true if and only if the listing has been successful result json, -- more detailed return value, depending on status stdout text, stderr text, duration interval -- fetch duration of NULL if still ongoing ); -- Log of all origin fetches (i.e., origin crawling) that have been done in the -- past, or are still ongoing. Similar to list_history, but for origins. create table fetch_history ( id bigserial primary key, origin bigint references origin(id), date timestamptz not null, status boolean, -- true if and only if the fetch has been successful result json, -- more detailed returned values, times, etc... stdout text, stderr text, -- null when status is true, filled otherwise duration interval -- fetch duration of NULL if still ongoing ); -- A specific software project, e.g., the Linux kernel, Apache httpd. A -- software project is version-less at this level, but is associated to several -- metadata. Metadata can evolve over time, this table only contains the most -- recent version of them; for old versions of project see table -- project_history. create table project ( id bigserial primary key, organization bigint references organization(id), -- the "owning" organization origin bigint references origin(id), -- where to find project releases name text, description text, homepage text, doap jsonb -- other kinds of metadata/software project description ontologies can be -- added here, in addition to DOAP ); -- History of project metadata. Time-sensitive version of the table project. create table project_history ( id bigserial primary key, project bigint references project(id), validity tstzrange, organization bigint references organization(id), origin bigint references origin(id), name text, description text, homepage text, doap jsonb ); -- A file-system directory. A directory is a list of directory entries (see -- tables: directory_entry_{dir,file}). -- -- To list the contents of a directory: --- 1. 
list the contained directory_entry_dir using table directory_list_dir --- 2. list the contained directory_entry_file using table directory_list_file --- 3. list the contained directory_entry_rev using table directory_list_rev +-- 1. list the contained directory_entry_dir using array dir_entries +-- 2. list the contained directory_entry_file using array file_entries +-- 3. list the contained directory_entry_rev using array rev_entries -- 4. UNION -- -- Synonyms/mappings: -- * git: tree create table directory ( - id sha1_git primary key + id sha1_git primary key, + dir_entries bigint[], -- sub-directories, reference directory_entry_dir + file_entries bigint[], -- contained files, reference directory_entry_file + rev_entries bigint[] -- mounted revisions, reference directory_entry_rev ); +create index on directory using gin (dir_entries); +create index on directory using gin (file_entries); +create index on directory using gin (rev_entries); + -- A directory entry pointing to a sub-directory. create table directory_entry_dir ( id bigserial primary key, target sha1_git, -- references directory(id) deferrable initially deferred, -- id of target directory name unix_path, -- path name, relative to containing dir perms file_perms, -- unix-like permissions atime timestamptz, -- time of last access mtime timestamptz, -- time of last modification ctime timestamptz -- time of last status change ); create unique index on directory_entry_dir(target, name, perms, atime, mtime, ctime); create unique index on directory_entry_dir(target, name, perms) where atime is null and mtime is null and ctime is null; --- Mapping between directories and contained sub-directories. -create table directory_list_dir -( - dir_id sha1_git references directory(id), - entry_ids bigint[], - primary key (dir_id) -); - -create index on directory_list_dir using gin (entry_ids); - -- A directory entry pointing to a file. create table directory_entry_file ( id bigserial primary key, target sha1_git, -- id of target file name unix_path, -- path name, relative to containing dir perms file_perms, -- unix-like permissions atime timestamptz, -- time of last access mtime timestamptz, -- time of last modification ctime timestamptz -- time of last status change ); create unique index on directory_entry_file(target, name, perms, atime, mtime, ctime); create unique index on directory_entry_file(target, name, perms) where atime is null and mtime is null and ctime is null; --- Mapping between directories and contained files. -create table directory_list_file -( - dir_id sha1_git references directory(id), - entry_ids bigint[], - primary key (dir_id) -); - -create index on directory_list_file using gin (entry_ids); - -- A directory entry pointing to a revision. create table directory_entry_rev ( id bigserial primary key, target sha1_git, -- id of target revision name unix_path, -- path name, relative to containing dir perms file_perms, -- unix-like permissions atime timestamptz, -- time of last access mtime timestamptz, -- time of last modification ctime timestamptz -- time of last status change ); create unique index on directory_entry_rev(target, name, perms, atime, mtime, ctime); create unique index on directory_entry_rev(target, name, perms) where atime is null and mtime is null and ctime is null; --- Mapping between directories and contained files. 
-create table directory_list_rev -( - dir_id sha1_git references directory(id), - entry_ids bigint[], - primary key (dir_id) -); - -create index on directory_list_rev using gin (entry_ids); - create table person ( id bigserial primary key, name text not null default '', email text not null default '' ); create unique index on person(name, email); create type revision_type as enum ('git', 'tar', 'dsc'); -- A snapshot of a software project at a specific point in time. -- -- Synonyms/mappings: -- * git / subversion / etc: commit -- * tarball: a specific tarball -- -- Revisions are organized as DAGs. Each revision points to 0, 1, or more (in -- case of merges) parent revisions. Each revision points to a directory, i.e., -- a file-system tree containing files and directories. create table revision ( id sha1_git primary key, date timestamptz, date_offset smallint, committer_date timestamptz, committer_date_offset smallint, type revision_type not null, directory sha1_git, -- file-system tree message bytea, author bigint references person(id), committer bigint references person(id) ); -- either this table or the sha1_git[] column on the revision table create table revision_history ( id sha1_git references revision(id), parent_id sha1_git, parent_rank int not null default 0, -- parent position in merge commits, 0-based primary key (id, parent_rank) ); -- The content of software origins is indexed starting from top-level pointers -- called "branches". Every time we fetch some origin we store in this table -- where the branches pointed to at fetch time. -- -- Synonyms/mappings: -- * git: ref (in the "git update-ref" sense) create table occurrence_history ( origin bigint references origin(id), branch text, -- e.g., "master" (for VCS), or "sid" (for Debian) revision sha1_git, -- ref target, e.g., commit id authority bigint references organization(id) not null, -- who is claiming to have seen the occurrence. -- Note: SWH is such an authority, and has an entry in -- the organization table. validity tstzrange, -- The time validity of this table entry. If the upper -- bound is missing, the entry is still valid. exclude using gist (origin with =, branch with =, revision with =, authority with =, validity with &&), -- unicity exclusion constraint on lines where the same value is found for -- `origin`, `reference`, `revision`, `authority` and overlapping values for -- `validity`. primary key (origin, branch, revision, authority, validity) ); -- Materialized view of occurrence_history, storing the *current* value of each -- branch, as last seen by SWH. create table occurrence ( origin bigint references origin(id), branch text, revision sha1_git, primary key(origin, branch, revision) ); -- A "memorable" point in the development history of a project. 
 --
 -- Synonyms/mappings:
 -- * git: tag (of the annotated kind, otherwise they are just references)
 -- * tarball: the release version number
 create table release
 (
   id          sha1_git primary key,
   revision    sha1_git,
   date        timestamptz,
   date_offset smallint,
   name        text,
   comment     bytea,
   author      bigint references person(id)
 );
diff --git a/sql/upgrades/015.sql b/sql/upgrades/015.sql
new file mode 100644
index 00000000..950695ea
--- /dev/null
+++ b/sql/upgrades/015.sql
@@ -0,0 +1,203 @@
+-- SWH DB schema upgrade
+-- from_version: 14
+-- to_version: 15
+-- description: merge directory_list_* tables into directory
+
+alter table directory
+  add column dir_entries  bigint[],
+  add column file_entries bigint[],
+  add column rev_entries  bigint[];
+
+with ls as (
+    -- we need an explicit sub-query here, because left joins aren't allowed in
+    -- update from_list
+    select id,
+           ls_d.entry_ids as dir_entries,
+           ls_f.entry_ids as file_entries,
+           ls_r.entry_ids as rev_entries
+    from directory as d
+    left join directory_list_dir  as ls_d on ls_d.dir_id = d.id
+    left join directory_list_file as ls_f on ls_f.dir_id = d.id
+    left join directory_list_rev  as ls_r on ls_r.dir_id = d.id
+)
+update directory
+    set dir_entries  = ls.dir_entries,
+        file_entries = ls.file_entries,
+        rev_entries  = ls.rev_entries
+    from ls
+    where ls.id = directory.id;
+
+create index on directory using gin (dir_entries);
+create index on directory using gin (file_entries);
+create index on directory using gin (rev_entries);
+
+drop table directory_list_dir;
+drop table directory_list_file;
+drop table directory_list_rev;
+
+create or replace function swh_directory_entry_dir_add()
+    returns void
+    language plpgsql
+as $$
+begin
+    insert into directory_entry_dir (target, name, perms, atime, mtime, ctime)
+    select distinct t.target, t.name, t.perms, t.atime, t.mtime, t.ctime
+    from tmp_directory_entry_dir t
+    where not exists (
+        select 1
+        from directory_entry_dir i
+        where t.target = i.target and t.name = i.name and t.perms = i.perms and
+              t.atime is not distinct from i.atime and
+              t.mtime is not distinct from i.mtime and
+              t.ctime is not distinct from i.ctime);
+
+    with new_entries as (
+        select t.dir_id, array_agg(i.id) as entries
+        from tmp_directory_entry_dir t
+        inner join directory_entry_dir i
+        on t.target = i.target and t.name = i.name and t.perms = i.perms and
+           t.atime is not distinct from i.atime and
+           t.mtime is not distinct from i.mtime and
+           t.ctime is not distinct from i.ctime
+        group by t.dir_id
+    )
+    update directory as d
+    set dir_entries = new_entries.entries
+    from new_entries
+    where d.id = new_entries.dir_id;
+
+    return;
+end
+$$;
+
+create or replace function swh_directory_entry_file_add()
+    returns void
+    language plpgsql
+as $$
+begin
+    insert into directory_entry_file (target, name, perms, atime, mtime, ctime)
+    select distinct t.target, t.name, t.perms, t.atime, t.mtime, t.ctime
+    from tmp_directory_entry_file t
+    where not exists (
+        select 1
+        from directory_entry_file i
+        where t.target = i.target and t.name = i.name and t.perms = i.perms and
+              t.atime is not distinct from i.atime and
+              t.mtime is not distinct from i.mtime and
+              t.ctime is not distinct from i.ctime);
+
+    with new_entries as (
+        select t.dir_id, array_agg(i.id) as entries
+        from tmp_directory_entry_file t
+        inner join directory_entry_file i
+        on t.target = i.target and t.name = i.name and t.perms = i.perms and
+           t.atime is not distinct from i.atime and
+           t.mtime is not distinct from i.mtime and
+           t.ctime is not distinct from i.ctime
+        group by t.dir_id
+    )
+    update directory as d
+    set file_entries = new_entries.entries
+    from new_entries
+    where d.id = new_entries.dir_id;
+
+    return;
+end
+$$;
+
+create or replace function swh_directory_entry_rev_add()
+    returns void
+    language plpgsql
+as $$
+begin
+    insert into directory_entry_rev (target, name, perms, atime, mtime, ctime)
+    select distinct t.target, t.name, t.perms, t.atime, t.mtime, t.ctime
+    from tmp_directory_entry_rev t
+    where not exists (
+        select 1
+        from directory_entry_rev i
+        where t.target = i.target and t.name = i.name and t.perms = i.perms and
+              t.atime is not distinct from i.atime and
+              t.mtime is not distinct from i.mtime and
+              t.ctime is not distinct from i.ctime);
+
+    with new_entries as (
+        select t.dir_id, array_agg(i.id) as entries
+        from tmp_directory_entry_rev t
+        inner join directory_entry_rev i
+        on t.target = i.target and t.name = i.name and t.perms = i.perms and
+           t.atime is not distinct from i.atime and
+           t.mtime is not distinct from i.mtime and
+           t.ctime is not distinct from i.ctime
+        group by t.dir_id
+    )
+    update directory as d
+    set rev_entries = new_entries.entries
+    from new_entries
+    where d.id = new_entries.dir_id;
+
+    return;
+end
+$$;
+
+create or replace function swh_directory_walk_one(walked_dir_id sha1_git)
+    returns setof directory_entry
+    language plpgsql
+as $$
+begin
+    return query
+        with dir as (
+            select id as dir_id, dir_entries, file_entries, rev_entries
+            from directory
+            where id = walked_dir_id),
+        ls_d as (select dir_id, unnest(dir_entries) as entry_id from dir),
+        ls_f as (select dir_id, unnest(file_entries) as entry_id from dir),
+        ls_r as (select dir_id, unnest(rev_entries) as entry_id from dir)
+        (select dir_id, 'dir'::directory_entry_type as type,
+                target, name, perms, atime, mtime, ctime
+         from ls_d
+         left join directory_entry_dir d on ls_d.entry_id = d.id)
+        union
+        (select dir_id, 'file'::directory_entry_type as type,
+                target, name, perms, atime, mtime, ctime
+         from ls_f
+         left join directory_entry_file d on ls_f.entry_id = d.id)
+        union
+        (select dir_id, 'rev'::directory_entry_type as type,
+                target, name, perms, atime, mtime, ctime
+         from ls_r
+         left join directory_entry_rev d on ls_r.entry_id = d.id)
+        order by name;
+    return;
+end
+$$;
+
+create or replace function swh_content_find_directory(content_id sha1)
+    returns content_dir
+    language plpgsql
+as $$
+declare
+    d content_dir;
+begin
+    with recursive path as (
+        (select dir.id as dir_id, dir_entry_f.name as name, 0 as depth
+         from directory_entry_file as dir_entry_f
+         join content on content.sha1_git = dir_entry_f.target
+         join directory as dir on dir.file_entries @> array[dir_entry_f.id]
+         where content.sha1 = content_id
+         limit 1)
+        union all
+        (select dir.id as dir_id,
+                (dir_entry_d.name || '/' || path.name)::unix_path as name,
+                path.depth + 1
+         from path
+         join directory_entry_dir as dir_entry_d on dir_entry_d.target = path.dir_id
+         join directory as dir on dir.dir_entries @> array[dir_entry_d.id]
+         limit 1)
+    )
+    select dir_id, name from path order by depth desc limit 1
+    into strict d;
+
+    return d;
+end
+$$;
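
The function comments above describe the intended bulk-loading protocol (swh_mktemp_dir_entry, then COPY, then the swh_*_add call, all in one transaction since the context directory row must exist before its entry arrays are set). A minimal sketch of such a client session against the new array-based schema follows; the directory/entry ids, the perms value, and the single-row insert standing in for COPY are illustrative assumptions, not part of this change:

begin;

-- the context directory must be present in the same transaction, since
-- swh_directory_entry_dir_add() writes the entry ids into directory.dir_entries
insert into directory (id)
    values ('\x0000000000000000000000000000000000000000');  -- hypothetical id

-- 0. create the temporary entry table (columns of directory_entry_dir,
--    minus id, plus dir_id)
select swh_mktemp_dir_entry('directory_entry_dir');

-- 1. bulk-load the entries; a plain insert stands in for COPY here
insert into tmp_directory_entry_dir (target, name, perms, dir_id)
    values ('\x1111111111111111111111111111111111111111', 'subdir', 16384,
            '\x0000000000000000000000000000000000000000');

-- 2. deduplicate into directory_entry_dir and attach the resulting ids
--    to directory.dir_entries
select swh_directory_entry_dir_add();

commit;

-- list the directory through the new array columns
select * from swh_directory_walk_one('\x0000000000000000000000000000000000000000');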