diff --git a/sql/swh-func.sql b/sql/swh-func.sql
index 2c301345..7ce0397c 100644
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -1,696 +1,696 @@
 -- create a temporary table called tmp_TBLNAME, mimicking existing table
 -- TBLNAME
 --
 -- Args:
 --     tblname: name of the table to mimick
 create or replace function swh_mktemp(tblname regclass)
     returns void
     language plpgsql
 as $$
 begin
     execute format('
         create temporary table tmp_%I
             (like %I including defaults)
             on commit drop
         ', tblname, tblname);
     return;
 end
 $$;
 
 -- create a temporary table for directory entries called tmp_TBLNAME,
 -- mimicking existing table TBLNAME with an extra dir_id (sha1_git)
 -- column, and dropping the id column.
 --
 -- This is used to create the tmp_directory_entry_ tables.
 --
 -- Args:
 --     tblname: name of the table to mimick
 create or replace function swh_mktemp_dir_entry(tblname regclass)
     returns void
     language plpgsql
 as $$
 begin
     execute format('
         create temporary table tmp_%I
             (like %I including defaults, dir_id sha1_git)
             on commit drop;
         alter table tmp_%I drop column id;
         ', tblname, tblname, tblname, tblname);
     return;
 end
 $$;
 
 -- create a temporary table for revisions called tmp_revisions,
 -- mimicking existing table revision, replacing the foreign keys to
 -- people with an email and name field
 --
 create or replace function swh_mktemp_revision()
     returns void
     language sql
 as $$
     create temporary table tmp_revision (
         like revision including defaults,
-        author_name text not null default '',
-        author_email text not null default '',
-        committer_name text not null default '',
-        committer_email text not null default ''
+        author_name bytea not null default '',
+        author_email bytea not null default '',
+        committer_name bytea not null default '',
+        committer_email bytea not null default ''
     ) on commit drop;
     alter table tmp_revision drop column author;
     alter table tmp_revision drop column committer;
 $$;
 
 -- create a temporary table for releases called tmp_release,
 -- mimicking existing table release, replacing the foreign keys to
 -- people with an email and name field
 --
 create or replace function swh_mktemp_release()
     returns void
     language sql
 as $$
     create temporary table tmp_release (
         like release including defaults,
-        author_name text not null default '',
-        author_email text not null default ''
+        author_name bytea not null default '',
+        author_email bytea not null default ''
     ) on commit drop;
     alter table tmp_release drop column author;
 $$;
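
[Reviewer note, not part of the patch] For readers new to the tmp_* convention used throughout this file: callers are expected to create the temporary table, bulk-COPY rows into it, then call the matching swh_*_missing()/swh_*_add() function, all in one transaction (the temporary tables are ON COMMIT DROP). A minimal sketch of that protocol from psql, with a hypothetical dump file:

    begin;
    select swh_mktemp('content');            -- creates tmp_content, dropped at commit
    \copy tmp_content (sha1, sha1_git, sha256, length, status) from 'contents.csv' csv
    select * from swh_content_missing();     -- checksums not yet in content (defined below)
    select swh_content_add();                -- dedup-insert into content (defined below)
    commit;                                  -- tmp_content goes away here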

 -- a content signature is a set of cryptographic checksums that we use to
 -- uniquely identify content, for the purpose of verifying if we already have
 -- some content or not during content injection
 create type content_signature as (
     sha1      sha1,
     sha1_git  sha1_git,
     sha256    sha256
 );
 
 -- check which entries of tmp_content are missing from content
 --
 -- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content,
 -- 2. call this function
 create or replace function swh_content_missing()
     returns setof content_signature
     language plpgsql
 as $$
 begin
     -- This query is critical for (single-algorithm) hash collision detection,
     -- so we cannot rely only on the fact that a single hash (e.g., sha1) is
     -- missing from the table content to conclude that a given content is
     -- missing. Ideally, we would want to (try to) add to content all entries
     -- in tmp_content that, when considering all columns together, are missing
     -- from content.
     --
     -- But doing that naively would require a *compound* index on all checksum
     -- columns; that index would not be significantly smaller than the content
     -- table itself, and therefore won't be used. Therefore we union together
     -- all contents that differ on at least one column from what is already
     -- available. If there is a collision on some (but not all) columns, the
     -- relevant tmp_content entry will be included in the set of content to be
     -- added, causing a downstream violation of unicity constraint.
     return query
         (select sha1, sha1_git, sha256 from tmp_content as tmp
          where not exists
              (select 1 from content as c where c.sha1 = tmp.sha1))
         union
         (select sha1, sha1_git, sha256 from tmp_content as tmp
          where not exists
              (select 1 from content as c where c.sha1_git = tmp.sha1_git))
         union
         (select sha1, sha1_git, sha256 from tmp_content as tmp
          where not exists
              (select 1 from content as c where c.sha256 = tmp.sha256));
     return;
 end
 $$;
 
 -- check which entries of tmp_skipped_content are missing from skipped_content
 --
 -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
 -- 2. call this function
 create or replace function swh_skipped_content_missing()
     returns setof content_signature
     language plpgsql
 as $$
 begin
     return query
         select sha1, sha1_git, sha256 from tmp_skipped_content t
         where not exists
             (select 1 from skipped_content s
              where s.sha1 is not distinct from t.sha1
                and s.sha1_git is not distinct from t.sha1_git
                and s.sha256 is not distinct from t.sha256);
     return;
 end
 $$;
 
 -- Look up content based on one or several different checksums. Return all
 -- content information if the content is found; a NULL row otherwise.
 --
 -- At least one checksum should be not NULL. If several are not NULL, they will
 -- be AND-ed together in the lookup query.
 --
 -- Note: this function is meant to be used to look up individual contents
 -- (e.g., for the web app), for batch lookup of missing content (e.g., to be
 -- added) see swh_content_missing
 create or replace function swh_content_find(
     sha1 sha1 default NULL,
     sha1_git sha1_git default NULL,
     sha256 sha256 default NULL
 )
     returns content
     language plpgsql
 as $$
 declare
     con content;
     filters text[] := array[] :: text[];  -- AND-clauses used to filter content
     q text;
 begin
     if sha1 is not null then
         filters := filters || format('sha1 = %L', sha1);
     end if;
     if sha1_git is not null then
         filters := filters || format('sha1_git = %L', sha1_git);
     end if;
     if sha256 is not null then
         filters := filters || format('sha256 = %L', sha256);
     end if;
 
     if cardinality(filters) = 0 then
         return null;
     else
         q = format('select * from content where %s',
                    array_to_string(filters, ' and '));
         execute q into con;
         return con;
     end if;
 end
 $$;
 
 -- add tmp_content entries to content, skipping duplicates
 --
 -- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content,
 -- 2. call this function
 create or replace function swh_content_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into content (sha1, sha1_git, sha256, length, status)
         select distinct sha1, sha1_git, sha256, length, status
         from tmp_content
         where (sha1, sha1_git, sha256) in
             (select * from swh_content_missing());
         -- TODO XXX use postgres 9.5 "UPSERT" support here, when available.
         -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid
         -- the extra swh_content_missing() query here.
     return;
 end
 $$;
 
 -- add tmp_skipped_content entries to skipped_content, skipping duplicates
 --
 -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
 -- 2. call this function
 create or replace function swh_skipped_content_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into skipped_content (sha1, sha1_git, sha256, length, status, reason, origin)
         select distinct sha1, sha1_git, sha256, length, status, reason, origin
         from tmp_skipped_content
         where (coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')) in
             (select coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')
              from swh_skipped_content_missing());
         -- TODO XXX use postgres 9.5 "UPSERT" support here, when available.
         -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid
         -- the extra swh_content_missing() query here.
     return;
 end
 $$;
 
 -- check which entries of tmp_directory are missing from directory
 --
 -- operates in bulk: 0. swh_mktemp(directory), 1. COPY to tmp_directory,
 -- 2. call this function
 create or replace function swh_directory_missing()
     returns setof sha1_git
     language plpgsql
 as $$
 begin
     return query
         select id from tmp_directory t
         where not exists (
             select 1 from directory d
             where d.id = t.id);
     return;
 end
 $$;
 
 create type directory_entry_type as enum('file', 'dir', 'rev');
 
 -- Add tmp_directory_entry_* entries to directory_entry_* and directory,
 -- skipping duplicates in directory_entry_*. This is a generic function that
 -- works on all kind of directory entries.
 --
 -- operates in bulk: 0. swh_mktemp_dir_entry('directory_entry_*'), 1 COPY to
 -- tmp_directory_entry_*, 2. call this function
 --
 -- Assumption: this function is used in the same transaction that inserts the
 -- context directory in table "directory".
 create or replace function swh_directory_entry_add(typ directory_entry_type)
     returns void
     language plpgsql
 as $$
 begin
     execute format('
         insert into directory_entry_%1$s (target, name, perms)
         select distinct t.target, t.name, t.perms
         from tmp_directory_entry_%1$s t
         where not exists (
             select 1
             from directory_entry_%1$s i
             where t.target = i.target and t.name = i.name and t.perms = i.perms)
     ', typ);
 
     execute format('
         with new_entries as (
             select t.dir_id, array_agg(i.id) as entries
             from tmp_directory_entry_%1$s t
             inner join directory_entry_%1$s i
             using (target, name, perms)
             group by t.dir_id
         )
         update directory as d
         set %1$s_entries = new_entries.entries
         from new_entries
         where d.id = new_entries.dir_id
     ', typ);
 
     return;
 end
 $$;
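
[Reviewer note] swh_content_find() above assembles its WHERE clause dynamically from whichever checksums are non-NULL, AND-ing them together. A quick sanity-check sketch from psql, reusing checksums that appear in the test fixtures later in this patch:

    select * from swh_content_find(sha1 := '\x34973274ccef6ab4dfaaf86599792fa9c3fe4689');
    -- several non-NULL checksums are AND-ed together in a single lookup:
    select * from swh_content_find(
        sha1     := '\x34973274ccef6ab4dfaaf86599792fa9c3fe4689',
        sha1_git := '\xd81cc0710eb6cf9efd5b920a8453e1e07157b6cd');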

 -- a directory listing entry with all the metadata
 --
 -- can be used to list a directory, and retrieve all the data in one go.
 create type directory_entry as (
     dir_id   sha1_git,              -- id of the parent directory
     type     directory_entry_type,  -- type of entry
     target   sha1_git,              -- id of target
     name     unix_path,             -- path name, relative to containing dir
     perms    file_perms             -- unix-like permissions
 );
 
 -- List a single level of directory walked_dir_id
 create or replace function swh_directory_walk_one(walked_dir_id sha1_git)
     returns setof directory_entry
     language sql
 as $$
     with dir as (
         select id as dir_id, dir_entries, file_entries, rev_entries
         from directory
         where id = walked_dir_id),
     ls_d as (select dir_id, unnest(dir_entries) as entry_id from dir),
     ls_f as (select dir_id, unnest(file_entries) as entry_id from dir),
     ls_r as (select dir_id, unnest(rev_entries) as entry_id from dir)
     (select dir_id, 'dir'::directory_entry_type as type,
             target, name, perms
      from ls_d
      left join directory_entry_dir d on ls_d.entry_id = d.id)
     union
     (select dir_id, 'file'::directory_entry_type as type,
             target, name, perms
      from ls_f
      left join directory_entry_file d on ls_f.entry_id = d.id)
     union
     (select dir_id, 'rev'::directory_entry_type as type,
             target, name, perms
      from ls_r
      left join directory_entry_rev d on ls_r.entry_id = d.id)
     order by name;
 $$;
 
 -- List all revision IDs starting from a given revision, going back in time
 --
 -- TODO ordering: should be breadth-first right now (what do we want?)
 -- TODO ordering: ORDER BY parent_rank somewhere?
 create or replace function swh_revision_list(root_revision sha1_git)
     returns setof sha1_git
     language sql
 as $$
     with recursive rev_list(id) as (
         (select id from revision where id = root_revision)
         union
         (select parent_id
          from revision_history as h
          join rev_list on h.id = rev_list.id)
     )
     select * from rev_list;
 $$;
 
 -- Detailed entry in a revision log
 create type revision_log_entry as (
     id                     sha1_git,
     date                   timestamptz,
     date_offset            smallint,
     committer_date         timestamptz,
     committer_date_offset  smallint,
     type                   revision_type,
     directory              sha1_git,
     message                bytea,
-    author_name            text,
-    author_email           text,
-    committer_name         text,
-    committer_email        text
+    author_name            bytea,
+    author_email           bytea,
+    committer_name         bytea,
+    committer_email        bytea
 );
 
 -- "git style" revision log. Similar to swh_revision_list(), but returning all
 -- information associated to each revision, and expanding authors/committers
 create or replace function swh_revision_log(root_revision sha1_git)
     returns setof revision_log_entry
     language sql
 as $$
     select revision.id, date, date_offset,
            committer_date, committer_date_offset,
            type, directory, message,
            author.name as author_name, author.email as author_email,
            committer.name as committer_name, committer.email as committer_email
     from swh_revision_list(root_revision) as rev_list
     join revision on revision.id = rev_list
     join person as author on revision.author = author.id
     join person as committer on revision.committer = committer.id;
 $$;
 
 -- List missing revisions from tmp_revision
 create or replace function swh_revision_missing()
     returns setof sha1_git
     language plpgsql
 as $$
 begin
     return query
         select id from tmp_revision t
         where not exists (
             select 1 from revision r
             where r.id = t.id);
     return;
 end
 $$;
 
 -- Create entries in person from tmp_revision
 create or replace function swh_person_add_from_revision()
     returns void
     language plpgsql
 as $$
 begin
     with t as (
         select author_name as name, author_email as email from tmp_revision
         union
         select committer_name as name, committer_email as email from tmp_revision
     ) insert into person (name, email)
     select distinct name, email from t
     where not exists (
         select 1
         from person p
         where t.name = p.name and t.email = p.email
     );
     return;
 end
 $$;
 
 -- Create entries in revision from tmp_revision
 create or replace function swh_revision_add()
     returns void
     language plpgsql
 as $$
 begin
     perform swh_person_add_from_revision();
 
     insert into revision (id, date, date_offset, committer_date,
                           committer_date_offset, type, directory, message,
                           author, committer)
     select t.id, t.date, t.date_offset, t.committer_date,
            t.committer_date_offset, t.type, t.directory, t.message,
            a.id, c.id
     from tmp_revision t
     left join person a on a.name = t.author_name and a.email = t.author_email
     left join person c on c.name = t.committer_name and c.email = t.committer_email;
     return;
 end
 $$;
 
 -- List missing releases from tmp_release
 create or replace function swh_release_missing()
     returns setof sha1_git
     language plpgsql
 as $$
 begin
     return query
         select id from tmp_release t
         where not exists (
             select 1 from release r
             where r.id = t.id);
     return;
 end
 $$;
 
 -- Create entries in person from tmp_release
 create or replace function swh_person_add_from_release()
     returns void
     language plpgsql
 as $$
 begin
     with t as (
         select distinct author_name as name, author_email as email
         from tmp_release
     ) insert into person (name, email)
     select name, email from t
     where not exists (
         select 1
         from person p
         where t.name = p.name and t.email = p.email
     );
     return;
 end
 $$;
 
 -- Create entries in release from tmp_release
 create or replace function swh_release_add()
     returns void
     language plpgsql
 as $$
 begin
     perform swh_person_add_from_release();
 
     insert into release (id, revision, date, date_offset, name, comment, author)
     select t.id, t.revision, t.date, t.date_offset, t.name, t.comment, a.id
     from tmp_release t
     left join person a on a.name = t.author_name and a.email = t.author_email;
     return;
 end
 $$;
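
[Reviewer note] With the tmp_* columns switched to bytea, the bulk revision flow is otherwise unchanged: COPY raw author/committer bytes into tmp_revision, then let swh_revision_add() deduplicate them into person. A hedged sketch (the revision id below is hypothetical):

    begin;
    select swh_mktemp_revision();
    -- loader COPYs rows into tmp_revision here, author_*/committer_* as raw bytes
    select swh_revision_add();   -- calls swh_person_add_from_revision() itself
    commit;

    -- full ancestry with authors expanded; decoding names is the client's job now
    select id, encode(author_name, 'escape') as author
    from swh_revision_log('\x0000000000000000000000000000000000000000');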

 -- add tmp_occurrence_history entries to occurrence_history
 --
 -- operates in bulk: 0. swh_mktemp(occurrence_history), 1. COPY to tmp_occurrence_history,
 -- 2. call this function
 create or replace function swh_occurrence_history_add()
     returns void
     language plpgsql
 as $$
 begin
     -- Update intervals we have the data to update
     with new_intervals as (
         select t.origin, t.branch, t.authority, t.validity,
                o.validity - t.validity as new_validity
         from tmp_occurrence_history t
         left join occurrence_history o
         using (origin, branch, authority)
         where o.origin is not null),
     -- do not update intervals if they would become empty (perfect overlap)
     to_update as (
         select * from new_intervals
         where not isempty(new_validity))
     update occurrence_history o set validity = t.new_validity
     from to_update t
     where o.origin = t.origin and o.branch = t.branch and o.authority = t.authority;
 
     -- Now only insert intervals that aren't already present
     insert into occurrence_history (origin, branch, revision, authority, validity)
     select distinct origin, branch, revision, authority, validity
     from tmp_occurrence_history t
     where not exists (
         select 1 from occurrence_history o
         where o.origin = t.origin and o.branch = t.branch and
               o.authority = t.authority and o.revision = t.revision and
               o.validity = t.validity);
     return;
 end
 $$;
 
 -- Absolute path: directory reference + complete path relative to it
 create type content_dir as (
     directory  sha1_git,
     path       unix_path
 );
 
 -- Find the containing directory of a given content, specified by sha1
 -- (note: *not* sha1_git).
 --
 -- Return a pair (dir_it, path) where path is a UNIX path that, from the
 -- directory root, reach down to a file with the desired content. Return NULL
 -- if no match is found.
 --
 -- In case of multiple paths (i.e., pretty much always), an arbitrary one is
 -- chosen.
 create or replace function swh_content_find_directory(content_id sha1)
     returns content_dir
     language sql
 as $$
     with recursive path as (
         -- Recursively build a path from the requested content to a root
         -- directory. Each iteration returns a pair (dir_id, filename) where
         -- filename is relative to dir_id. Stops when no parent directory can
         -- be found.
         (select dir.id as dir_id, dir_entry_f.name as name, 0 as depth
          from directory_entry_file as dir_entry_f
          join content on content.sha1_git = dir_entry_f.target
          join directory as dir on dir.file_entries @> array[dir_entry_f.id]
          where content.sha1 = content_id
          limit 1)
         union all
         (select dir.id as dir_id,
                 (dir_entry_d.name || '/' || path.name)::unix_path as name,
                 path.depth + 1
          from path
          join directory_entry_dir as dir_entry_d
              on dir_entry_d.target = path.dir_id
          join directory as dir
              on dir.dir_entries @> array[dir_entry_d.id]
          limit 1)
     )
     select dir_id, name from path order by depth desc limit 1;
 $$;
 
 -- Walk the revision history starting from a given revision, until a matching
 -- occurrence is found. Return all occurrence information if one is found, NULL
 -- otherwise.
 create or replace function swh_revision_find_occurrence(revision_id sha1_git)
     returns occurrence
     language plpgsql
 as $$
 declare
     occ occurrence%ROWTYPE;
     rev sha1_git;
 begin
     -- first check to see if revision_id is already pointed by an occurrence
     select origin, branch, revision
     from occurrence_history as occ_hist
     where occ_hist.revision = revision_id
     order by upper(occ_hist.validity)  -- TODO filter by authority?
     limit 1
     into occ;
 
     -- no occurrence point to revision_id, walk up the history
     if not found then
         -- recursively walk the history, stopping immediately before a revision
         -- pointed to by an occurrence.
         -- TODO find a nicer way to stop at, but *including*, that revision
         with recursive revlog as (
             (select revision_id as rev_id, 0 as depth)
             union all
             (select hist.parent_id as rev_id, revlog.depth + 1
              from revlog
              join revision_history as hist on hist.id = revlog.rev_id
              and not exists(select 1 from occurrence_history
                             where revision = hist.parent_id)
              limit 1)
         )
         select rev_id from revlog order by depth desc limit 1 into rev;
         if not found then return null; end if;
 
         -- as we stopped before a pointed by revision, look it up again and
         -- return its data
         select origin, branch, revision
         from revision_history as rev_hist, occurrence_history as occ_hist
         where rev_hist.id = rev
         and occ_hist.revision = rev_hist.parent_id
         order by upper(occ_hist.validity)  -- TODO filter by authority?
         limit 1
         into occ;
     end if;
 
     return occ;  -- might be NULL
 end
 $$;
 
 -- Occurrence of some content in a given context
 create type content_occurrence as (
     origin_type  text,
     origin_url   text,
     branch       text,
     revision_id  sha1_git,
     path         unix_path
 );
 
 -- Given the sha1 of some content, look up an occurrence that points to a
 -- revision, which in turns reference (transitively) a tree containing the
 -- content. Answer the question: "where/when did SWH see a given content"?
 -- Return information about an arbitrary occurrence/revision/tree if one is
 -- found, NULL otherwise.
 create or replace function swh_content_find_occurrence(content_id sha1)
     returns content_occurrence
     language plpgsql
 as $$
 declare
     dir content_dir;
     rev sha1_git;
     occ occurrence%ROWTYPE;
     coc content_occurrence;
 begin
     -- each step could fail if no results are found, and that's OK
     select * from swh_content_find_directory(content_id)  -- look up directory
     into dir;
     if not found then return null; end if;
 
     select id from revision where directory = dir.directory  -- look up revision
     limit 1
     into rev;
     if not found then return null; end if;
 
     select * from swh_revision_find_occurrence(rev)  -- look up occurrence
     into occ;
     if not found then return null; end if;
 
     select origin.type, origin.url, occ.branch, rev, dir.path
     from origin
     where origin.id = occ.origin
     into coc;
 
     return coc;  -- might be NULL
 end
 $$;
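
[Reviewer note] The three lookup functions above compose into the "where/when did SWH see this content?" pipeline: content sha1 -> containing directory (swh_content_find_directory) -> some revision rooted at that directory -> some occurrence (swh_revision_find_occurrence). A hedged psql sketch, reusing a sha1 from the test fixtures below:

    select origin_type, origin_url, branch, revision_id, encode(path, 'escape') as path
    from swh_content_find_occurrence('\x61c2b3a30496d329e21af70dd2d7e097046d07b7');
    -- returns a NULL row whenever any step of the walk finds nothing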
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
index 4c0d4655..485fc15a 100644
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -1,324 +1,324 @@
 ---
 --- Software Heritage Data Model
 ---
 
 -- drop schema if exists swh cascade;
 -- create schema swh;
 -- set search_path to swh;
 
 create table dbversion
 (
     version     int primary key,
     release     timestamptz,
     description text
 );
 
 insert into dbversion(version, release, description)
-      values(21, now(), 'Work In Progress');
+      values(22, now(), 'Work In Progress');
 
 -- a SHA1 checksum (not necessarily originating from Git)
 create domain sha1 as bytea check (length(value) = 20);
 
 -- a Git object ID, i.e., a SHA1 checksum
 create domain sha1_git as bytea check (length(value) = 20);
 
 -- a SHA256 checksum
 create domain sha256 as bytea check (length(value) = 32);
 
 -- UNIX path (absolute, relative, individual path component, etc.)
 create domain unix_path as bytea;
 
 -- a set of UNIX-like access permissions, as manipulated by, e.g., chmod
 create domain file_perms as int;
 
 create type content_status as enum ('absent', 'visible', 'hidden');
 
 -- An origin is a place, identified by an URL, where software can be found. We
 -- support different kinds of origins, e.g., git and other VCS repositories,
 -- web pages that list tarballs URLs (e.g., http://www.kernel.org), indirect
 -- tarball URLs (e.g., http://www.example.org/latest.tar.gz), etc. The key
 -- feature of an origin is that it can be *fetched* (wget, git clone, svn
 -- checkout, etc.) to retrieve all the contained software.
 create table origin
 (
     id    bigserial primary key,
     type  text,  -- TODO use an enum here (?)
     url   text not null
 );
 
 -- Checksums about actual file content. Note that the content itself is not
 -- stored in the DB, but on external (key-value) storage. A single checksum is
 -- used as key there, but the other can be used to verify that we do not inject
 -- content collisions not knowingly.
 create table content
 (
     sha1      sha1 primary key,
     sha1_git  sha1_git not null,
     sha256    sha256 not null,
     length    bigint not null,
     ctime     timestamptz not null default now(),
               -- creation time, i.e. time of (first) injection into the storage
     status    content_status not null default 'visible'
 );
 
 create unique index on content(sha1_git);
 create unique index on content(sha256);
 
 -- Content we have seen but skipped for some reason. This table is
 -- separate from the content table as we might not have the sha1
 -- checksum of that data (for instance when we inject git
 -- repositories, objects that are too big will be skipped here, and we
 -- will only know their sha1_git). 'reason' contains the reason the
 -- content was skipped. origin is a nullable column allowing to find
 -- out which origin contains that skipped content.
 create table skipped_content
 (
     sha1      sha1,
     sha1_git  sha1_git,
     sha256    sha256,
     length    bigint not null,
     ctime     timestamptz not null default now(),
     status    content_status not null default 'absent',
     reason    text not null,
     origin    bigint references origin(id),
     unique (sha1, sha1_git, sha256)
 );
 
 -- those indexes support multiple NULL values.
 create unique index on skipped_content(sha1);
 create unique index on skipped_content(sha1_git);
 create unique index on skipped_content(sha256);
 
 -- An organization (or part thereof) that might be in charge of running
 -- software projects. Examples: Debian, GNU, GitHub, Apache, The Linux
 -- Foundation. The data model is hierarchical (via parent_id) and might store
 -- sub-branches of existing organizations. The key feature of an organization
 -- is that it can be *listed* to retrieve information about its content, i.e:
 -- sub-organizations, projects, origins.
 create table organization
 (
     id           bigserial primary key,
     parent_id    bigint references organization(id),
     name         text not null,
     description  text,
     homepage     text,
     list_engine  text,         -- crawler to be used to org's content
     list_url     text,         -- root URL to start the listing
     list_params  json,         -- org-specific listing parameter
     latest_list  timestamptz   -- last time the org's content has been listed
 );
 
 -- Log of all organization listings (i.e., organization crawling) that have
 -- been done in the past, or are still ongoing. Similar to fetch_history, but
 -- for organizations.
 create table list_history
 (
     id            bigserial primary key,
     organization  bigint references organization(id),
     date          timestamptz not null,
     status        boolean,   -- true if and only if the listing has been successful
     result        json,      -- more detailed return value, depending on status
     stdout        text,
     stderr        text,
     duration      interval   -- fetch duration of NULL if still ongoing
 );
 
 -- Log of all origin fetches (i.e., origin crawling) that have been done in the
 -- past, or are still ongoing. Similar to list_history, but for origins.
 create table fetch_history
 (
     id        bigserial primary key,
     origin    bigint references origin(id),
     date      timestamptz not null,
     status    boolean,   -- true if and only if the fetch has been successful
     result    json,      -- more detailed returned values, times, etc...
     stdout    text,
     stderr    text,      -- null when status is true, filled otherwise
     duration  interval   -- fetch duration of NULL if still ongoing
 );
 
 -- A specific software project, e.g., the Linux kernel, Apache httpd. A
 -- software project is version-less at this level, but is associated to several
 -- metadata. Metadata can evolve over time, this table only contains the most
 -- recent version of them; for old versions of project see table
 -- project_history.
 create table project
 (
     id            bigserial primary key,
     organization  bigint references organization(id),  -- the "owning" organization
     origin        bigint references origin(id),  -- where to find project releases
     name          text,
     description   text,
     homepage      text,
     doap          jsonb
     -- other kinds of metadata/software project description ontologies can be
     -- added here, in addition to DOAP
 );
 
 -- History of project metadata. Time-sensitive version of the table project.
 create table project_history
 (
     id            bigserial primary key,
     project       bigint references project(id),
     validity      tstzrange,
     organization  bigint references organization(id),
     origin        bigint references origin(id),
     name          text,
     description   text,
     homepage      text,
     doap          jsonb
 );
 
 -- A file-system directory. A directory is a list of directory entries (see
 -- tables: directory_entry_{dir,file}).
 --
 -- To list the contents of a directory:
 -- 1. list the contained directory_entry_dir using array dir_entries
 -- 2. list the contained directory_entry_file using array file_entries
 -- 3. list the contained directory_entry_rev using array rev_entries
 -- 4. UNION
 --
 -- Synonyms/mappings:
 -- * git: tree
 create table directory
 (
     id            sha1_git primary key,
     dir_entries   bigint[],  -- sub-directories, reference directory_entry_dir
     file_entries  bigint[],  -- contained files, reference directory_entry_file
     rev_entries   bigint[]   -- mounted revisions, reference directory_entry_rev
 );
 
 create index on directory using gin (dir_entries);
 create index on directory using gin (file_entries);
 create index on directory using gin (rev_entries);
 
 -- A directory entry pointing to a sub-directory.
 create table directory_entry_dir
 (
     id      bigserial primary key,
     target  sha1_git,    -- id of target directory
     name    unix_path,   -- path name, relative to containing dir
     perms   file_perms   -- unix-like permissions
 );
 
 create unique index on directory_entry_dir(target, name, perms);
 
 -- A directory entry pointing to a file.
 create table directory_entry_file
 (
     id      bigserial primary key,
     target  sha1_git,    -- id of target file
     name    unix_path,   -- path name, relative to containing dir
     perms   file_perms   -- unix-like permissions
 );
 
 create unique index on directory_entry_file(target, name, perms);
 
 -- A directory entry pointing to a revision.
 create table directory_entry_rev
 (
     id      bigserial primary key,
     target  sha1_git,    -- id of target revision
     name    unix_path,   -- path name, relative to containing dir
     perms   file_perms   -- unix-like permissions
 );
 
 create unique index on directory_entry_rev(target, name, perms);
 
 create table person
 (
     id     bigserial primary key,
-    name   text not null default '',
-    email  text not null default ''
+    name   bytea not null default '',
+    email  bytea not null default ''
 );
 
 create unique index on person(name, email);
 
 create type revision_type as enum ('git', 'tar', 'dsc');
 
 -- A snapshot of a software project at a specific point in time.
 --
 -- Synonyms/mappings:
 -- * git / subversion / etc: commit
 -- * tarball: a specific tarball
 --
 -- Revisions are organized as DAGs. Each revision points to 0, 1, or more (in
 -- case of merges) parent revisions. Each revision points to a directory, i.e.,
 -- a file-system tree containing files and directories.
 create table revision
 (
     id                     sha1_git primary key,
     date                   timestamptz,
     date_offset            smallint,
     committer_date         timestamptz,
     committer_date_offset  smallint,
     type                   revision_type not null,
     directory              sha1_git,  -- file-system tree
     message                bytea,
     author                 bigint references person(id),
     committer              bigint references person(id)
 );
 
 -- either this table or the sha1_git[] column on the revision table
 create table revision_history
 (
     id           sha1_git references revision(id),
     parent_id    sha1_git,
     parent_rank  int not null default 0,
         -- parent position in merge commits, 0-based
     primary key (id, parent_rank)
 );
 
 -- The content of software origins is indexed starting from top-level pointers
 -- called "branches". Every time we fetch some origin we store in this table
 -- where the branches pointed to at fetch time.
 --
 -- Synonyms/mappings:
 -- * git: ref (in the "git update-ref" sense)
 create table occurrence_history
 (
     origin     bigint references origin(id),
     branch     text,      -- e.g., "master" (for VCS), or "sid" (for Debian)
     revision   sha1_git,  -- ref target, e.g., commit id
     authority  bigint references organization(id) not null,
                -- who is claiming to have seen the occurrence.
                -- Note: SWH is such an authority, and has an entry in
                -- the organization table.
     validity   tstzrange, -- The time validity of this table entry. If the upper
                           -- bound is missing, the entry is still valid.
     exclude using gist (origin with =,
                         branch with =,
                         revision with =,
                         authority with =,
                         validity with &&),
     -- unicity exclusion constraint on lines where the same value is found for
     -- `origin`, `reference`, `revision`, `authority` and overlapping values for
     -- `validity`.
     primary key (origin, branch, revision, authority, validity)
 );
 
 -- Materialized view of occurrence_history, storing the *current* value of each
 -- branch, as last seen by SWH.
 create table occurrence
 (
     origin    bigint references origin(id),
     branch    text,
     revision  sha1_git,
     primary key(origin, branch, revision)
 );
 
 -- A "memorable" point in the development history of a project.
 --
 -- Synonyms/mappings:
 -- * git: tag (of the annotated kind, otherwise they are just references)
 -- * tarball: the release version number
 create table release
 (
     id           sha1_git primary key,
     revision     sha1_git,
     date         timestamptz,
     date_offset  smallint,
     name         text,
     comment      bytea,
     author       bigint references person(id)
 );
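
[Reviewer note] The person.name/email switch from text to bytea is the core of this patch: VCS author identities are arbitrary byte strings with no guarantee of being valid UTF-8, so storing them as text can make ingestion fail outright. With bytea they round-trip verbatim. A minimal sketch (the name below is the deliberately non-UTF-8 fixture b'St\xc3fano Zacchiroli' used in the tests):

    insert into person (name, email)
    values ('\x5374c366616e6f205a6163636869726f6c69',  -- 0xc3 not followed by a
                                                       -- continuation byte: invalid UTF-8
            'stefano@example.com');
    select encode(name, 'escape') from person;         -- decoding is now the client's problem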
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
index 13a407ea..7a8b82e4 100644
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -1,490 +1,490 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 import os
 import psycopg2
 import shutil
 import tempfile
 import unittest
 
 from nose.tools import istest
 from nose.plugins.attrib import attr
 
 from swh.core.tests.db_testing import DbTestFixture
 from swh.core.hashutil import hex_to_hash
 from swh.storage import Storage
 
 TEST_DIR = os.path.dirname(os.path.abspath(__file__))
 TEST_DATA_DIR = os.path.join(TEST_DIR, '../../../../swh-storage-testdata')
 
 
 @attr('db')
 class AbstractTestStorage(DbTestFixture):
     """Base class for Storage testing.
 
     This class is used as-is to test local storage (see TestStorage
     below) and remote storage (see TestRemoteStorage in
     test_remote_storage.py. We need to have the two classes inherit
     from this base class separately to avoid nosetests running the
     tests from the base class twice.
 
     """
     TEST_DB_DUMP = os.path.join(TEST_DATA_DIR, 'dumps/swh.dump')
 
     def setUp(self):
         super().setUp()
         self.objroot = tempfile.mkdtemp()
         self.storage = Storage(self.conn, self.objroot)
 
         self.cont = {
             'data': b'42\n',
             'length': 3,
             'sha1': hex_to_hash(
                 '34973274ccef6ab4dfaaf86599792fa9c3fe4689'),
             'sha1_git': hex_to_hash(
                 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'),
             'sha256': hex_to_hash(
                 '673650f936cb3b0a2f93ce09d81be107'
                 '48b1b203c19e8176b4eefc1964a0cf3a'),
         }
 
         self.cont2 = {
             'data': b'4242\n',
             'length': 5,
             'sha1': hex_to_hash(
                 '61c2b3a30496d329e21af70dd2d7e097046d07b7'),
             'sha1_git': hex_to_hash(
                 '36fade77193cb6d2bd826161a0979d64c28ab4fa'),
             'sha256': hex_to_hash(
                 '859f0b154fdb2d630f45e1ecae4a8629'
                 '15435e663248bb8461d914696fc047cd'),
         }
 
         self.missing_cont = {
             'data': b'missing\n',
             'length': 8,
             'sha1': hex_to_hash(
                 'f9c24e2abb82063a3ba2c44efd2d3c797f28ac90'),
             'sha1_git': hex_to_hash(
                 '33e45d56f88993aae6a0198013efa80716fd8919'),
             'sha256': hex_to_hash(
                 '6bbd052ab054ef222c1c87be60cd191a'
                 'ddedd24cc882d1f5f7f7be61dc61bb3a'),
         }
 
         self.skipped_cont = {
             'length': 1024 * 1024 * 200,
             'sha1_git': hex_to_hash(
                 '33e45d56f88993aae6a0198013efa80716fd8920'),
             'reason': 'Content too long',
             'status': 'absent',
         }
 
         self.skipped_cont2 = {
             'length': 1024 * 1024 * 300,
             'sha1_git': hex_to_hash(
                 '33e45d56f88993aae6a0198013efa80716fd8921'),
             'reason': 'Content too long',
             'status': 'absent',
         }
 
         self.dir = {
             'id': b'4\x013\x422\x531\x000\xf51\xe62\xa73\xff7\xc3\xa90',
             'entries': [
                 {
                     'name': b'foo',
                     'type': 'file',
                     'target': self.cont['sha1_git'],
                     'perms': 0o644,
                 },
                 {
                     'name': b'bar\xc3',
                     'type': 'dir',
                     'target': b'12345678901234567890',
                     'perms': 0o2000,
                 },
             ],
         }
 
         self.dir2 = {
             'id': b'4\x013\x422\x531\x000\xf51\xe62\xa73\xff7\xc3\xa95',
             'entries': [
                 {
                     'name': b'oof',
                     'type': 'file',
                     'target': self.cont2['sha1_git'],
                     'perms': 0o644,
                 }
             ],
         }
 
         self.revision = {
             'id': b'56789012345678901234',
             'message': 'hello',
             'author_name': 'Nicolas Dandrimont',
             'author_email': 'nicolas@example.com',
-            'committer_name': 'Stefano Zacchiroli',
+            'committer_name': b'St\xc3fano Zacchiroli',
             'committer_email': 'stefano@example.com',
             'parents': [b'01234567890123456789'],
             'date': datetime.datetime(2015, 1, 1, 22, 0, 0),
             'date_offset': 120,
             'committer_date': datetime.datetime(2015, 1, 2, 22, 0, 0),
             'committer_date_offset': -120,
             'type': 'git',
             'directory': self.dir['id'],
         }
 
         self.revision2 = {
             'id': b'87659012345678904321',
             'message': 'hello',
             'author_name': 'Roberto Dicosmo',
             'author_email': 'roberto@example.com',
             'committer_name': 'tony',
             'committer_email': 'ar@dumont.fr',
             'parents': [b'01234567890123456789'],
             'date': datetime.datetime(2015, 1, 1, 22, 0, 0),
             'date_offset': 120,
             'committer_date': datetime.datetime(2015, 1, 2, 22, 0, 0),
             'committer_date_offset': -120,
             'type': 'git',
             'directory': self.dir2['id'],
         }
 
         self.origin = {
             'url': 'file:///dev/null',
             'type': 'git',
         }
 
         self.origin2 = {
             'url': 'file:///dev/zero',
             'type': 'git',
         }
 
         self.occurrence = {
             'branch': 'master',
             'revision': b'67890123456789012345',
             'authority': 1,
             'validity': datetime.datetime(2015, 1, 1, 23, 0, 0,
                                           tzinfo=datetime.timezone.utc),
         }
 
         self.occurrence2 = {
             'branch': 'master',
             'revision': self.revision2['id'],
             'authority': 1,
             'validity': datetime.datetime(2015, 1, 1, 23, 0, 0,
                                           tzinfo=datetime.timezone.utc),
         }
 
     def tearDown(self):
         shutil.rmtree(self.objroot)
 
         self.cursor.execute("""SELECT table_name FROM information_schema.tables
                                WHERE table_schema = %s""", ('public',))
 
         tables = set(table for (table,) in self.cursor.fetchall())
         tables -= {'dbversion', 'organization'}
 
         for table in tables:
             self.cursor.execute('truncate table %s cascade' % table)
 
         self.conn.commit()
 
         super().tearDown()
 
     @istest
     def content_add(self):
         cont = self.cont
 
         self.storage.content_add([cont])
         if hasattr(self.storage, 'objstorage'):
             self.assertIn(cont['sha1'], self.storage.objstorage)
         self.cursor.execute('SELECT sha1, sha1_git, sha256, length, status'
                             ' FROM content WHERE sha1 = %s',
                             (cont['sha1'],))
         datum = self.cursor.fetchone()
         self.assertEqual(
             (datum[0].tobytes(), datum[1].tobytes(), datum[2].tobytes(),
              datum[3], datum[4]),
             (cont['sha1'], cont['sha1_git'], cont['sha256'],
              cont['length'], 'visible'))
 
     @istest
     def content_add_collision(self):
         cont1 = self.cont
 
         # create (corrupted) content with same sha1{,_git} but != sha256
         cont1b = cont1.copy()
         sha256_array = bytearray(cont1b['sha256'])
         sha256_array[0] += 1
         cont1b['sha256'] = bytes(sha256_array)
 
         with self.assertRaises(psycopg2.IntegrityError):
             self.storage.content_add([cont1, cont1b])
 
     @istest
     def skipped_content_add(self):
         cont = self.skipped_cont
         cont2 = self.skipped_cont2
 
         self.storage.content_add([cont])
         self.storage.content_add([cont2])
 
         self.cursor.execute('SELECT sha1, sha1_git, sha256, length, status,'
                             'reason FROM skipped_content ORDER BY sha1_git')
 
         datum = self.cursor.fetchone()
         self.assertEqual(
             (datum[0], datum[1].tobytes(), datum[2],
              datum[3], datum[4], datum[5]),
             (None, cont['sha1_git'], None,
              cont['length'], 'absent', 'Content too long'))
 
         datum2 = self.cursor.fetchone()
         self.assertEqual(
             (datum2[0], datum2[1].tobytes(), datum2[2],
              datum2[3], datum2[4], datum2[5]),
             (None, cont2['sha1_git'], None,
              cont2['length'], 'absent', 'Content too long'))
 
     @istest
     def content_missing(self):
         cont2 = self.cont2
         missing_cont = self.missing_cont
         self.storage.content_add([cont2])
 
         gen = self.storage.content_missing([cont2, missing_cont])
 
         self.assertEqual(list(gen), [missing_cont['sha1']])
 
     @istest
     def content_exist_with_present_content(self):
         # 1. with something to find
         cont = self.cont
         self.storage.content_add([cont])
 
         actually_present = self.storage.content_exist({'sha1': cont['sha1']})
 
         self.assertEquals(actually_present, True, "Should be present")
 
         # 2. with something to find
         actually_present = self.storage.content_exist(
             {'sha1_git': cont['sha1_git']})
 
         self.assertEquals(actually_present, True, "Should be present")
 
         # 3. with something to find
         actually_present = self.storage.content_exist(
             {'sha256': cont['sha256']})
 
         self.assertEquals(actually_present, True, "Should be present")
 
         # 4. with something to find
         actually_present = self.storage.content_exist(
             {'sha1': cont['sha1'],
              'sha1_git': cont['sha1_git'],
              'sha256': cont['sha256']})
 
         self.assertEquals(actually_present, True, "Should be present")
 
     @istest
     def content_exist_with_non_present_content(self):
         # 1. with something that does not exist
         missing_cont = self.missing_cont
 
         actually_present = self.storage.content_exist(
             {'sha1': missing_cont['sha1']})
 
         self.assertEquals(actually_present, False, "Should be missing")
 
         # 2. with something that does not exist
         actually_present = self.storage.content_exist(
             {'sha1_git': missing_cont['sha1_git']})
 
         self.assertEquals(actually_present, False, "Should be missing")
 
         # 3. with something that does not exist
         actually_present = self.storage.content_exist(
             {'sha256': missing_cont['sha256']})
 
         self.assertEquals(actually_present, False, "Should be missing")
 
     @istest
     def content_exist_bad_input(self):
         # 1. with bad input
         with self.assertRaises(ValueError):
             self.storage.content_exist({})  # empty is bad
 
         # 2. with bad input
         with self.assertRaises(ValueError):
             self.storage.content_exist(
                 {'unknown-sha1': 'something'})  # not the right key
 
     @istest
     def directory_add(self):
         init_missing = list(self.storage.directory_missing([self.dir['id']]))
         self.assertEqual([self.dir['id']], init_missing)
 
         self.storage.directory_add([self.dir])
 
         stored_data = list(self.storage.directory_get(self.dir['id']))
 
         data_to_store = [
             (self.dir['id'], ent['type'], ent['target'], ent['name'],
              ent['perms'])
             for ent in sorted(self.dir['entries'],
                               key=lambda ent: ent['name'])
         ]
 
         self.assertEqual(data_to_store, stored_data)
 
         after_missing = list(self.storage.directory_missing([self.dir['id']]))
         self.assertEqual([], after_missing)
 
     @istest
     def revision_add(self):
         init_missing = self.storage.revision_missing([self.revision['id']])
         self.assertEqual([self.revision['id']], list(init_missing))
 
         self.storage.revision_add([self.revision])
 
         end_missing = self.storage.revision_missing([self.revision['id']])
         self.assertEqual([], list(end_missing))
 
     @istest
     def origin_add(self):
         self.assertIsNone(self.storage.origin_get(self.origin))
 
         id = self.storage.origin_add_one(self.origin)
 
         self.assertEqual(self.storage.origin_get(self.origin), id)
 
     @istest
     def occurrence_add(self):
         origin_id = self.storage.origin_add_one(self.origin2)
 
         revision = self.revision.copy()
         revision['id'] = self.occurrence['revision']
         self.storage.revision_add([revision])
 
         occur = self.occurrence
         occur['origin'] = origin_id
         self.storage.occurrence_add([occur])
         self.storage.occurrence_add([occur])
 
         test_query = '''select origin, branch, revision, authority, validity
                         from occurrence_history
                         order by origin, validity'''
 
         self.cursor.execute(test_query)
         ret = self.cursor.fetchall()
         self.assertEqual(len(ret), 1)
         self.assertEqual((ret[0][0], ret[0][1], ret[0][2].tobytes(),
                           ret[0][3]),
                          (occur['origin'], occur['branch'],
                           occur['revision'], occur['authority']))
         self.assertEqual(ret[0][4].lower, occur['validity'])
         self.assertEqual(ret[0][4].lower_inc, True)
         self.assertEqual(ret[0][4].upper, datetime.datetime.max)
 
         orig_validity = occur['validity']
         occur['validity'] += datetime.timedelta(hours=10)
         self.storage.occurrence_add([occur])
 
         self.cursor.execute(test_query)
         ret = self.cursor.fetchall()
         self.assertEqual(len(ret), 2)
         self.assertEqual(ret[0][4].lower, orig_validity)
         self.assertEqual(ret[0][4].lower_inc, True)
         self.assertEqual(ret[0][4].upper, occur['validity'])
         self.assertEqual(ret[0][4].upper_inc, False)
         self.assertEqual(ret[1][4].lower, occur['validity'])
         self.assertEqual(ret[1][4].lower_inc, True)
         self.assertEqual(ret[1][4].upper, datetime.datetime.max)
 
     @istest
     def content_find_occurrence_with_present_content(self):
         # 1. with something to find
         # given
         self.storage.content_add([self.cont2])
         self.storage.directory_add([self.dir2])  # point to self.cont
         self.storage.revision_add([self.revision2])  # points to self.dir
         origin_id = self.storage.origin_add_one(self.origin2)
         occurrence = self.occurrence2
         occurrence.update({'origin': origin_id})
         self.storage.occurrence_add([occurrence])
 
         # when
         occ = self.storage.content_find_occurrence(
             {'sha1': self.cont2['sha1']})
 
         # then
         self.assertEquals(occ['origin_type'], self.origin2['type'])
         self.assertEquals(occ['origin_url'], self.origin2['url'])
         self.assertEquals(occ['branch'], self.occurrence2['branch'])
         self.assertEquals(occ['revision'], self.revision2['id'])
         self.assertEquals(occ['path'], self.dir2['entries'][0]['name'])
 
         occ2 = self.storage.content_find_occurrence(
             {'sha1_git': self.cont2['sha1_git']})
 
         self.assertEquals(occ2['origin_type'], self.origin2['type'])
         self.assertEquals(occ2['origin_url'], self.origin2['url'])
         self.assertEquals(occ2['branch'], self.occurrence2['branch'])
         self.assertEquals(occ2['revision'], self.revision2['id'])
         self.assertEquals(occ2['path'], self.dir2['entries'][0]['name'])
 
         occ3 = self.storage.content_find_occurrence(
             {'sha256': self.cont2['sha256']})
 
         self.assertEquals(occ3['origin_type'], self.origin2['type'])
         self.assertEquals(occ3['origin_url'], self.origin2['url'])
         self.assertEquals(occ3['branch'], self.occurrence2['branch'])
         self.assertEquals(occ3['revision'], self.revision2['id'])
         self.assertEquals(occ3['path'], self.dir2['entries'][0]['name'])
 
     @istest
     def content_find_occurrence_with_non_present_content(self):
         # 1. with something that does not exist
         missing_cont = self.missing_cont
 
         occ = self.storage.content_find_occurrence(
             {'sha1': missing_cont['sha1']})
 
         self.assertEquals(occ, None,
                           "Content does not exist so no occurrence")
 
         # 2. with something that does not exist
         occ = self.storage.content_find_occurrence(
             {'sha1_git': missing_cont['sha1_git']})
 
         self.assertEquals(occ, None,
                           "Content does not exist so no occurrence")
 
         # 3. with something that does not exist
         occ = self.storage.content_find_occurrence(
             {'sha256': missing_cont['sha256']})
 
         self.assertEquals(occ, None,
                           "Content does not exist so no occurrence")
 
     @istest
     def content_find_occurrence_bad_input(self):
         # 1. with bad input
         with self.assertRaises(ValueError) as cm:
             self.storage.content_find_occurrence({})  # empty is bad
         self.assertIn('content keys', cm.exception.args[0])
 
         # 2. with bad input
         with self.assertRaises(ValueError) as cm:
             self.storage.content_find_occurrence(
                 {'unknown-sha1': 'something'})  # not the right key
         self.assertIn('content keys', cm.exception.args[0])
 
 
 class TestStorage(AbstractTestStorage, unittest.TestCase):
     """Test the local storage"""
     pass
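
[Reviewer note] The occurrence_add test above exercises swh_occurrence_history_add()'s range arithmetic: re-adding the same occurrence with a later validity timestamp should truncate the old row's tstzrange and insert a new one. The same check can be run by hand in SQL (a hedged sketch):

    select branch, lower(validity), upper(validity), upper_inc(validity)
    from occurrence_history
    order by validity;
    -- expected after the second add: an old row whose upper bound is the new
    -- validity instant (exclusive), and a new row whose upper bound is the
    -- maximum timestamp, matching the assertions in occurrence_add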