diff --git a/sql/clusters.dot b/sql/clusters.dot index 2fb7eb7..d2d6055 100644 --- a/sql/clusters.dot +++ b/sql/clusters.dot @@ -1,85 +1,85 @@ subgraph "logical_grouping" { style = rounded; bgcolor = gray95; color = gray; - + subgraph cluster_meta { label = <schema versioning
version: @@VERSION@@>; dbversion; } subgraph cluster_content { label = <content>; content; skipped_content; } subgraph cluster_directory { label = <directories>; directory; directory_entry_dir; directory_entry_file; directory_entry_rev; } subgraph cluster_revision { label = <revisions>; revision; revision_history; person; } subgraph cluster_release { label = <releases>; release; } subgraph cluster_snapshots { label = <snapshots>; snapshot; snapshot_branch; snapshot_branches; } subgraph cluster_origins { label = <origins>; origin; fetch_history; origin_visit; } subgraph cluster_metadata { label = <metadata>; metadata_provider; origin_metadata; tool; } subgraph cluster_statistics { label = <statistics>; object_counts; object_counts_bucketed; } { edge [style = dashed]; # "rtcolN" identifies the N-th row (1-based) in a table, as a source # "ltcolN" identifies the N-th row (1-based) in a table, as a destination "snapshot_branch":rtcol3 -> "release":ltcol1; "snapshot_branch":rtcol3 -> "revision":ltcol1; "snapshot_branch":rtcol3 -> "directory":ltcol1; "snapshot_branch":rtcol3 -> "content":ltcol2; "directory_entry_dir":ltcol2 -> "directory":rtcol1; "directory_entry_file":rtcol2 -> "content":ltcol2; "directory_entry_file":rtcol2 -> "skipped_content":ltcol2; "directory_entry_rev":rtcol2 -> "revision":ltcol1; "directory":rtcol2 -> "directory_entry_dir":ltcol1; "directory":rtcol3 -> "directory_entry_file":ltcol1; "directory":rtcol4 -> "directory_entry_rev":ltcol1; "release":rtcol2 -> "revision":ltcol1; "revision":ltcol7 -> "directory":rtcol1; "revision_history":rtcol2 -> "revision":ltcol1; } } diff --git a/sql/upgrades/049.sql b/sql/upgrades/049.sql index 8d5573a..2e4fbbc 100644 --- a/sql/upgrades/049.sql +++ b/sql/upgrades/049.sql @@ -1,269 +1,269 @@ -- SWH DB schema upgrade -- from_version: 48 -- to_version: 49 -- description: update the schema for occurrence and occurrence_history insert into dbversion(version, release, description) values(49, now(), 'Work In Progress'); CREATE TABLE origin_visit ( origin bigint NOT NULL, visit bigint NOT NULL, "date" timestamp with time zone NOT NULL ); -- move occurrence_history to another table alter table occurrence_history rename to old_occurrence_history; alter index occurrence_history_pkey rename to old_occurrence_history_pkey; alter index occurrence_history_origin_branch_idx rename to old_occurrence_history_origin_branch_idx; alter index occurrence_history_target_target_type_idx rename to old_occurrence_history_target_target_type_idx; alter table old_occurrence_history rename constraint occurrence_history_authority_fkey to old_occurrence_history_authority_fkey; alter table old_occurrence_history rename constraint occurrence_history_origin_fkey to old_occurrence_history_origin_fkey; create table occurrence_history ( origin bigint, branch bytea, -- e.g., b"master" (for VCS), or b"sid" (for Debian) target sha1_git, -- ref target, e.g., commit id target_type object_type, -- ref target type visits bigint[], object_id bigserial -- short object identifier ); -- create origin_visit contents with origins_visited as ( select distinct origin, lower(validity) as date from old_occurrence_history where authority = '5f4d4c51-498a-4e28-88b3-b3e4e8396cba' -- swh order by origin, date ) insert into origin_visit (origin, date, visit) select origin, date, row_number() over (partition by origin) from origins_visited; ALTER TABLE origin_visit ADD CONSTRAINT origin_visit_pkey PRIMARY KEY (origin, visit); ALTER TABLE origin_visit ADD CONSTRAINT origin_visit_origin_fkey FOREIGN KEY (origin) REFERENCES origin(id); CREATE INDEX origin_visit_date_idx ON origin_visit USING btree (date); -- create new occurrence_history contents insert into occurrence_history (origin, branch, target, target_type, object_id, visits) select ooh.origin, branch, target, target_type, object_id, array[visit] from old_occurrence_history ooh left join origin_visit ov on ov.origin = ooh.origin and ov.date = lower(ooh.validity) where ov.visit is not null; ALTER TABLE occurrence_history ADD CONSTRAINT occurrence_history_pkey PRIMARY KEY (object_id), ADD CONSTRAINT occurrence_history_origin_fkey FOREIGN KEY (origin) REFERENCES origin(id); CREATE INDEX on occurrence_history(target, target_type); CREATE INDEX on occurrence_history(origin, branch); -- drop table old_occurrence_history; -- create new occurrence contents alter table occurrence drop constraint occurrence_pkey, drop constraint occurrence_origin_fkey; drop index if exists occurrence_target_target_type_idx; create or replace function update_occurrence_for_origin(origin_id bigint) returns void language sql as $$ delete from occurrence where origin = origin_id; insert into occurrence (origin, branch, target, target_type) select origin, branch, target, target_type from occurrence_history where origin = origin_id and (select visit from origin_visit where origin = origin_id order by date desc - limit 1) = any(visits); + limit 1) = any(visits); $$; create or replace function update_occurrence() returns void language plpgsql as $$ declare origin_id origin.id%type; begin for origin_id in select distinct id from origin loop perform update_occurrence_for_origin(origin_id); end loop; return; end; $$; select update_occurrence(); ALTER TABLE occurrence ADD CONSTRAINT occurrence_pkey PRIMARY KEY (origin, branch), ADD CONSTRAINT occurrence_origin_fkey FOREIGN KEY (origin) REFERENCES origin(id); CREATE INDEX occurrence_target_target_type_idx on occurrence(target, target_type); CREATE OR REPLACE FUNCTION swh_mktemp_occurrence_history() RETURNS void LANGUAGE sql AS $$ create temporary table tmp_occurrence_history( like occurrence_history including defaults, date timestamptz not null ) on commit drop; alter table tmp_occurrence_history drop column visits, drop column object_id; $$; DROP FUNCTION swh_occurrence_get_by(bigint,bytea,timestamp with time zone); CREATE OR REPLACE FUNCTION swh_occurrence_get_by(origin_id bigint, branch_name bytea = NULL::bytea, "date" timestamp with time zone = NULL::timestamp with time zone) RETURNS SETOF occurrence_history LANGUAGE plpgsql AS $$ declare filters text[] := array[] :: text[]; -- AND-clauses used to filter content visit_id bigint; q text; begin if origin_id is not null then filters := filters || format('origin = %L', origin_id); end if; if branch_name is not null then filters := filters || format('branch = %L', branch_name); end if; if date is not null then if origin_id is null then raise exception 'Needs an origin_id to filter by date.'; end if; select visit from swh_visit_find_by_date(origin_id, date) into visit_id; if visit_id is null then return; end if; filters := filters || format('%L = any(visits)', visit_id); end if; if cardinality(filters) = 0 then raise exception 'At least one filter amongst (origin_id, branch_name, validity) is needed'; else q = format('select * ' || 'from occurrence_history ' || 'where %s', array_to_string(filters, ' and ')); return query execute q; end if; end $$; CREATE OR REPLACE FUNCTION swh_occurrence_history_add() RETURNS void LANGUAGE plpgsql AS $$ declare origin_id origin.id%type; begin -- Create new visits with current_visits as ( select distinct origin, date from tmp_occurrence_history ), new_visits as ( select origin, date, (select coalesce(max(visit), 0) from origin_visit ov where ov.origin = origin) + row_number() over(partition by origin order by origin, date) from current_visits cv where not exists (select 1 from origin_visit ov where ov.origin = cv.origin and ov.date = cv.date) ) insert into origin_visit (origin, date, visit) select * from new_visits; -- Create or update occurrence_history with occurrence_history_id_visit as ( select tmp_occurrence_history.*, object_id, visits, visit from tmp_occurrence_history left join occurrence_history using(origin, target, target_type) left join origin_visit using(origin, date) ), occurrences_to_update as ( select object_id, visit from occurrence_history_id_visit where object_id is not null ), update_occurrences as ( update occurrence_history set visits = array(select unnest(occurrence_history.visits) as e union select occurrences_to_update.visit as e order by e) from occurrences_to_update where occurrence_history.object_id = occurrences_to_update.object_id ) insert into occurrence_history (origin, branch, target, target_type, visits) select origin, branch, target, target_type, ARRAY[visit] from occurrence_history_id_visit where object_id is null; -- update occurrence for origin_id in select distinct origin from tmp_occurrence_history loop perform update_occurrence_for_origin(origin_id); end loop; return; end $$; CREATE OR REPLACE FUNCTION swh_revision_find_occurrence(revision_id sha1_git) RETURNS occurrence LANGUAGE sql STABLE AS $$ select origin, branch, target, target_type from swh_revision_list_children(ARRAY[revision_id] :: bytea[]) as rev_list left join occurrence_history occ_hist on rev_list.id = occ_hist.target where occ_hist.origin is not null and occ_hist.target_type = 'revision' limit 1; $$; DROP FUNCTION swh_revision_get_by(bigint,bytea,timestamp with time zone); CREATE OR REPLACE FUNCTION swh_revision_get_by(origin_id bigint, branch_name bytea = NULL::bytea, "date" timestamp with time zone = NULL::timestamp with time zone) RETURNS SETOF revision_entry LANGUAGE sql STABLE AS $$ select r.id, r.date, r.date_offset, r.committer_date, r.committer_date_offset, r.type, r.directory, r.message, a.name, a.email, c.name, c.email, r.metadata, r.synthetic, array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank ) as parents from swh_occurrence_get_by(origin_id, branch_name, date) as occ inner join revision r on occ.target = r.id left join person a on a.id = r.author left join person c on c.id = r.committer; $$; CREATE OR REPLACE FUNCTION swh_visit_find_by_date(origin bigint, visit_date timestamp with time zone = now()) RETURNS origin_visit LANGUAGE sql STABLE AS $$ with closest_two_visits as (( select origin_visit, (date - visit_date) as interval from origin_visit where date >= visit_date order by date asc limit 1 ) union ( select origin_visit, (visit_date - date) as interval from origin_visit where date < visit_date order by date desc limit 1 )) select (origin_visit).* from closest_two_visits order by interval limit 1 $$; diff --git a/sql/upgrades/137.sql b/sql/upgrades/137.sql index bb26f77..d6220a2 100644 --- a/sql/upgrades/137.sql +++ b/sql/upgrades/137.sql @@ -1,166 +1,166 @@ -- SWH DB schema upgrade -- from_version: 136 -- to_version: 137 -- description: Add comment columns to all tables insert into dbversion(version, release, description) values(137, now(), 'Work In Progress'); -- comment for columns of dbversion table comment on table dbversion is 'Details of current db version'; comment on column dbversion.version is 'SQL schema version'; comment on column dbversion.release is 'Version deployment timestamp'; comment on column dbversion.description is 'Release description'; -- comment for columns of content table comment on table content is 'Checksums of file content which is actually stored externally'; comment on column content.sha1 is 'Content sha1 hash'; comment on column content.sha1_git is 'Git object sha1 hash'; comment on column content.sha256 is 'Content Sha256 hash'; comment on column content.blake2s256 is 'Content blake2s hash'; comment on column content.length is 'Content length'; comment on column content.ctime is 'First seen time'; comment on column content.status is 'Content status (absent, visible, hidden)'; comment on column content.object_id is 'Content identifier'; -- comment for columns of origin table comment on column origin.id is 'Artifact origin id'; comment on column origin.type is 'Type of origin'; comment on column origin.url is 'URL of origin'; -- comment for columns of skipped_content comment on table skipped_content is 'Content blobs observed, but not ingested in the archive'; comment on column skipped_content.sha1 is 'Skipped content sha1 hash'; comment on column skipped_content.sha1_git is 'Git object sha1 hash'; comment on column skipped_content.sha256 is 'Skipped content sha256 hash'; comment on column skipped_content.blake2s256 is 'Skipped content blake2s hash'; comment on column skipped_content.length is 'Skipped content length'; comment on column skipped_content.ctime is 'First seen time'; comment on column skipped_content.status is 'Skipped content status (absent, visible, hidden)'; comment on column skipped_content.reason is 'Reason for skipping'; comment on column skipped_content.origin is 'Origin table identifier'; comment on column skipped_content.object_id is 'Skipped content identifier'; -- comment for columns of fetch_history comment on table fetch_history is 'Log of all origin fetches'; comment on column fetch_history.id is 'Identifier for fetch history'; comment on column fetch_history.origin is 'Origin table identifier'; comment on column fetch_history.date is 'Fetch start time'; comment on column fetch_history.status is 'True indicates successful fetch'; comment on column fetch_history.result is 'Detailed return values, times etc'; comment on column fetch_history.stdout is 'Standard output of fetch operation'; comment on column fetch_history.stderr is 'Standard error of fetch operation'; comment on column fetch_history.duration is 'Time taken to complete fetch, NULL if ongoing'; -- comment for columns of directory comment on table directory is 'Contents of a directory, synonymous to tree (git)'; comment on column directory.id is 'Git object sha1 hash'; comment on column directory.dir_entries is 'Sub-directories, reference directory_entry_dir'; comment on column directory.file_entries is 'Contained files, reference directory_entry_file'; comment on column directory.rev_entries is 'Mounted revisions, reference directory_entry_rev'; comment on column directory.object_id is 'Short object identifier'; -- comment for columns of directory_entry_dir comment on table directory_entry_dir is 'Directory entry for directory'; comment on column directory_entry_dir.id is 'Directory identifier'; comment on column directory_entry_dir.target is 'Target directory identifier'; comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; comment on column directory_entry_dir.perms is 'Unix-like permissions'; -- comment for columns of directory_entry_file comment on table directory_entry_file is 'Directory entry for file'; comment on column directory_entry_file.id is 'File identifier'; comment on column directory_entry_file.target is 'Target file identifier'; comment on column directory_entry_file.name is 'Path name, relative to containing directory'; comment on column directory_entry_file.perms is 'Unix-like permissions'; -- comment for columns of directory_entry_rev comment on table directory_entry_rev is 'Directory entry for revision'; comment on column directory_entry_dir.id is 'Revision identifier'; comment on column directory_entry_dir.target is 'Target revision in identifier'; comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; comment on column directory_entry_dir.perms is 'Unix-like permissions'; -- comment for columns of person comment on table person is 'Person referenced in code artifact release metadata'; comment on column person.id is 'Person identifier'; comment on column person.name is 'Name'; comment on column person.email is 'Email'; comment on column person.fullname is 'Full name (raw name)'; -- comment for columns of revision -comment on table revision is 'Revision represents the state of a source code tree at a +comment on table revision is 'Revision represents the state of a source code tree at a specific point in time'; comment on column revision.id is 'Git id of sha1 checksum'; comment on column revision.date is 'Timestamp when revision was authored'; comment on column revision.date_offset is 'Authored timestamp offset from UTC'; comment on column revision.committer_date is 'Timestamp when revision was committed'; comment on column revision.committer_date_offset is 'Committed timestamp offset from UTC'; comment on column revision.type is 'Possible revision types (''git'', ''tar'', ''dsc'', ''svn'', ''hg'')'; comment on column revision.directory is 'Directory identifier'; comment on column revision.message is 'Revision message'; comment on column revision.author is 'Author identifier'; comment on column revision.committer is 'Committer identifier'; comment on column revision.synthetic is 'true iff revision has been created by Software Heritage'; comment on column revision.metadata is 'extra metadata (tarball checksums, extra commit information, etc...)'; comment on column revision.object_id is 'Object identifier'; comment on column revision.date_neg_utc_offset is 'True indicates -0 UTC offset for author timestamp'; comment on column revision.committer_date_neg_utc_offset is 'True indicates -0 UTC offset for committer timestamp'; -- comment for columns of revision_history comment on table revision_history is 'Sequence of revision history with parent and position in history'; comment on column revision_history.id is 'Revision history git object sha1 checksum'; comment on column revision_history.parent_id is 'Parent revision git object identifier'; comment on column revision_history.parent_rank is 'Parent position in merge commits, 0-based'; -- comment for columns of snapshot comment on table snapshot is 'State of a software origin as crawled by Software Heritage'; comment on column snapshot.object_id is 'Internal object identifier'; comment on column snapshot.id is 'Intrinsic snapshot identifier'; -- comment for columns of snapshot_branch comment on table snapshot_branch is 'Associates branches with objects in Heritage Merkle DAG'; comment on column snapshot_branch.object_id is 'Internal object identifier'; comment on column snapshot_branch.name is 'Branch name'; comment on column snapshot_branch.target is 'Target object identifier'; comment on column snapshot_branch.target_type is 'Target object type'; -- comment for columns of snapshot_branches comment on table snapshot_branches is 'Mapping between snapshot and their branches'; comment on column snapshot_branches.snapshot_id is 'Snapshot identifier'; comment on column snapshot_branches.branch_id is 'Branch identifier'; -- comment for columns of release comment on table release is 'Details of a software release, synonymous with a tag (git) or version number (tarball)'; comment on column release.id is 'Release git identifier'; comment on column release.target is 'Target git identifier'; comment on column release.date is 'Release timestamp'; comment on column release.date_offset is 'Timestamp offset from UTC'; comment on column release.name is 'Name'; comment on column release.comment is 'Comment'; comment on column release.author is 'Author'; comment on column release.synthetic is 'Indicates if created by Software Heritage'; comment on column release.object_id is 'Object identifier'; comment on column release.target_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp'; -- comment for columns of object_counts comment on table object_counts is 'Cache of object counts'; comment on column object_counts.object_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column object_counts.value is 'Count of objects in the table'; comment on column object_counts.last_update is 'Last update for object count'; comment on column object_counts.single_update is 'standalone (true) or bucketed counts (false)'; -- comment for columns of object_counts_bucketed comment on table object_counts_bucketed is 'Bucketed count for objects ordered by type'; comment on column object_counts_bucketed.line is 'Auto incremented idenitfier value'; comment on column object_counts_bucketed.object_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column object_counts_bucketed.identifier is 'Common identifier for bucketed objects'; comment on column object_counts_bucketed.bucket_start is 'Lower bound (inclusive) for the bucket'; comment on column object_counts_bucketed.bucket_end is 'Upper bound (exclusive) for the bucket'; comment on column object_counts_bucketed.value is 'Count of objects in the bucket'; comment on column object_counts_bucketed.last_update is 'Last update for the object count in this bucket'; diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql index dca2434..6da6bf3 100644 --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -1,469 +1,469 @@ --- --- SQL implementation of the Software Heritage data model --- -- schema versions create table dbversion ( version int primary key, release timestamptz, description text ); comment on table dbversion is 'Details of current db version'; comment on column dbversion.version is 'SQL schema version'; comment on column dbversion.release is 'Version deployment timestamp'; comment on column dbversion.description is 'Release description'; -- latest schema version insert into dbversion(version, release, description) values(143, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a Git-style salted SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); -- a SHA256 checksum create domain sha256 as bytea check (length(value) = 32); -- a blake2 checksum create domain blake2s256 as bytea check (length(value) = 32); -- UNIX path (absolute, relative, individual path component, etc.) create domain unix_path as bytea; -- a set of UNIX-like access permissions, as manipulated by, e.g., chmod create domain file_perms as int; -- Checksums about actual file content. Note that the content itself is not -- stored in the DB, but on external (key-value) storage. A single checksum is -- used as key there, but the other can be used to verify that we do not inject -- content collisions not knowingly. create table content ( sha1 sha1 not null, sha1_git sha1_git not null, sha256 sha256 not null, blake2s256 blake2s256, length bigint not null, ctime timestamptz not null default now(), -- creation time, i.e. time of (first) injection into the storage status content_status not null default 'visible', object_id bigserial ); comment on table content is 'Checksums of file content which is actually stored externally'; comment on column content.sha1 is 'Content sha1 hash'; comment on column content.sha1_git is 'Git object sha1 hash'; comment on column content.sha256 is 'Content Sha256 hash'; comment on column content.blake2s256 is 'Content blake2s hash'; comment on column content.length is 'Content length'; comment on column content.ctime is 'First seen time'; comment on column content.status is 'Content status (absent, visible, hidden)'; comment on column content.object_id is 'Content identifier'; -- An origin is a place, identified by an URL, where software source code -- artifacts can be found. We support different kinds of origins, e.g., git and -- other VCS repositories, web pages that list tarballs URLs (e.g., -- http://www.kernel.org), indirect tarball URLs (e.g., -- http://www.example.org/latest.tar.gz), etc. The key feature of an origin is -- that it can be *fetched* from (wget, git clone, svn checkout, etc.) to -- retrieve all the contained software. create table origin ( id bigserial not null, url text not null ); comment on column origin.id is 'Artifact origin id'; comment on column origin.url is 'URL of origin'; -- Content blobs observed somewhere, but not ingested into the archive for -- whatever reason. This table is separate from the content table as we might -- not have the sha1 checksum of skipped contents (for instance when we inject -- git repositories, objects that are too big will be skipped here, and we will -- only know their sha1_git). 'reason' contains the reason the content was -- skipped. origin is a nullable column allowing to find out which origin -- contains that skipped content. create table skipped_content ( sha1 sha1, sha1_git sha1_git, sha256 sha256, blake2s256 blake2s256, length bigint not null, ctime timestamptz not null default now(), status content_status not null default 'absent', reason text not null, origin bigint, object_id bigserial ); comment on table skipped_content is 'Content blobs observed, but not ingested in the archive'; comment on column skipped_content.sha1 is 'Skipped content sha1 hash'; comment on column skipped_content.sha1_git is 'Git object sha1 hash'; comment on column skipped_content.sha256 is 'Skipped content sha256 hash'; comment on column skipped_content.blake2s256 is 'Skipped content blake2s hash'; comment on column skipped_content.length is 'Skipped content length'; comment on column skipped_content.ctime is 'First seen time'; comment on column skipped_content.status is 'Skipped content status (absent, visible, hidden)'; comment on column skipped_content.reason is 'Reason for skipping'; comment on column skipped_content.origin is 'Origin table identifier'; comment on column skipped_content.object_id is 'Skipped content identifier'; -- A file-system directory. A directory is a list of directory entries (see -- tables: directory_entry_{dir,file}). -- -- To list the contents of a directory: -- 1. list the contained directory_entry_dir using array dir_entries -- 2. list the contained directory_entry_file using array file_entries -- 3. list the contained directory_entry_rev using array rev_entries -- 4. UNION -- -- Synonyms/mappings: -- * git: tree create table directory ( id sha1_git not null, dir_entries bigint[], -- sub-directories, reference directory_entry_dir file_entries bigint[], -- contained files, reference directory_entry_file rev_entries bigint[], -- mounted revisions, reference directory_entry_rev object_id bigserial -- short object identifier ); comment on table directory is 'Contents of a directory, synonymous to tree (git)'; comment on column directory.id is 'Git object sha1 hash'; comment on column directory.dir_entries is 'Sub-directories, reference directory_entry_dir'; comment on column directory.file_entries is 'Contained files, reference directory_entry_file'; comment on column directory.rev_entries is 'Mounted revisions, reference directory_entry_rev'; comment on column directory.object_id is 'Short object identifier'; -- A directory entry pointing to a (sub-)directory. create table directory_entry_dir ( id bigserial, target sha1_git not null, -- id of target directory name unix_path not null, -- path name, relative to containing dir perms file_perms not null -- unix-like permissions ); comment on table directory_entry_dir is 'Directory entry for directory'; comment on column directory_entry_dir.id is 'Directory identifier'; comment on column directory_entry_dir.target is 'Target directory identifier'; comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; comment on column directory_entry_dir.perms is 'Unix-like permissions'; -- A directory entry pointing to a file content. create table directory_entry_file ( id bigserial, target sha1_git not null, -- id of target file name unix_path not null, -- path name, relative to containing dir perms file_perms not null -- unix-like permissions ); comment on table directory_entry_file is 'Directory entry for file'; comment on column directory_entry_file.id is 'File identifier'; comment on column directory_entry_file.target is 'Target file identifier'; comment on column directory_entry_file.name is 'Path name, relative to containing directory'; comment on column directory_entry_file.perms is 'Unix-like permissions'; -- A directory entry pointing to a revision. create table directory_entry_rev ( id bigserial, target sha1_git not null, -- id of target revision name unix_path not null, -- path name, relative to containing dir perms file_perms not null -- unix-like permissions ); comment on table directory_entry_rev is 'Directory entry for revision'; comment on column directory_entry_dir.id is 'Revision identifier'; comment on column directory_entry_dir.target is 'Target revision in identifier'; comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; comment on column directory_entry_dir.perms is 'Unix-like permissions'; -- A person referenced by some source code artifacts, e.g., a VCS revision or -- release metadata. create table person ( id bigserial, name bytea, -- advisory: not null if we managed to parse a name email bytea, -- advisory: not null if we managed to parse an email fullname bytea not null -- freeform specification; what is actually used in the checksums -- will usually be of the form 'name ' ); comment on table person is 'Person referenced in code artifact release metadata'; comment on column person.id is 'Person identifier'; comment on column person.name is 'Name'; comment on column person.email is 'Email'; comment on column person.fullname is 'Full name (raw name)'; -- The state of a source code tree at a specific point in time. -- -- Synonyms/mappings: -- * git / subversion / etc: commit -- * tarball: a specific tarball -- -- Revisions are organized as DAGs. Each revision points to 0, 1, or more (in -- case of merges) parent revisions. Each revision points to a directory, i.e., -- a file-system tree containing files and directories. create table revision ( id sha1_git not null, date timestamptz, date_offset smallint, committer_date timestamptz, committer_date_offset smallint, type revision_type not null, directory sha1_git, -- source code 'root' directory message bytea, author bigint, committer bigint, synthetic boolean not null default false, -- true iff revision has been created by Software Heritage metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...) object_id bigserial, date_neg_utc_offset boolean, committer_date_neg_utc_offset boolean ); -comment on table revision is 'Revision represents the state of a source code tree at a +comment on table revision is 'Revision represents the state of a source code tree at a specific point in time'; comment on column revision.id is 'Git id of sha1 checksum'; comment on column revision.date is 'Timestamp when revision was authored'; comment on column revision.date_offset is 'Authored timestamp offset from UTC'; comment on column revision.committer_date is 'Timestamp when revision was committed'; comment on column revision.committer_date_offset is 'Committed timestamp offset from UTC'; comment on column revision.type is 'Possible revision types (''git'', ''tar'', ''dsc'', ''svn'', ''hg'')'; comment on column revision.directory is 'Directory identifier'; comment on column revision.message is 'Revision message'; comment on column revision.author is 'Author identifier'; comment on column revision.committer is 'Committer identifier'; comment on column revision.synthetic is 'true iff revision has been created by Software Heritage'; comment on column revision.metadata is 'extra metadata (tarball checksums, extra commit information, etc...)'; comment on column revision.object_id is 'Object identifier'; comment on column revision.date_neg_utc_offset is 'True indicates -0 UTC offset for author timestamp'; comment on column revision.committer_date_neg_utc_offset is 'True indicates -0 UTC offset for committer timestamp'; -- either this table or the sha1_git[] column on the revision table create table revision_history ( id sha1_git not null, parent_id sha1_git not null, parent_rank int not null default 0 -- parent position in merge commits, 0-based ); comment on table revision_history is 'Sequence of revision history with parent and position in history'; comment on column revision_history.id is 'Revision history git object sha1 checksum'; comment on column revision_history.parent_id is 'Parent revision git object identifier'; comment on column revision_history.parent_rank is 'Parent position in merge commits, 0-based'; -- Crawling history of software origins visited by Software Heritage. Each -- visit is a 3-way mapping between a software origin, a timestamp, and a -- snapshot object capturing the full-state of the origin at visit time. create table origin_visit ( origin bigint not null, visit bigint not null, date timestamptz not null, type text not null, status origin_visit_status not null, metadata jsonb, snapshot sha1_git ); comment on column origin_visit.origin is 'Visited origin'; comment on column origin_visit.visit is 'Sequential visit number for the origin'; comment on column origin_visit.date is 'Visit timestamp'; comment on column origin_visit.type is 'Type of loader that did the visit (hg, git, ...)'; comment on column origin_visit.status is 'Visit result'; comment on column origin_visit.metadata is 'Origin metadata at visit time'; comment on column origin_visit.snapshot is 'Origin snapshot at visit time'; -- A snapshot represents the entire state of a software origin as crawled by -- Software Heritage. This table is a simple mapping between (public) intrinsic -- snapshot identifiers and (private) numeric sequential identifiers. create table snapshot ( object_id bigserial not null, -- PK internal object identifier id sha1_git not null -- snapshot intrinsic identifier ); comment on table snapshot is 'State of a software origin as crawled by Software Heritage'; comment on column snapshot.object_id is 'Internal object identifier'; comment on column snapshot.id is 'Intrinsic snapshot identifier'; -- Each snapshot associate "branch" names to other objects in the Software -- Heritage Merkle DAG. This table describes branches as mappings between names -- and target typed objects. create table snapshot_branch ( object_id bigserial not null, -- PK internal object identifier name bytea not null, -- branch name, e.g., "master" or "feature/drag-n-drop" target bytea, -- target object identifier, e.g., a revision identifier target_type snapshot_target -- target object type, e.g., "revision" ); comment on table snapshot_branch is 'Associates branches with objects in Heritage Merkle DAG'; comment on column snapshot_branch.object_id is 'Internal object identifier'; comment on column snapshot_branch.name is 'Branch name'; comment on column snapshot_branch.target is 'Target object identifier'; comment on column snapshot_branch.target_type is 'Target object type'; -- Mapping between snapshots and their branches. create table snapshot_branches ( snapshot_id bigint not null, -- snapshot identifier, ref. snapshot.object_id branch_id bigint not null -- branch identifier, ref. snapshot_branch.object_id ); comment on table snapshot_branches is 'Mapping between snapshot and their branches'; comment on column snapshot_branches.snapshot_id is 'Snapshot identifier'; comment on column snapshot_branches.branch_id is 'Branch identifier'; -- A "memorable" point in time in the development history of a software -- project. -- -- Synonyms/mappings: -- * git: tag (of the annotated kind, otherwise they are just references) -- * tarball: the release version number create table release ( id sha1_git not null, target sha1_git, date timestamptz, date_offset smallint, name bytea, comment bytea, author bigint, synthetic boolean not null default false, -- true iff release has been created by Software Heritage object_id bigserial, target_type object_type not null, date_neg_utc_offset boolean ); comment on table release is 'Details of a software release, synonymous with a tag (git) or version number (tarball)'; comment on column release.id is 'Release git identifier'; comment on column release.target is 'Target git identifier'; comment on column release.date is 'Release timestamp'; comment on column release.date_offset is 'Timestamp offset from UTC'; comment on column release.name is 'Name'; comment on column release.comment is 'Comment'; comment on column release.author is 'Author'; comment on column release.synthetic is 'Indicates if created by Software Heritage'; comment on column release.object_id is 'Object identifier'; comment on column release.target_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp'; -- Tools create table tool ( id serial not null, name text not null, version text not null, configuration jsonb ); comment on table tool is 'Tool information'; comment on column tool.id is 'Tool identifier'; comment on column tool.version is 'Tool name'; comment on column tool.version is 'Tool version'; comment on column tool.configuration is 'Tool configuration: command line, flags, etc...'; create table metadata_provider ( id serial not null, provider_name text not null, provider_type text not null, provider_url text, metadata jsonb ); comment on table metadata_provider is 'Metadata provider information'; comment on column metadata_provider.id is 'Provider''s identifier'; comment on column metadata_provider.provider_name is 'Provider''s name'; comment on column metadata_provider.provider_url is 'Provider''s url'; comment on column metadata_provider.metadata is 'Other metadata about provider'; -- Discovery of metadata during a listing, loading, deposit or external_catalog of an origin -- also provides a translation to a defined json schema using a translation tool (tool_id) create table origin_metadata ( id bigserial not null, -- PK internal object identifier origin_id bigint not null, -- references origin(id) discovery_date timestamptz not null, -- when it was extracted provider_id bigint not null, -- ex: 'hal', 'lister-github', 'loader-github' tool_id bigint not null, metadata jsonb not null ); comment on table origin_metadata is 'keeps all metadata found concerning an origin'; comment on column origin_metadata.id is 'the origin_metadata object''s id'; comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found'; comment on column origin_metadata.discovery_date is 'the date of retrieval'; comment on column origin_metadata.provider_id is 'the metadata provider: github, openhub, deposit, etc.'; comment on column origin_metadata.tool_id is 'the tool used for extracting metadata: lister-github, etc.'; comment on column origin_metadata.metadata is 'metadata in json format but with original terms'; -- Keep a cache of object counts create table object_counts ( object_type text, -- table for which we're counting objects (PK) value bigint, -- count of objects in the table last_update timestamptz, -- last update for the object count in this table single_update boolean -- whether we update this table standalone (true) or through bucketed counts (false) ); comment on table object_counts is 'Cache of object counts'; comment on column object_counts.object_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column object_counts.value is 'Count of objects in the table'; comment on column object_counts.last_update is 'Last update for object count'; comment on column object_counts.single_update is 'standalone (true) or bucketed counts (false)'; create table object_counts_bucketed ( line serial not null, -- PK object_type text not null, -- table for which we're counting objects identifier text not null, -- identifier across which we're bucketing objects bucket_start bytea, -- lower bound (inclusive) for the bucket bucket_end bytea, -- upper bound (exclusive) for the bucket value bigint, -- count of objects in the bucket last_update timestamptz -- last update for the object count in this bucket ); comment on table object_counts_bucketed is 'Bucketed count for objects ordered by type'; comment on column object_counts_bucketed.line is 'Auto incremented idenitfier value'; comment on column object_counts_bucketed.object_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column object_counts_bucketed.identifier is 'Common identifier for bucketed objects'; comment on column object_counts_bucketed.bucket_start is 'Lower bound (inclusive) for the bucket'; comment on column object_counts_bucketed.bucket_end is 'Upper bound (exclusive) for the bucket'; comment on column object_counts_bucketed.value is 'Count of objects in the bucket'; comment on column object_counts_bucketed.last_update is 'Last update for the object count in this bucket';