diff --git a/sql/Makefile b/sql/Makefile index 20a10b1ba..825616686 100644 --- a/sql/Makefile +++ b/sql/Makefile @@ -1,54 +1,55 @@ # Depends: postgresql-client, postgresql-autodoc DBNAME = softwareheritage-dev DOCDIR = autodoc -SQL_INIT = swh-init.sql -SQL_ENUMS = swh-enums.sql -SQL_SCHEMA = swh-schema.sql -SQL_FUNC = swh-func.sql -SQL_DATA = swh-data.sql -SQLS = $(SQL_INIT) $(SQL_ENUMS) $(SQL_SCHEMA) $(SQL_FUNC) $(SQL_DATA) +SQL_INIT = swh-init.sql +SQL_ENUMS = swh-enums.sql +SQL_SCHEMA = swh-schema.sql +SQL_FUNC = swh-func.sql +SQL_DATA = swh-data.sql +SQL_INDEX = swh-indexes.sql +SQL_TRIGGER = swh-triggers.sql +SQLS = $(SQL_INIT) $(SQL_ENUMS) $(SQL_SCHEMA) $(SQL_FUNC) $(SQL_INDEX) $(SQL_TRIGGER) $(SQL_DATA) PSQL_BIN = psql -PSQL_FLAGS = --single-transaction --echo-all -X +PSQL_FLAGS = --echo-all -X -v ON_ERROR_STOP=1 PSQL = $(PSQL_BIN) $(PSQL_FLAGS) - all: createdb: createdb-stamp createdb-stamp: $(SQL_INIT) createdb $(DBNAME) touch $@ filldb: filldb-stamp filldb-stamp: createdb-stamp cat $(SQLS) | $(PSQL) $(DBNAME) touch $@ dropdb: -dropdb $(DBNAME) dumpdb: swh.dump swh.dump: filldb-stamp pg_dump -Fc $(DBNAME) > $@ doc: autodoc-stamp $(DOCDIR)/swh.pdf autodoc-stamp: filldb-stamp test -d $(DOCDIR)/ || mkdir $(DOCDIR) postgresql_autodoc -d $(DBNAME) -f $(DOCDIR)/swh cp -a $(DOCDIR)/swh.dot $(DOCDIR)/swh.dot.orig touch $@ $(DOCDIR)/swh.pdf: clusters.dot autodoc-stamp bin/dot_add_content $(DOCDIR)/swh.dot.orig clusters.dot > $(DOCDIR)/swh.dot dot -T pdf $(DOCDIR)/swh.dot > $(DOCDIR)/swh.pdf clean: rm -rf *-stamp $(DOCDIR)/ distclean: clean dropdb rm -f swh.dump .PHONY: all initdb createdb dropdb doc clean diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql new file mode 100644 index 000000000..e2fd58dc4 --- /dev/null +++ b/sql/swh-indexes.sql @@ -0,0 +1,272 @@ +-- content + +create unique index concurrently content_pkey on content(sha1); +create unique index concurrently on content(sha1_git); +create unique index concurrently on content(sha256); +create index concurrently on content(ctime); -- TODO use a BRIN index here (postgres >= 9.5) +create index concurrently on content(object_id); + +alter table content add primary key using index content_pkey; + + +-- entity_history + +create unique index concurrently entity_history_pkey on entity_history(id); +create index concurrently on entity_history(uuid); +create index concurrently on entity_history(name); + +alter table entity_history add primary key using index entity_history_pkey; + +-- entity + +create unique index concurrently entity_pkey on entity(uuid); + +create index concurrently on entity(name); +create index concurrently on entity using gin(lister_metadata jsonb_path_ops); + +alter table entity add primary key using index entity_pkey; +alter table entity add constraint entity_parent_fkey foreign key (parent) references entity(uuid) deferrable initially deferred not valid; +alter table entity validate constraint entity_parent_fkey; +alter table entity add constraint entity_last_id_fkey foreign key (last_id) references entity_history(id) not valid; +alter table entity validate constraint entity_last_id_fkey; + +-- entity_equivalence + +create unique index concurrently entity_equivalence_pkey on entity_equivalence(entity1, entity2); +alter table entity_equivalence add primary key using index entity_equivalence_pkey; + + +alter table entity_equivalence add constraint "entity_equivalence_entity1_fkey" foreign key (entity1) references entity(uuid) not valid; +alter table entity_equivalence validate constraint entity_equivalence_entity1_fkey; +alter table entity_equivalence add constraint "entity_equivalence_entity2_fkey" foreign key (entity2) references entity(uuid) not valid; +alter table entity_equivalence validate constraint entity_equivalence_entity2_fkey; +alter table entity_equivalence add constraint "order_entities" check (entity1 < entity2) not valid; +alter table entity_equivalence validate constraint order_entities; + +-- listable_entity + +create unique index concurrently listable_entity_pkey on listable_entity(uuid); +alter table listable_entity add primary key using index listable_entity_pkey; + +alter table listable_entity add constraint listable_entity_uuid_fkey foreign key (uuid) references entity(uuid) not valid; +alter table listable_entity validate constraint listable_entity_uuid_fkey; + +-- list_history + +create unique index concurrently list_history_pkey on list_history(id); +alter table list_history add primary key using index list_history_pkey; + +alter table list_history add constraint list_history_entity_fkey foreign key (entity) references listable_entity(uuid) not valid; +alter table list_history validate constraint list_history_entity_fkey; + +-- origin +create unique index concurrently origin_pkey on origin(id); +alter table origin add primary key using index origin_pkey; + +create index concurrently on origin(type, url); + +alter table origin add constraint origin_lister_fkey foreign key (lister) references listable_entity(uuid) not valid; +alter table origin validate constraint origin_lister_fkey; + +alter table origin add constraint origin_project_fkey foreign key (project) references entity(uuid) not valid; +alter table origin validate constraint origin_project_fkey; + +-- skipped_content + +alter table skipped_content add constraint skipped_content_sha1_sha1_git_sha256_key unique (sha1, sha1_git, sha256); + +create unique index concurrently on skipped_content(sha1); +create unique index concurrently on skipped_content(sha1_git); +create unique index concurrently on skipped_content(sha256); +create index concurrently on skipped_content(object_id); + +alter table skipped_content add constraint skipped_content_origin_fkey foreign key (origin) references origin(id) not valid; +alter table skipped_content validate constraint skipped_content_origin_fkey; + +-- fetch_history + +create unique index concurrently fetch_history_pkey on fetch_history(id); +alter table fetch_history add primary key using index fetch_history_pkey; + +alter table fetch_history add constraint fetch_history_origin_fkey foreign key (origin) references origin(id) not valid; +alter table fetch_history validate constraint fetch_history_origin_fkey; + +-- directory + +create unique index concurrently directory_pkey on directory(id); +alter table directory add primary key using index directory_pkey; + +create index concurrently on directory using gin (dir_entries); +create index concurrently on directory using gin (file_entries); +create index concurrently on directory using gin (rev_entries); +create index concurrently on directory(object_id); + +-- directory_entry_dir + +create unique index concurrently directory_entry_dir_pkey on directory_entry_dir(id); +alter table directory_entry_dir add primary key using index directory_entry_dir_pkey; + +create unique index concurrently on directory_entry_dir(target, name, perms); + +-- directory_entry_file + +create unique index concurrently directory_entry_file_pkey on directory_entry_file(id); +alter table directory_entry_file add primary key using index directory_entry_file_pkey; + +create unique index concurrently on directory_entry_file(target, name, perms); + +-- directory_entry_rev + +create unique index concurrently directory_entry_rev_pkey on directory_entry_rev(id); +alter table directory_entry_rev add primary key using index directory_entry_rev_pkey; + +create unique index concurrently on directory_entry_rev(target, name, perms); + +-- person +create unique index concurrently person_pkey on person(id); +alter table person add primary key using index person_pkey; + +create unique index concurrently on person(fullname); +create index concurrently on person(name); +create index concurrently on person(email); + +-- revision +create unique index concurrently revision_pkey on revision(id); +alter table revision add primary key using index revision_pkey; + +alter table revision add constraint revision_author_fkey foreign key (author) references person(id) not valid; +alter table revision validate constraint revision_author_fkey; +alter table revision add constraint revision_committer_fkey foreign key (committer) references person(id) not valid; +alter table revision validate constraint revision_committer_fkey; + +create index concurrently on revision(directory); +create index concurrently on revision(object_id); + +-- revision_history +create unique index concurrently revision_history_pkey on revision_history(id, parent_rank); +alter table revision_history add primary key using index revision_history_pkey; + +create index concurrently on revision_history(parent_id); + +alter table revision_history add constraint revision_history_id_fkey foreign key (id) references revision(id) not valid; +alter table revision_history validate constraint revision_history_id_fkey; + +-- origin_visit +create unique index concurrently origin_visit_pkey on origin_visit(origin, visit); +alter table origin_visit add primary key using index origin_visit_pkey; + +create index concurrently on origin_visit(date); + +alter table origin_visit add constraint origin_visit_origin_fkey foreign key (origin) references origin(id) not valid; +alter table origin_visit validate constraint origin_visit_origin_fkey; + +-- occurrence_history +create unique index concurrently occurrence_history_pkey on occurrence_history(object_id); +alter table occurrence_history add primary key using index occurrence_history_pkey; + +create index concurrently on occurrence_history(target, target_type); +create index concurrently on occurrence_history(origin, branch); +create unique index concurrently on occurrence_history(origin, branch, target, target_type); + +alter table occurrence_history add constraint occurrence_history_origin_fkey foreign key (origin) references origin(id) not valid; +alter table occurrence_history validate constraint occurrence_history_origin_fkey; + +-- occurrence +create unique index concurrently occurrence_pkey on occurrence(origin, branch); +alter table occurrence add primary key using index occurrence_pkey; + +alter table occurrence add constraint occurrence_origin_fkey foreign key (origin) references origin(id) not valid; +alter table occurrence validate constraint occurrence_origin_fkey; + + +-- release +create unique index concurrently release_pkey on release(id); +alter table release add primary key using index release_pkey; + +create index concurrently on release(target, target_type); +create index concurrently on release(object_id); + +alter table release add constraint release_author_fkey foreign key (author) references person(id) not valid; +alter table release validate constraint release_author_fkey; + +-- cache_content_revision +create unique index concurrently cache_content_revision_pkey on cache_content_revision(content); +alter table cache_content_revision add primary key using index cache_content_revision_pkey; + +alter table cache_content_revision add constraint cache_content_revision_content_fkey foreign key (content) references content(sha1_git) not valid; +alter table cache_content_revision validate constraint cache_content_revision_content_fkey; + +-- cache_content_revision_processed +create unique index concurrently cache_content_revision_processed_pkey on cache_content_revision_processed(revision); +alter table cache_content_revision_processed add primary key using index cache_content_revision_processed_pkey; + +alter table cache_content_revision_processed add constraint cache_content_revision_processed_revision_fkey foreign key (revision) references revision(id) not valid; +alter table cache_content_revision_processed validate constraint cache_content_revision_processed_revision_fkey; + +-- cache_revision_origin +create unique index concurrently cache_revision_origin_pkey on cache_revision_origin(revision, origin, visit); +alter table cache_revision_origin add primary key using index cache_revision_origin_pkey; + +alter table cache_revision_origin add constraint cache_revision_origin_revision_fkey foreign key (revision) references revision(id) not valid; +alter table cache_revision_origin validate constraint cache_revision_origin_revision_fkey; + +alter table cache_revision_origin add constraint cache_revision_origin_origin_fkey foreign key (origin, visit) references origin_visit(origin, visit) not valid; +alter table cache_revision_origin validate constraint cache_revision_origin_origin_fkey; + +create index concurrently on cache_revision_origin(revision); + +-- indexer_configuration +create unique index concurrently indexer_configuration_pkey on indexer_configuration(id); +alter table indexer_configuration add primary key using index indexer_configuration_pkey; + +create unique index on indexer_configuration(tool_name, tool_version); + +-- content_mimetype +create unique index concurrently content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); +alter table content_mimetype add primary key using index content_mimetype_pkey; + +alter table content_mimetype add constraint content_mimetype_id_fkey foreign key (id) references content(sha1) not valid; +alter table content_mimetype validate constraint content_mimetype_id_fkey; + +alter table content_mimetype add constraint content_mimetype_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_mimetype validate constraint content_mimetype_indexer_configuration_id_fkey; + +-- content_language +create unique index concurrently content_language_pkey on content_language(id, indexer_configuration_id); +alter table content_language add primary key using index content_language_pkey; + +alter table content_language add constraint content_language_id_fkey foreign key (id) references content(sha1) not valid; +alter table content_language validate constraint content_language_id_fkey; + +alter table content_language add constraint content_language_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_language validate constraint content_language_indexer_configuration_id_fkey; + +-- content_ctags +create index concurrently on content_ctags(id); +create index concurrently on content_ctags(hash_sha1(name)); +create unique index concurrently on content_ctags(id, hash_sha1(name), kind, line, lang, indexer_configuration_id); + +alter table content_ctags add constraint content_ctags_id_fkey foreign key (id) references content(sha1) not valid; +alter table content_ctags validate constraint content_ctags_id_fkey; + +alter table content_ctags add constraint content_ctags_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_ctags validate constraint content_ctags_indexer_configuration_id_fkey; + +-- fossology_license +create unique index concurrently fossology_license_pkey on fossology_license(id); +alter table fossology_license add primary key using index fossology_license_pkey; + +create unique index on fossology_license(name); + +-- content_fossology_license +create unique index concurrently content_fossology_license_pkey on content_fossology_license(id, license_id, indexer_configuration_id); +alter table content_fossology_license add primary key using index content_fossology_license_pkey; + +alter table content_fossology_license add constraint content_fossology_license_id_fkey foreign key (id) references content(sha1) not valid; +alter table content_fossology_license validate constraint content_fossology_license_id_fkey; + +alter table content_fossology_license add constraint content_fossology_license_license_id_fkey foreign key (license_id) references fossology_license(id) not valid; +alter table content_fossology_license validate constraint content_fossology_license_license_id_fkey; + +alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey; diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql index f05c73a85..d800862c1 100644 --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -1,652 +1,460 @@ --- --- Software Heritage Data Model --- -- drop schema if exists swh cascade; -- create schema swh; -- set search_path to swh; create table dbversion ( version int primary key, release timestamptz, description text ); insert into dbversion(version, release, description) - values(97, now(), 'Work In Progress'); + values(98, now(), 'Work In Progress'); -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); -- a SHA256 checksum create domain sha256 as bytea check (length(value) = 32); -- UNIX path (absolute, relative, individual path component, etc.) create domain unix_path as bytea; -- a set of UNIX-like access permissions, as manipulated by, e.g., chmod create domain file_perms as int; -- Checksums about actual file content. Note that the content itself is not -- stored in the DB, but on external (key-value) storage. A single checksum is -- used as key there, but the other can be used to verify that we do not inject -- content collisions not knowingly. create table content ( - sha1 sha1 primary key, + sha1 sha1 not null, sha1_git sha1_git not null, sha256 sha256 not null, length bigint not null, ctime timestamptz not null default now(), -- creation time, i.e. time of (first) injection into the storage status content_status not null default 'visible', object_id bigserial ); -create unique index on content(sha1_git); -create unique index on content(sha256); -create index on content(ctime); -- TODO use a BRIN index here (postgres >= 9.5) -create index on content(object_id); - --- Asynchronous notification of new content insertions -create function notify_new_content() - returns trigger - language plpgsql -as $$ - begin - perform pg_notify('new_content', encode(new.sha1, 'hex')); - return null; - end; -$$; - -create trigger notify_new_content - after insert on content - for each row - execute procedure notify_new_content(); - -- Entities constitute a typed hierarchy of organization, hosting -- facilities, groups, people and software projects. -- -- Examples of entities: Software Heritage, Debian, GNU, GitHub, -- Apache, The Linux Foundation, the Debian Python Modules Team, the -- torvalds GitHub user, the torvalds/linux GitHub project. -- -- The data model is hierarchical (via the parent attribute) and might -- store sub-branches of existing entities. The key feature of an -- entity is might be *listed* (if it is available in listable_entity) -- to retrieve information about its content, i.e: sub-entities, -- projects, origins. -- The history of entities. Allows us to keep historical metadata -- about entities. The temporal invariant is the uuid. Root -- organization uuids are manually generated (and available in -- swh-data.sql). -- -- For generated entities (generated = true), we can provide -- generation_metadata to allow listers to retrieve the uuids of previous -- iterations of the entity. -- -- Inactive entities that have been active in the past (active = -- false) should register the timestamp at which we saw them -- deactivate, in a new entry of entity_history. create table entity_history ( - id bigserial primary key, + id bigserial not null, uuid uuid, parent uuid, -- should reference entity_history(uuid) name text not null, type entity_type not null, description text, homepage text, active boolean not null, -- whether the entity was seen on the last listing generated boolean not null, -- whether this entity has been generated by a lister lister_metadata jsonb, -- lister-specific metadata, used for queries metadata jsonb, validity timestamptz[] -- timestamps at which we have seen this entity ); -create index on entity_history(uuid); -create index on entity_history(name); - -- The entity table provides a view of the latest information on a -- given entity. It is updated via a trigger on entity_history. create table entity ( - uuid uuid primary key, - parent uuid references entity(uuid) deferrable initially deferred, + uuid uuid not null, + parent uuid, name text not null, type entity_type not null, description text, homepage text, active boolean not null, -- whether the entity was seen on the last listing generated boolean not null, -- whether this entity has been generated by a lister lister_metadata jsonb, -- lister-specific metadata, used for queries metadata jsonb, last_seen timestamptz, -- last listing time or disappearance time for active=false - last_id bigint references entity_history(id) -- last listing id + last_id bigint -- last listing id ); -create index on entity(name); -create index on entity using gin(lister_metadata jsonb_path_ops); - -- Register the equivalence between two entities. Allows sideways -- navigation in the entity table create table entity_equivalence ( - entity1 uuid references entity(uuid), - entity2 uuid references entity(uuid), - primary key (entity1, entity2), - constraint order_entities check (entity1 < entity2) + entity1 uuid, + entity2 uuid ); -- Register a lister for a specific entity. create table listable_entity ( - uuid uuid references entity(uuid) primary key, + uuid uuid, enabled boolean not null default true, -- do we list this entity automatically? list_engine text, -- crawler to be used to list entity's content list_url text, -- root URL to start the listing list_params jsonb, -- org-specific listing parameter latest_list timestamptz -- last time the entity's content has been listed ); -- Log of all entity listings (i.e., entity crawling) that have been -- done in the past, or are still ongoing. create table list_history ( - id bigserial primary key, - entity uuid references listable_entity(uuid), + id bigserial not null, + entity uuid, date timestamptz not null, status boolean, -- true if and only if the listing has been successful result jsonb, -- more detailed return value, depending on status stdout text, stderr text, duration interval -- fetch duration of NULL if still ongoing ); -- An origin is a place, identified by an URL, where software can be found. We -- support different kinds of origins, e.g., git and other VCS repositories, -- web pages that list tarballs URLs (e.g., http://www.kernel.org), indirect -- tarball URLs (e.g., http://www.example.org/latest.tar.gz), etc. The key -- feature of an origin is that it can be *fetched* (wget, git clone, svn -- checkout, etc.) to retrieve all the contained software. create table origin ( - id bigserial primary key, + id bigserial not null, type text, -- TODO use an enum here (?) url text not null, - lister uuid references listable_entity(uuid), - project uuid references entity(uuid) + lister uuid, + project uuid ); -create index on origin(type, url); - --- Asynchronous notification of new origin insertions -create function notify_new_origin() - returns trigger - language plpgsql -as $$ - begin - perform pg_notify('new_origin', new.id::text); - return null; - end; -$$; - -create trigger notify_new_origin - after insert on origin - for each row - execute procedure notify_new_origin(); - -- Content we have seen but skipped for some reason. This table is -- separate from the content table as we might not have the sha1 -- checksum of that data (for instance when we inject git -- repositories, objects that are too big will be skipped here, and we -- will only know their sha1_git). 'reason' contains the reason the -- content was skipped. origin is a nullable column allowing to find -- out which origin contains that skipped content. create table skipped_content ( sha1 sha1, sha1_git sha1_git, sha256 sha256, length bigint not null, ctime timestamptz not null default now(), status content_status not null default 'absent', reason text not null, - origin bigint references origin(id), - object_id bigserial, - unique (sha1, sha1_git, sha256) + origin bigint, + object_id bigserial ); --- Those indexes support multiple NULL values. -create unique index on skipped_content(sha1); -create unique index on skipped_content(sha1_git); -create unique index on skipped_content(sha256); -create index on skipped_content(object_id); - --- Asynchronous notification of new skipped content insertions -create function notify_new_skipped_content() - returns trigger - language plpgsql -as $$ - begin - perform pg_notify('new_skipped_content', json_build_object( - 'sha1', encode(new.sha1, 'hex'), - 'sha1_git', encode(new.sha1_git, 'hex'), - 'sha256', encode(new.sha256, 'hex') - )::text); - return null; - end; -$$; - -create trigger notify_new_skipped_content - after insert on skipped_content - for each row - execute procedure notify_new_skipped_content(); - - -- Log of all origin fetches (i.e., origin crawling) that have been done in the -- past, or are still ongoing. Similar to list_history, but for origins. create table fetch_history ( - id bigserial primary key, - origin bigint references origin(id), + id bigserial, + origin bigint, date timestamptz not null, status boolean, -- true if and only if the fetch has been successful result jsonb, -- more detailed returned values, times, etc... stdout text, stderr text, -- null when status is true, filled otherwise duration interval -- fetch duration of NULL if still ongoing ); -- A file-system directory. A directory is a list of directory entries (see -- tables: directory_entry_{dir,file}). -- -- To list the contents of a directory: -- 1. list the contained directory_entry_dir using array dir_entries -- 2. list the contained directory_entry_file using array file_entries -- 3. list the contained directory_entry_rev using array rev_entries -- 4. UNION -- -- Synonyms/mappings: -- * git: tree create table directory ( - id sha1_git primary key, + id sha1_git, dir_entries bigint[], -- sub-directories, reference directory_entry_dir file_entries bigint[], -- contained files, reference directory_entry_file rev_entries bigint[], -- mounted revisions, reference directory_entry_rev object_id bigserial -- short object identifier ); -create index on directory using gin (dir_entries); -create index on directory using gin (file_entries); -create index on directory using gin (rev_entries); -create index on directory(object_id); - --- Asynchronous notification of new directory insertions -create function notify_new_directory() - returns trigger - language plpgsql -as $$ - begin - perform pg_notify('new_directory', encode(new.id, 'hex')); - return null; - end; -$$; - -create trigger notify_new_directory - after insert on directory - for each row - execute procedure notify_new_directory(); - - -- A directory entry pointing to a sub-directory. create table directory_entry_dir ( - id bigserial primary key, + id bigserial, target sha1_git, -- id of target directory name unix_path, -- path name, relative to containing dir perms file_perms -- unix-like permissions ); -create unique index on directory_entry_dir(target, name, perms); - -- A directory entry pointing to a file. create table directory_entry_file ( - id bigserial primary key, + id bigserial, target sha1_git, -- id of target file name unix_path, -- path name, relative to containing dir perms file_perms -- unix-like permissions ); -create unique index on directory_entry_file(target, name, perms); - -- A directory entry pointing to a revision. create table directory_entry_rev ( - id bigserial primary key, + id bigserial, target sha1_git, -- id of target revision name unix_path, -- path name, relative to containing dir perms file_perms -- unix-like permissions ); -create unique index on directory_entry_rev(target, name, perms); - create table person ( - id bigserial primary key, + id bigserial, fullname bytea not null, -- freeform specification; what is actually used in the checksums -- will usually be of the form 'name ' name bytea, -- advisory: not null if we managed to parse a name email bytea -- advisory: not null if we managed to parse an email ); -create unique index on person(fullname); -create index on person(name); -create index on person(email); - -- A snapshot of a software project at a specific point in time. -- -- Synonyms/mappings: -- * git / subversion / etc: commit -- * tarball: a specific tarball -- -- Revisions are organized as DAGs. Each revision points to 0, 1, or more (in -- case of merges) parent revisions. Each revision points to a directory, i.e., -- a file-system tree containing files and directories. create table revision ( - id sha1_git primary key, + id sha1_git, date timestamptz, date_offset smallint, date_neg_utc_offset boolean, committer_date timestamptz, committer_date_offset smallint, committer_date_neg_utc_offset boolean, type revision_type not null, directory sha1_git, -- file-system tree message bytea, - author bigint references person(id), - committer bigint references person(id), + author bigint, + committer bigint, metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...) synthetic boolean not null default false, -- true if synthetic (cf. swh-loader-tar) object_id bigserial ); -create index on revision(directory); -create index on revision(object_id); - --- Asynchronous notification of new revision insertions -create function notify_new_revision() - returns trigger - language plpgsql -as $$ - begin - perform pg_notify('new_revision', encode(new.id, 'hex')); - return null; - end; -$$; - -create trigger notify_new_revision - after insert on revision - for each row - execute procedure notify_new_revision(); - -- either this table or the sha1_git[] column on the revision table create table revision_history ( - id sha1_git references revision(id), + id sha1_git, parent_id sha1_git, - parent_rank int not null default 0, + parent_rank int not null default 0 -- parent position in merge commits, 0-based - primary key (id, parent_rank) ); -create index on revision_history(parent_id); - -- The timestamps at which Software Heritage has made a visit of the given origin. create table origin_visit ( - origin bigint not null references origin(id), + origin bigint not null, visit bigint not null, date timestamptz not null, status origin_visit_status not null, - metadata jsonb, - primary key (origin, visit) + metadata jsonb ); comment on column origin_visit.origin is 'Visited origin'; comment on column origin_visit.visit is 'Visit number the visit occurred for that origin'; comment on column origin_visit.date is 'Visit date for that origin'; comment on column origin_visit.status is 'Visit status for that origin'; comment on column origin_visit.metadata is 'Metadata associated with the visit'; -create index on origin_visit(date); - --- Asynchronous notification of new origin visits -create function notify_new_origin_visit() - returns trigger - language plpgsql -as $$ - begin - perform pg_notify('new_origin_visit', json_build_object( - 'origin', new.origin, - 'visit', new.visit - )::text); - return null; - end; -$$; - -create trigger notify_new_origin_visit - after insert on origin_visit - for each row - execute procedure notify_new_origin_visit(); - -- The content of software origins is indexed starting from top-level pointers -- called "branches". Every time we fetch some origin we store in this table -- where the branches pointed to at fetch time. -- -- Synonyms/mappings: -- * git: ref (in the "git update-ref" sense) create table occurrence_history ( - origin bigint references origin(id) not null, + origin bigint not null, branch bytea not null, -- e.g., b"master" (for VCS), or b"sid" (for Debian) target sha1_git not null, -- ref target, e.g., commit id target_type object_type not null, -- ref target type object_id bigserial not null, -- short object identifier - visits bigint[] not null, -- the visits where that occurrence was valid. References + visits bigint[] not null -- the visits where that occurrence was valid. References -- origin_visit(visit), where o_h.origin = origin_visit.origin. - primary key (object_id) ); -create index on occurrence_history(target, target_type); -create index on occurrence_history(origin, branch); -create unique index on occurrence_history(origin, branch, target, target_type); -create index on occurrence_history(object_id); - -- Materialized view of occurrence_history, storing the *current* value of each -- branch, as last seen by SWH. create table occurrence ( - origin bigint references origin(id) not null, + origin bigint, branch bytea not null, target sha1_git not null, - target_type object_type not null, - primary key(origin, branch) + target_type object_type not null ); -- A "memorable" point in the development history of a project. -- -- Synonyms/mappings: -- * git: tag (of the annotated kind, otherwise they are just references) -- * tarball: the release version number create table release ( - id sha1_git primary key, + id sha1_git, target sha1_git, target_type object_type, date timestamptz, date_offset smallint, date_neg_utc_offset boolean, name bytea, comment bytea, - author bigint references person(id), + author bigint, synthetic boolean not null default false, -- true if synthetic (cf. swh-loader-tar) object_id bigserial ); -create index on release(target, target_type); -create index on release(object_id); - --- Asynchronous notification of new release insertions -create function notify_new_release() - returns trigger - language plpgsql -as $$ - begin - perform pg_notify('new_release', encode(new.id, 'hex')); - return null; - end; -$$; - -create trigger notify_new_release - after insert on release - for each row - execute procedure notify_new_release(); - -- Content provenance information caches -- https://forge.softwareheritage.org/T547 -- -- Those tables aren't expected to be exhaustive, and get filled on a case by -- case basis: absence of data doesn't mean the data is not there -- content <-> revision mapping cache -- -- semantics: "we have seen the content with given id in the given path inside -- the given revision" create table cache_content_revision ( - content sha1_git not null primary key references content(sha1_git), + content sha1_git not null, blacklisted boolean default false, revision_paths bytea[][] ); create table cache_content_revision_processed ( - revision sha1_git not null primary key references revision(id) + revision sha1_git not null ); -- revision <-> origin_visit mapping cache -- -- semantics: "we have seen the given revision in the given origin during the -- given visit" create table cache_revision_origin ( - revision sha1_git not null references revision(id), + revision sha1_git not null, origin bigint not null, - visit bigint not null, - primary key (revision, origin, visit), - foreign key (origin, visit) references origin_visit (origin, visit) + visit bigint not null ); -create index on cache_revision_origin(revision); - -- Computing metadata on sha1's contents create table indexer_configuration ( - id serial primary key not null, + id serial not null, tool_name text not null, tool_version text not null, tool_configuration jsonb ); comment on table indexer_configuration is 'Indexer''s configuration version'; comment on column indexer_configuration.id is 'Tool identifier'; comment on column indexer_configuration.tool_version is 'Tool name'; comment on column indexer_configuration.tool_version is 'Tool version'; comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; -create unique index on indexer_configuration(tool_name, tool_version); - -- Properties (mimetype, encoding, etc...) create table content_mimetype ( - id sha1 references content(sha1) not null, + id sha1 not null, mimetype bytea not null, encoding bytea not null, - indexer_configuration_id bigserial references indexer_configuration(id) not null, - primary key(id, indexer_configuration_id) + indexer_configuration_id bigserial ); comment on table content_mimetype is 'Metadata associated to a raw content'; comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; -- Language metadata create table content_language ( - id sha1 references content(sha1) not null, + id sha1 not null, lang languages not null, - indexer_configuration_id bigserial references indexer_configuration(id) not null, - primary key(id, indexer_configuration_id) + indexer_configuration_id bigserial ); comment on table content_language is 'Language information on a raw content'; comment on column content_language.lang is 'Language information'; comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; -- ctags information per content create table content_ctags ( - id sha1 references content(sha1) not null, + id sha1 not null, name text not null, kind text not null, line bigint not null, lang ctags_languages not null, - indexer_configuration_id bigserial references indexer_configuration(id) not null + indexer_configuration_id bigserial ); comment on table content_ctags is 'Ctags information on a raw content'; comment on column content_ctags.id is 'Content identifier'; comment on column content_ctags.name is 'Symbol name'; comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; comment on column content_ctags.line is 'Symbol line'; comment on column content_ctags.lang is 'Language information for that content'; comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; -create index on content_ctags(id); -create index on content_ctags(hash_sha1(name)); -create unique index on content_ctags(id, hash_sha1(name), kind, line, lang, indexer_configuration_id); - create table fossology_license( - id smallserial primary key, + id smallserial, name text not null ); comment on table fossology_license is 'Possible license recognized by license indexer'; comment on column fossology_license.id is 'License identifier'; comment on column fossology_license.name is 'License name'; -create unique index on fossology_license(name); - create table content_fossology_license ( - id sha1 references content(sha1) not null, - license_id smallserial references fossology_license(id) not null, - indexer_configuration_id bigserial references indexer_configuration(id) not null, - primary key(id, license_id, indexer_configuration_id) + id sha1 not null, + license_id smallserial not null, + indexer_configuration_id bigserial not null ); comment on table content_fossology_license is 'license associated to a raw content'; comment on column content_fossology_license.id is 'Raw content identifier'; comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; diff --git a/sql/swh-triggers.sql b/sql/swh-triggers.sql new file mode 100644 index 000000000..c0e7e721d --- /dev/null +++ b/sql/swh-triggers.sql @@ -0,0 +1,124 @@ +-- Asynchronous notification of new content insertions +create function notify_new_content() + returns trigger + language plpgsql +as $$ + begin + perform pg_notify('new_content', encode(new.sha1, 'hex')); + return null; + end; +$$; + +create trigger notify_new_content + after insert on content + for each row + execute procedure notify_new_content(); + + +-- Asynchronous notification of new origin insertions +create function notify_new_origin() + returns trigger + language plpgsql +as $$ + begin + perform pg_notify('new_origin', new.id::text); + return null; + end; +$$; + +create trigger notify_new_origin + after insert on origin + for each row + execute procedure notify_new_origin(); + + +-- Asynchronous notification of new skipped content insertions +create function notify_new_skipped_content() + returns trigger + language plpgsql +as $$ + begin + perform pg_notify('new_skipped_content', json_build_object( + 'sha1', encode(new.sha1, 'hex'), + 'sha1_git', encode(new.sha1_git, 'hex'), + 'sha256', encode(new.sha256, 'hex') + )::text); + return null; + end; +$$; + +create trigger notify_new_skipped_content + after insert on skipped_content + for each row + execute procedure notify_new_skipped_content(); + + +-- Asynchronous notification of new directory insertions +create function notify_new_directory() + returns trigger + language plpgsql +as $$ + begin + perform pg_notify('new_directory', encode(new.id, 'hex')); + return null; + end; +$$; + +create trigger notify_new_directory + after insert on directory + for each row + execute procedure notify_new_directory(); + + +-- Asynchronous notification of new revision insertions +create function notify_new_revision() + returns trigger + language plpgsql +as $$ + begin + perform pg_notify('new_revision', encode(new.id, 'hex')); + return null; + end; +$$; + +create trigger notify_new_revision + after insert on revision + for each row + execute procedure notify_new_revision(); + + +-- Asynchronous notification of new origin visits +create function notify_new_origin_visit() + returns trigger + language plpgsql +as $$ + begin + perform pg_notify('new_origin_visit', json_build_object( + 'origin', new.origin, + 'visit', new.visit + )::text); + return null; + end; +$$; + +create trigger notify_new_origin_visit + after insert on origin_visit + for each row + execute procedure notify_new_origin_visit(); + + +-- Asynchronous notification of new release insertions +create function notify_new_release() + returns trigger + language plpgsql +as $$ + begin + perform pg_notify('new_release', encode(new.id, 'hex')); + return null; + end; +$$; + +create trigger notify_new_release + after insert on release + for each row + execute procedure notify_new_release(); diff --git a/sql/upgrades/098.sql b/sql/upgrades/098.sql new file mode 100644 index 000000000..4e5310aaa --- /dev/null +++ b/sql/upgrades/098.sql @@ -0,0 +1,9 @@ +-- SWH DB schema upgrade +-- from_version: 97 +-- to_version: 98 +-- description: remove duplicate index on occurrence_history(object_id) + +insert into dbversion(version, release, description) + values(98, now(), 'Work In Progress'); + +DROP INDEX occurrence_history_object_id_idx;