Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9342793
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
29 KB
Subscribers
None
View Options
diff --git a/sql/swh-enums.sql b/sql/swh-enums.sql
index bc6a4b28..0ac5510f 100644
--- a/sql/swh-enums.sql
+++ b/sql/swh-enums.sql
@@ -1,146 +1,152 @@
---
--- Software Heritage Data Types
---
create type content_status as enum ('absent', 'visible', 'hidden');
+comment on type content_status is 'Content visibility';
-- Types of entities.
--
-- - organization: a root entity, usually backed by a non-profit, a
-- company, or another kind of "association". (examples: Software
-- Heritage, Debian, GNU, GitHub)
--
-- - group_of_entities: used for hierarchies, doesn't need to have a
-- concrete existence. (examples: GNU hosting facilities, Debian
-- hosting facilities, GitHub users, ...)
--
-- - hosting: a hosting facility, can usually be listed to generate
-- other data. (examples: GitHub git hosting, alioth.debian.org,
-- snapshot.debian.org)
--
-- - group_of_persons: an entity representing a group of
-- persons. (examples: a GitHub organization, a Debian team)
--
-- - person: an entity representing a person. (examples:
-- a GitHub user, a Debian developer)
--
-- - project: an entity representing a software project. (examples: a
-- GitHub project, Apache httpd, a Debian source package, ...)
create type entity_type as enum (
'organization',
'group_of_entities',
'hosting',
'group_of_persons',
'person',
'project'
);
+comment on type entity_type is 'Entity types';
create type revision_type as enum ('git', 'tar', 'dsc', 'svn');
+comment on type revision_type is 'Possible revision types';
--- the data object types stored in our data model
create type object_type as enum ('content', 'directory', 'revision', 'release');
+comment on type object_type is 'Data object types stored in data model';
create type origin_visit_status as enum (
'ongoing',
'full',
'partial'
);
-
comment on type origin_visit_status IS 'Possible visit status';
create type languages as enum ( 'abap', 'abnf', 'actionscript',
'actionscript-3', 'ada', 'adl', 'agda', 'alloy', 'ambienttalk',
'antlr', 'antlr-with-actionscript-target', 'antlr-with-c#-target',
'antlr-with-cpp-target', 'antlr-with-java-target',
'antlr-with-objectivec-target', 'antlr-with-perl-target',
'antlr-with-python-target', 'antlr-with-ruby-target', 'apacheconf',
'apl', 'applescript', 'arduino', 'aspectj', 'aspx-cs', 'aspx-vb',
'asymptote', 'autohotkey', 'autoit', 'awk', 'base-makefile', 'bash',
'bash-session', 'batchfile', 'bbcode', 'bc', 'befunge',
'blitzbasic', 'blitzmax', 'bnf', 'boo', 'boogie', 'brainfuck',
'bro', 'bugs', 'c', 'c#', 'c++', 'c-objdump', 'ca65-assembler',
'cadl', 'camkes', 'cbm-basic-v2', 'ceylon', 'cfengine3',
'cfstatement', 'chaiscript', 'chapel', 'cheetah', 'cirru', 'clay',
'clojure', 'clojurescript', 'cmake', 'cobol', 'cobolfree',
'coffeescript', 'coldfusion-cfc', 'coldfusion-html', 'common-lisp',
'component-pascal', 'coq', 'cpp-objdump', 'cpsa', 'crmsh', 'croc',
'cryptol', 'csound-document', 'csound-orchestra', 'csound-score',
'css', 'css+django/jinja', 'css+genshi-text', 'css+lasso',
'css+mako', 'css+mozpreproc', 'css+myghty', 'css+php', 'css+ruby',
'css+smarty', 'cuda', 'cypher', 'cython', 'd', 'd-objdump',
'darcs-patch', 'dart', 'debian-control-file', 'debian-sourcelist',
'delphi', 'dg', 'diff', 'django/jinja', 'docker', 'dtd', 'duel',
'dylan', 'dylan-session', 'dylanlid', 'earl-grey', 'easytrieve',
'ebnf', 'ec', 'ecl', 'eiffel', 'elixir', 'elixir-iex-session',
'elm', 'emacslisp', 'embedded-ragel', 'erb', 'erlang',
'erlang-erl-session', 'evoque', 'ezhil', 'factor', 'fancy',
'fantom', 'felix', 'fish', 'fortran', 'fortranfixed', 'foxpro',
'fsharp', 'gap', 'gas', 'genshi', 'genshi-text', 'gettext-catalog',
'gherkin', 'glsl', 'gnuplot', 'go', 'golo', 'gooddata-cl', 'gosu',
'gosu-template', 'groff', 'groovy', 'haml', 'handlebars', 'haskell',
'haxe', 'hexdump', 'html', 'html+cheetah', 'html+django/jinja',
'html+evoque', 'html+genshi', 'html+handlebars', 'html+lasso',
'html+mako', 'html+myghty', 'html+php', 'html+smarty', 'html+twig',
'html+velocity', 'http', 'hxml', 'hy', 'hybris', 'idl', 'idris',
'igor', 'inform-6', 'inform-6-template', 'inform-7', 'ini', 'io',
'ioke', 'irc-logs', 'isabelle', 'j', 'jade', 'jags', 'jasmin',
'java', 'java-server-page', 'javascript', 'javascript+cheetah',
'javascript+django/jinja', 'javascript+genshi-text',
'javascript+lasso', 'javascript+mako', 'javascript+mozpreproc',
'javascript+myghty', 'javascript+php', 'javascript+ruby',
'javascript+smarty', 'jcl', 'json', 'json-ld', 'julia',
'julia-console', 'kal', 'kconfig', 'koka', 'kotlin', 'lasso',
'lean', 'lesscss', 'lighttpd-configuration-file', 'limbo', 'liquid',
'literate-agda', 'literate-cryptol', 'literate-haskell',
'literate-idris', 'livescript', 'llvm', 'logos', 'logtalk', 'lsl',
'lua', 'makefile', 'mako', 'maql', 'mask', 'mason', 'mathematica',
'matlab', 'matlab-session', 'minid', 'modelica', 'modula-2',
'moinmoin/trac-wiki-markup', 'monkey', 'moocode', 'moonscript',
'mozhashpreproc', 'mozpercentpreproc', 'mql', 'mscgen',
'msdos-session', 'mupad', 'mxml', 'myghty', 'mysql', 'nasm',
'nemerle', 'nesc', 'newlisp', 'newspeak',
'nginx-configuration-file', 'nimrod', 'nit', 'nix', 'nsis', 'numpy',
'objdump', 'objdump-nasm', 'objective-c', 'objective-c++',
'objective-j', 'ocaml', 'octave', 'odin', 'ooc', 'opa',
'openedge-abl', 'pacmanconf', 'pan', 'parasail', 'pawn', 'perl',
'perl6', 'php', 'pig', 'pike', 'pkgconfig', 'pl/pgsql',
'postgresql-console-(psql)', 'postgresql-sql-dialect', 'postscript',
'povray', 'powershell', 'powershell-session', 'praat', 'prolog',
'properties', 'protocol-buffer', 'puppet', 'pypy-log', 'python',
'python-3', 'python-3.0-traceback', 'python-console-session',
'python-traceback', 'qbasic', 'qml', 'qvto', 'racket', 'ragel',
'ragel-in-c-host', 'ragel-in-cpp-host', 'ragel-in-d-host',
'ragel-in-java-host', 'ragel-in-objective-c-host',
'ragel-in-ruby-host', 'raw-token-data', 'rconsole', 'rd', 'rebol',
'red', 'redcode', 'reg', 'resourcebundle', 'restructuredtext',
'rexx', 'rhtml', 'roboconf-graph', 'roboconf-instances',
'robotframework', 'rpmspec', 'rql', 'rsl', 'ruby',
'ruby-irb-session', 'rust', 's', 'sass', 'scala',
'scalate-server-page', 'scaml', 'scheme', 'scilab', 'scss', 'shen',
'slim', 'smali', 'smalltalk', 'smarty', 'snobol', 'sourcepawn',
'sparql', 'sql', 'sqlite3con', 'squidconf', 'stan', 'standard-ml',
'supercollider', 'swift', 'swig', 'systemverilog', 'tads-3', 'tap',
'tcl', 'tcsh', 'tcsh-session', 'tea', 'termcap', 'terminfo',
'terraform', 'tex', 'text-only', 'thrift', 'todotxt',
'trafficscript', 'treetop', 'turtle', 'twig', 'typescript',
'urbiscript', 'vala', 'vb.net', 'vctreestatus', 'velocity',
'verilog', 'vgl', 'vhdl', 'viml', 'x10', 'xml', 'xml+cheetah',
'xml+django/jinja', 'xml+evoque', 'xml+lasso', 'xml+mako',
'xml+myghty', 'xml+php', 'xml+ruby', 'xml+smarty', 'xml+velocity',
'xquery', 'xslt', 'xtend', 'xul+mozpreproc', 'yaml', 'yaml+jinja',
- 'zephir', 'unknown' );
+ 'zephir', 'unknown'
+);
+comment on type languages is 'Languages recognized by language indexer';
create type ctags_languages as enum ( 'Ada', 'AnsiblePlaybook', 'Ant',
'Asm', 'Asp', 'Autoconf', 'Automake', 'Awk', 'Basic', 'BETA', 'C',
'C#', 'C++', 'Clojure', 'Cobol', 'CoffeeScript [disabled]', 'CSS',
'ctags', 'D', 'DBusIntrospect', 'Diff', 'DosBatch', 'DTS', 'Eiffel',
'Erlang', 'Falcon', 'Flex', 'Fortran', 'gdbinit [disabled]',
'Glade', 'Go', 'HTML', 'Iniconf', 'Java', 'JavaProperties',
'JavaScript', 'JSON', 'Lisp', 'Lua', 'M4', 'Make', 'man [disabled]',
'MatLab', 'Maven2', 'Myrddin', 'ObjectiveC', 'OCaml', 'OldC
[disabled]', 'OldC++ [disabled]', 'Pascal', 'Perl', 'Perl6', 'PHP',
'PlistXML', 'pod', 'Protobuf', 'Python', 'PythonLoggingConfig', 'R',
'RelaxNG', 'reStructuredText', 'REXX', 'RpmSpec', 'Ruby', 'Rust',
'Scheme', 'Sh', 'SLang', 'SML', 'SQL', 'SVG', 'SystemdUnit',
'SystemVerilog', 'Tcl', 'Tex', 'TTCN', 'Vera', 'Verilog', 'VHDL',
- 'Vim', 'WindRes', 'XSLT', 'YACC', 'Yaml', 'YumRepo', 'Zephir' );
+ 'Vim', 'WindRes', 'XSLT', 'YACC', 'Yaml', 'YumRepo', 'Zephir'
+);
+comment on type ctags_languages is 'Languages recognized by ctags indexer';
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
index a84b7150..df9e1f48 100644
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -1,605 +1,605 @@
---
--- Software Heritage Data Model
---
-- drop schema if exists swh cascade;
-- create schema swh;
-- set search_path to swh;
create table dbversion
(
version int primary key,
release timestamptz,
description text
);
insert into dbversion(version, release, description)
- values(89, now(), 'Work In Progress');
+ values(90, now(), 'Work In Progress');
-- a SHA1 checksum (not necessarily originating from Git)
create domain sha1 as bytea check (length(value) = 20);
-- a Git object ID, i.e., a SHA1 checksum
create domain sha1_git as bytea check (length(value) = 20);
-- a SHA256 checksum
create domain sha256 as bytea check (length(value) = 32);
-- UNIX path (absolute, relative, individual path component, etc.)
create domain unix_path as bytea;
-- a set of UNIX-like access permissions, as manipulated by, e.g., chmod
create domain file_perms as int;
-- Checksums about actual file content. Note that the content itself is not
-- stored in the DB, but on external (key-value) storage. A single checksum is
-- used as key there, but the other can be used to verify that we do not inject
-- content collisions not knowingly.
create table content
(
sha1 sha1 primary key,
sha1_git sha1_git not null,
sha256 sha256 not null,
length bigint not null,
ctime timestamptz not null default now(),
-- creation time, i.e. time of (first) injection into the storage
status content_status not null default 'visible',
object_id bigserial
);
create unique index on content(sha1_git);
create unique index on content(sha256);
create index on content(ctime); -- TODO use a BRIN index here (postgres >= 9.5)
create index on content(object_id);
-- Asynchronous notification of new content insertions
create function notify_new_content()
returns trigger
language plpgsql
as $$
begin
perform pg_notify('new_content', encode(new.sha1, 'hex'));
return null;
end;
$$;
create trigger notify_new_content
after insert on content
for each row
execute procedure notify_new_content();
-- Entities constitute a typed hierarchy of organization, hosting
-- facilities, groups, people and software projects.
--
-- Examples of entities: Software Heritage, Debian, GNU, GitHub,
-- Apache, The Linux Foundation, the Debian Python Modules Team, the
-- torvalds GitHub user, the torvalds/linux GitHub project.
--
-- The data model is hierarchical (via the parent attribute) and might
-- store sub-branches of existing entities. The key feature of an
-- entity is might be *listed* (if it is available in listable_entity)
-- to retrieve information about its content, i.e: sub-entities,
-- projects, origins.
-- The history of entities. Allows us to keep historical metadata
-- about entities. The temporal invariant is the uuid. Root
-- organization uuids are manually generated (and available in
-- swh-data.sql).
--
-- For generated entities (generated = true), we can provide
-- generation_metadata to allow listers to retrieve the uuids of previous
-- iterations of the entity.
--
-- Inactive entities that have been active in the past (active =
-- false) should register the timestamp at which we saw them
-- deactivate, in a new entry of entity_history.
create table entity_history
(
id bigserial primary key,
uuid uuid,
parent uuid, -- should reference entity_history(uuid)
name text not null,
type entity_type not null,
description text,
homepage text,
active boolean not null, -- whether the entity was seen on the last listing
generated boolean not null, -- whether this entity has been generated by a lister
lister_metadata jsonb, -- lister-specific metadata, used for queries
metadata jsonb,
validity timestamptz[] -- timestamps at which we have seen this entity
);
create index on entity_history(uuid);
create index on entity_history(name);
-- The entity table provides a view of the latest information on a
-- given entity. It is updated via a trigger on entity_history.
create table entity
(
uuid uuid primary key,
parent uuid references entity(uuid) deferrable initially deferred,
name text not null,
type entity_type not null,
description text,
homepage text,
active boolean not null, -- whether the entity was seen on the last listing
generated boolean not null, -- whether this entity has been generated by a lister
lister_metadata jsonb, -- lister-specific metadata, used for queries
metadata jsonb,
last_seen timestamptz, -- last listing time or disappearance time for active=false
last_id bigint references entity_history(id) -- last listing id
);
create index on entity(name);
create index on entity using gin(lister_metadata jsonb_path_ops);
-- Register the equivalence between two entities. Allows sideways
-- navigation in the entity table
create table entity_equivalence
(
entity1 uuid references entity(uuid),
entity2 uuid references entity(uuid),
primary key (entity1, entity2),
constraint order_entities check (entity1 < entity2)
);
-- Register a lister for a specific entity.
create table listable_entity
(
uuid uuid references entity(uuid) primary key,
enabled boolean not null default true, -- do we list this entity automatically?
list_engine text, -- crawler to be used to list entity's content
list_url text, -- root URL to start the listing
list_params jsonb, -- org-specific listing parameter
latest_list timestamptz -- last time the entity's content has been listed
);
-- Log of all entity listings (i.e., entity crawling) that have been
-- done in the past, or are still ongoing.
create table list_history
(
id bigserial primary key,
entity uuid references listable_entity(uuid),
date timestamptz not null,
status boolean, -- true if and only if the listing has been successful
result jsonb, -- more detailed return value, depending on status
stdout text,
stderr text,
duration interval -- fetch duration of NULL if still ongoing
);
-- An origin is a place, identified by an URL, where software can be found. We
-- support different kinds of origins, e.g., git and other VCS repositories,
-- web pages that list tarballs URLs (e.g., http://www.kernel.org), indirect
-- tarball URLs (e.g., http://www.example.org/latest.tar.gz), etc. The key
-- feature of an origin is that it can be *fetched* (wget, git clone, svn
-- checkout, etc.) to retrieve all the contained software.
create table origin
(
id bigserial primary key,
type text, -- TODO use an enum here (?)
url text not null,
lister uuid references listable_entity(uuid),
project uuid references entity(uuid)
);
create index on origin(type, url);
-- Asynchronous notification of new origin insertions
create function notify_new_origin()
returns trigger
language plpgsql
as $$
begin
perform pg_notify('new_origin', new.id::text);
return null;
end;
$$;
create trigger notify_new_origin
after insert on origin
for each row
execute procedure notify_new_origin();
-- Content we have seen but skipped for some reason. This table is
-- separate from the content table as we might not have the sha1
-- checksum of that data (for instance when we inject git
-- repositories, objects that are too big will be skipped here, and we
-- will only know their sha1_git). 'reason' contains the reason the
-- content was skipped. origin is a nullable column allowing to find
-- out which origin contains that skipped content.
create table skipped_content
(
sha1 sha1,
sha1_git sha1_git,
sha256 sha256,
length bigint not null,
ctime timestamptz not null default now(),
status content_status not null default 'absent',
reason text not null,
origin bigint references origin(id),
object_id bigserial,
unique (sha1, sha1_git, sha256)
);
-- Those indexes support multiple NULL values.
create unique index on skipped_content(sha1);
create unique index on skipped_content(sha1_git);
create unique index on skipped_content(sha256);
create index on skipped_content(object_id);
-- Asynchronous notification of new skipped content insertions
create function notify_new_skipped_content()
returns trigger
language plpgsql
as $$
begin
perform pg_notify('new_skipped_content', json_build_object(
'sha1', encode(new.sha1, 'hex'),
'sha1_git', encode(new.sha1_git, 'hex'),
'sha256', encode(new.sha256, 'hex')
)::text);
return null;
end;
$$;
create trigger notify_new_skipped_content
after insert on skipped_content
for each row
execute procedure notify_new_skipped_content();
-- Log of all origin fetches (i.e., origin crawling) that have been done in the
-- past, or are still ongoing. Similar to list_history, but for origins.
create table fetch_history
(
id bigserial primary key,
origin bigint references origin(id),
date timestamptz not null,
status boolean, -- true if and only if the fetch has been successful
result jsonb, -- more detailed returned values, times, etc...
stdout text,
stderr text, -- null when status is true, filled otherwise
duration interval -- fetch duration of NULL if still ongoing
);
-- A file-system directory. A directory is a list of directory entries (see
-- tables: directory_entry_{dir,file}).
--
-- To list the contents of a directory:
-- 1. list the contained directory_entry_dir using array dir_entries
-- 2. list the contained directory_entry_file using array file_entries
-- 3. list the contained directory_entry_rev using array rev_entries
-- 4. UNION
--
-- Synonyms/mappings:
-- * git: tree
create table directory
(
id sha1_git primary key,
dir_entries bigint[], -- sub-directories, reference directory_entry_dir
file_entries bigint[], -- contained files, reference directory_entry_file
rev_entries bigint[], -- mounted revisions, reference directory_entry_rev
object_id bigserial -- short object identifier
);
create index on directory using gin (dir_entries);
create index on directory using gin (file_entries);
create index on directory using gin (rev_entries);
create index on directory(object_id);
-- Asynchronous notification of new directory insertions
create function notify_new_directory()
returns trigger
language plpgsql
as $$
begin
perform pg_notify('new_directory', encode(new.id, 'hex'));
return null;
end;
$$;
create trigger notify_new_directory
after insert on directory
for each row
execute procedure notify_new_directory();
-- A directory entry pointing to a sub-directory.
create table directory_entry_dir
(
id bigserial primary key,
target sha1_git, -- id of target directory
name unix_path, -- path name, relative to containing dir
perms file_perms -- unix-like permissions
);
create unique index on directory_entry_dir(target, name, perms);
-- A directory entry pointing to a file.
create table directory_entry_file
(
id bigserial primary key,
target sha1_git, -- id of target file
name unix_path, -- path name, relative to containing dir
perms file_perms -- unix-like permissions
);
create unique index on directory_entry_file(target, name, perms);
-- A directory entry pointing to a revision.
create table directory_entry_rev
(
id bigserial primary key,
target sha1_git, -- id of target revision
name unix_path, -- path name, relative to containing dir
perms file_perms -- unix-like permissions
);
create unique index on directory_entry_rev(target, name, perms);
create table person
(
id bigserial primary key,
fullname bytea not null, -- freeform specification; what is actually used in the checksums
-- will usually be of the form 'name <email>'
name bytea, -- advisory: not null if we managed to parse a name
email bytea -- advisory: not null if we managed to parse an email
);
create unique index on person(fullname);
create index on person(name);
create index on person(email);
-- A snapshot of a software project at a specific point in time.
--
-- Synonyms/mappings:
-- * git / subversion / etc: commit
-- * tarball: a specific tarball
--
-- Revisions are organized as DAGs. Each revision points to 0, 1, or more (in
-- case of merges) parent revisions. Each revision points to a directory, i.e.,
-- a file-system tree containing files and directories.
create table revision
(
id sha1_git primary key,
date timestamptz,
date_offset smallint,
date_neg_utc_offset boolean,
committer_date timestamptz,
committer_date_offset smallint,
committer_date_neg_utc_offset boolean,
type revision_type not null,
directory sha1_git, -- file-system tree
message bytea,
author bigint references person(id),
committer bigint references person(id),
metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...)
synthetic boolean not null default false, -- true if synthetic (cf. swh-loader-tar)
object_id bigserial
);
create index on revision(directory);
create index on revision(object_id);
-- Asynchronous notification of new revision insertions
create function notify_new_revision()
returns trigger
language plpgsql
as $$
begin
perform pg_notify('new_revision', encode(new.id, 'hex'));
return null;
end;
$$;
create trigger notify_new_revision
after insert on revision
for each row
execute procedure notify_new_revision();
-- either this table or the sha1_git[] column on the revision table
create table revision_history
(
id sha1_git references revision(id),
parent_id sha1_git,
parent_rank int not null default 0,
-- parent position in merge commits, 0-based
primary key (id, parent_rank)
);
create index on revision_history(parent_id);
-- The timestamps at which Software Heritage has made a visit of the given origin.
create table origin_visit
(
origin bigint not null references origin(id),
visit bigint not null,
date timestamptz not null,
status origin_visit_status not null,
metadata jsonb,
primary key (origin, visit)
);
comment on column origin_visit.origin is 'Visited origin';
comment on column origin_visit.visit is 'Visit number the visit occurred for that origin';
comment on column origin_visit.date is 'Visit date for that origin';
comment on column origin_visit.status is 'Visit status for that origin';
comment on column origin_visit.metadata is 'Metadata associated with the visit';
create index on origin_visit(date);
-- Asynchronous notification of new origin visits
create function notify_new_origin_visit()
returns trigger
language plpgsql
as $$
begin
perform pg_notify('new_origin_visit', json_build_object(
'origin', new.origin,
'visit', new.visit
)::text);
return null;
end;
$$;
create trigger notify_new_origin_visit
after insert on origin_visit
for each row
execute procedure notify_new_origin_visit();
-- The content of software origins is indexed starting from top-level pointers
-- called "branches". Every time we fetch some origin we store in this table
-- where the branches pointed to at fetch time.
--
-- Synonyms/mappings:
-- * git: ref (in the "git update-ref" sense)
create table occurrence_history
(
origin bigint references origin(id) not null,
branch bytea not null, -- e.g., b"master" (for VCS), or b"sid" (for Debian)
target sha1_git not null, -- ref target, e.g., commit id
target_type object_type not null, -- ref target type
object_id bigserial not null, -- short object identifier
visits bigint[] not null, -- the visits where that occurrence was valid. References
-- origin_visit(visit), where o_h.origin = origin_visit.origin.
primary key (object_id)
);
create index on occurrence_history(target, target_type);
create index on occurrence_history(origin, branch);
create unique index on occurrence_history(origin, branch, target, target_type);
create index on occurrence_history(object_id);
-- Materialized view of occurrence_history, storing the *current* value of each
-- branch, as last seen by SWH.
create table occurrence
(
origin bigint references origin(id) not null,
branch bytea not null,
target sha1_git not null,
target_type object_type not null,
primary key(origin, branch)
);
-- A "memorable" point in the development history of a project.
--
-- Synonyms/mappings:
-- * git: tag (of the annotated kind, otherwise they are just references)
-- * tarball: the release version number
create table release
(
id sha1_git primary key,
target sha1_git,
target_type object_type,
date timestamptz,
date_offset smallint,
date_neg_utc_offset boolean,
name bytea,
comment bytea,
author bigint references person(id),
synthetic boolean not null default false, -- true if synthetic (cf. swh-loader-tar)
object_id bigserial
);
create index on release(target, target_type);
create index on release(object_id);
-- Asynchronous notification of new release insertions
create function notify_new_release()
returns trigger
language plpgsql
as $$
begin
perform pg_notify('new_release', encode(new.id, 'hex'));
return null;
end;
$$;
create trigger notify_new_release
after insert on release
for each row
execute procedure notify_new_release();
-- Content provenance information caches
-- https://forge.softwareheritage.org/T547
--
-- Those tables aren't expected to be exhaustive, and get filled on a case by
-- case basis: absence of data doesn't mean the data is not there
-- content <-> revision mapping cache
--
-- semantics: "we have seen the content with given id in the given path inside
-- the given revision"
create table cache_content_revision (
content sha1_git not null primary key references content(sha1_git),
blacklisted boolean default false,
revision_paths bytea[][]
);
create table cache_content_revision_processed (
revision sha1_git not null primary key references revision(id)
);
-- revision <-> origin_visit mapping cache
--
-- semantics: "we have seen the given revision in the given origin during the
-- given visit"
create table cache_revision_origin (
revision sha1_git not null references revision(id),
origin bigint not null,
visit bigint not null,
primary key (revision, origin, visit),
foreign key (origin, visit) references origin_visit (origin, visit)
);
create index on cache_revision_origin(revision);
-- Computing metadata on sha1's contents
-- Properties (mimetype, encoding, etc...)
create table content_mimetype (
id sha1 primary key references content(sha1) not null,
mimetype bytea not null,
encoding bytea not null
);
comment on table content_mimetype is 'Metadata associated to a raw content';
comment on column content_mimetype.mimetype is 'Raw content Mimetype';
comment on column content_mimetype.encoding is 'Raw content encoding';
-- Language metadata
create table content_language (
id sha1 primary key references content(sha1) not null,
lang languages not null
);
comment on table content_language is 'Language information on a raw content';
comment on column content_language.lang is 'Language information';
-- ctags information per content
create table content_ctags (
id sha1 references content(sha1) not null,
name text not null,
kind text not null,
line bigint not null,
lang ctags_languages not null
);
comment on table content_ctags is 'Ctags information on a raw content';
comment on column content_ctags.id is 'Content identifier';
comment on column content_ctags.name is 'Symbol name';
comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)';
comment on column content_ctags.line is 'Symbol line';
comment on column content_ctags.lang is 'Language information for that content';
create index on content_ctags(id);
create unique index on content_ctags(id, md5(name), kind, line, lang);
diff --git a/sql/upgrades/090.sql b/sql/upgrades/090.sql
new file mode 100644
index 00000000..0c0e2878
--- /dev/null
+++ b/sql/upgrades/090.sql
@@ -0,0 +1,14 @@
+-- SWH DB schema upgrade
+-- from_version: 89
+-- to_version: 90
+-- description: indexer: Add content_ctags
+
+insert into dbversion(version, release, description)
+ values(90, now(), 'Work In Progress');
+
+comment on type content_status is 'Content visibility';
+comment on type entity_type is 'Entity types';
+comment on type revision_type is 'Possible revision types';
+comment on type object_type is 'Data object types stored in data model';
+comment on type languages is 'Languages recognized by language indexer';
+comment on type ctags_languages is 'Languages recognized by ctags indexer';
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 1:01 PM (1 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3253269
Attached To
rDSTO Storage manager
Event Timeline
Log In to Comment