Changeset View
Changeset View
Standalone View
Standalone View
sql/swh-schema.sql
---
--- Software Heritage Data Model
---

-- drop schema if exists swh cascade;
-- create schema swh;
-- set search_path to swh;
-- Schema versioning: one row per applied schema version; the current
-- version is the row with the highest 'version'.
create table dbversion
(
  version     int primary key,  -- monotonically increasing schema version
  release     timestamptz,      -- timestamp at which this version was applied
  description text              -- human-readable summary of the version
);

-- seed the current schema version (104 per this changeset, bumped from 103)
insert into dbversion(version, release, description)
      values(104, now(), 'Work In Progress');
-- a SHA1 checksum (not necessarily originating from Git)
create domain sha1 as bytea check (length(value) = 20);

-- a Git object ID, i.e., a SHA1 checksum
create domain sha1_git as bytea check (length(value) = 20);

-- a SHA256 checksum
create domain sha256 as bytea check (length(value) = 32);

-- a blake2 checksum (blake2s-256: 32 bytes, same width as sha256)
create domain blake2s256 as bytea check (length(value) = 32);

-- UNIX path (absolute, relative, individual path component, etc.)
create domain unix_path as bytea;

-- a set of UNIX-like access permissions, as manipulated by, e.g., chmod
create domain file_perms as int;
-- Checksums about actual file content. Note that the content itself is not
-- stored in the DB, but on external (key-value) storage. A single checksum is
-- used as key there, but the others can be used to verify that we do not
-- unknowingly inject content collisions.
create table content
(
  sha1       sha1 not null,
  sha1_git   sha1_git not null,
  sha256     sha256 not null,
  -- nullable for now: per review, existing rows still need the blake2s256
  -- computation run on them before a NOT NULL constraint can be added
  blake2s256 blake2s256,
  length     bigint not null,
  -- creation time, i.e. time of (first) injection into the storage
  ctime      timestamptz not null default now(),
  status     content_status not null default 'visible',
  object_id  bigserial
);
-- Entities constitute a typed hierarchy of organization, hosting
-- facilities, groups, people and software projects.
--
-- Examples of entities: Software Heritage, Debian, GNU, GitHub,
-- Apache, The Linux Foundation, the Debian Python Modules Team, the
▲ Show 20 Lines • Show All 104 Lines • ▼ Show 20 Lines | |||||
-- Content we chose not to archive. Kept separate from the content table
-- as we might not have the sha1 checksum of that data (for instance when
-- we inject git repositories, objects that are too big will be skipped
-- here, and we will only know their sha1_git). 'reason' contains the
-- reason the content was skipped. origin is a nullable column allowing
-- to find out which origin contains that skipped content.
create table skipped_content
(
  sha1       sha1,        -- nullable: may be unknown for skipped objects
  sha1_git   sha1_git,
  sha256     sha256,
  -- nullable for the same reason as in 'content': existing rows still
  -- need their blake2s256 computed before tightening the constraint
  blake2s256 blake2s256,
  length     bigint not null,
  ctime      timestamptz not null default now(),
  status     content_status not null default 'absent',
  reason     text not null,  -- why this content was skipped
  origin     bigint,         -- nullable reference to the origin holding it
  object_id  bigserial
);
-- Log of all origin fetches (i.e., origin crawling) that have been done in the | -- Log of all origin fetches (i.e., origin crawling) that have been done in the | ||||
-- past, or are still ongoing. Similar to list_history, but for origins. | -- past, or are still ongoing. Similar to list_history, but for origins. | ||||
create table fetch_history | create table fetch_history | ||||
( | ( | ||||
id bigserial, | id bigserial, | ||||
origin bigint, | origin bigint, | ||||
▲ Show 20 Lines • Show All 276 Lines • Show Last 20 Lines |
It's not null here for now since we need to run the computations on that column for the existing hash.