Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/cassandra/schema.py
Show All 24 Lines | else { | ||||
} | } | ||||
counters.put(bin_name, nb + 1); | counters.put(bin_name, nb + 1); | ||||
state.setMap(1, counters, String.class, Integer.class); | state.setMap(1, counters, String.class, Integer.class); | ||||
} | } | ||||
return state; | return state; | ||||
$$ | $$ | ||||
; | ; | ||||
CREATE OR REPLACE AGGREGATE ascii_bins_count ( ascii ) | CREATE OR REPLACE AGGREGATE ascii_bins_count ( ascii ) | ||||
SFUNC ascii_bins_count_sfunc | SFUNC ascii_bins_count_sfunc | ||||
STYPE tuple<int, map<ascii, int>> | STYPE tuple<int, map<ascii, int>> | ||||
INITCOND (0, {}) | INITCOND (0, {}) | ||||
; | ; | ||||
CREATE TYPE IF NOT EXISTS microtimestamp ( | CREATE TYPE IF NOT EXISTS microtimestamp ( | ||||
seconds bigint, | seconds bigint, | ||||
microseconds int | microseconds int | ||||
); | ); | ||||
CREATE TYPE IF NOT EXISTS microtimestamp_with_timezone ( | CREATE TYPE IF NOT EXISTS microtimestamp_with_timezone ( | ||||
timestamp frozen<microtimestamp>, | timestamp frozen<microtimestamp>, | ||||
offset smallint, | offset smallint, | ||||
negative_utc boolean | negative_utc boolean | ||||
); | ); | ||||
CREATE TYPE IF NOT EXISTS person ( | CREATE TYPE IF NOT EXISTS person ( | ||||
fullname blob, | fullname blob, | ||||
name blob, | name blob, | ||||
email blob | email blob | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS content ( | CREATE TABLE IF NOT EXISTS content ( | ||||
sha1 blob, | sha1 blob, | ||||
sha1_git blob, | sha1_git blob, | ||||
sha256 blob, | sha256 blob, | ||||
blake2s256 blob, | blake2s256 blob, | ||||
length bigint, | length bigint, | ||||
ctime timestamp, | ctime timestamp, | ||||
-- creation time, i.e. time of (first) injection into the storage | -- creation time, i.e. time of (first) injection into the storage | ||||
status ascii, | status ascii, | ||||
PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256)) | PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256)) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS skipped_content ( | CREATE TABLE IF NOT EXISTS skipped_content ( | ||||
sha1 blob, | sha1 blob, | ||||
sha1_git blob, | sha1_git blob, | ||||
sha256 blob, | sha256 blob, | ||||
blake2s256 blob, | blake2s256 blob, | ||||
length bigint, | length bigint, | ||||
ctime timestamp, | ctime timestamp, | ||||
-- creation time, i.e. time of (first) injection into the storage | -- creation time, i.e. time of (first) injection into the storage | ||||
status ascii, | status ascii, | ||||
reason text, | reason text, | ||||
origin text, | origin text, | ||||
PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256)) | PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256)) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS revision ( | CREATE TABLE IF NOT EXISTS revision ( | ||||
id blob PRIMARY KEY, | id blob PRIMARY KEY, | ||||
date microtimestamp_with_timezone, | date microtimestamp_with_timezone, | ||||
committer_date microtimestamp_with_timezone, | committer_date microtimestamp_with_timezone, | ||||
type ascii, | type ascii, | ||||
directory blob, -- source code "root" directory | directory blob, -- source code "root" directory | ||||
message blob, | message blob, | ||||
author person, | author person, | ||||
committer person, | committer person, | ||||
synthetic boolean, | synthetic boolean, | ||||
-- true iff revision has been created by Software Heritage | -- true iff revision has been created by Software Heritage | ||||
metadata text | metadata text | ||||
-- extra metadata as JSON (tarball checksums, | -- extra metadata as JSON (tarball checksums, | ||||
-- extra commit information, etc...) | -- extra commit information, etc...) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS revision_parent ( | CREATE TABLE IF NOT EXISTS revision_parent ( | ||||
id blob, | id blob, | ||||
parent_rank int, | parent_rank int, | ||||
-- parent position in merge commits, 0-based | -- parent position in merge commits, 0-based | ||||
parent_id blob, | parent_id blob, | ||||
PRIMARY KEY ((id), parent_rank) | PRIMARY KEY ((id), parent_rank) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS release | CREATE TABLE IF NOT EXISTS release | ||||
( | ( | ||||
id blob PRIMARY KEY, | id blob PRIMARY KEY, | ||||
target_type ascii, | target_type ascii, | ||||
target blob, | target blob, | ||||
date microtimestamp_with_timezone, | date microtimestamp_with_timezone, | ||||
name blob, | name blob, | ||||
message blob, | message blob, | ||||
author person, | author person, | ||||
synthetic boolean, | synthetic boolean, | ||||
-- true iff release has been created by Software Heritage | -- true iff release has been created by Software Heritage | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS directory ( | CREATE TABLE IF NOT EXISTS directory ( | ||||
id blob PRIMARY KEY, | id blob PRIMARY KEY, | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS directory_entry ( | CREATE TABLE IF NOT EXISTS directory_entry ( | ||||
directory_id blob, | directory_id blob, | ||||
name blob, -- path name, relative to containing dir | name blob, -- path name, relative to containing dir | ||||
target blob, | target blob, | ||||
perms int, -- unix-like permissions | perms int, -- unix-like permissions | ||||
type ascii, -- target type | type ascii, -- target type | ||||
PRIMARY KEY ((directory_id), name) | PRIMARY KEY ((directory_id), name) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS snapshot ( | CREATE TABLE IF NOT EXISTS snapshot ( | ||||
id blob PRIMARY KEY, | id blob PRIMARY KEY, | ||||
); | ); | ||||
-- For a given snapshot_id, branches are sorted by their name, | -- For a given snapshot_id, branches are sorted by their name, | ||||
-- allowing easy pagination. | -- allowing easy pagination. | ||||
CREATE TABLE IF NOT EXISTS snapshot_branch ( | CREATE TABLE IF NOT EXISTS snapshot_branch ( | ||||
snapshot_id blob, | snapshot_id blob, | ||||
name blob, | name blob, | ||||
target_type ascii, | target_type ascii, | ||||
target blob, | target blob, | ||||
PRIMARY KEY ((snapshot_id), name) | PRIMARY KEY ((snapshot_id), name) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS origin_visit ( | CREATE TABLE IF NOT EXISTS origin_visit ( | ||||
origin text, | origin text, | ||||
visit bigint, | visit bigint, | ||||
date timestamp, | date timestamp, | ||||
type text, | type text, | ||||
PRIMARY KEY ((origin), visit) | PRIMARY KEY ((origin), visit) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS origin_visit_status ( | CREATE TABLE IF NOT EXISTS origin_visit_status ( | ||||
origin text, | origin text, | ||||
visit bigint, | visit bigint, | ||||
date timestamp, | date timestamp, | ||||
status ascii, | status ascii, | ||||
metadata text, | metadata text, | ||||
snapshot blob, | snapshot blob, | ||||
PRIMARY KEY ((origin), visit, date) | PRIMARY KEY ((origin), visit, date) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS origin ( | CREATE TABLE IF NOT EXISTS origin ( | ||||
sha1 blob PRIMARY KEY, | sha1 blob PRIMARY KEY, | ||||
url text, | url text, | ||||
type text, | type text, | ||||
next_visit_id int, | next_visit_id int, | ||||
-- We need integer visit ids for compatibility with the pgsql | -- We need integer visit ids for compatibility with the pgsql | ||||
-- storage, so we're using lightweight transactions with this trick: | -- storage, so we're using lightweight transactions with this trick: | ||||
-- https://stackoverflow.com/a/29391877/539465 | -- https://stackoverflow.com/a/29391877/539465 | ||||
Show All 11 Lines | |||||
CREATE TABLE IF NOT EXISTS metadata_fetcher ( | CREATE TABLE IF NOT EXISTS metadata_fetcher ( | ||||
name ascii, | name ascii, | ||||
version ascii, | version ascii, | ||||
metadata text, | metadata text, | ||||
PRIMARY KEY ((name), version) | PRIMARY KEY ((name), version) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS origin_metadata ( | CREATE TABLE IF NOT EXISTS object_metadata ( | ||||
origin text, | type text, | ||||
id text, | |||||
-- metadata source | |||||
authority_type text, | authority_type text, | ||||
authority_url text, | authority_url text, | ||||
discovery_date timestamp, | discovery_date timestamp, | ||||
fetcher_name ascii, | fetcher_name ascii, | ||||
fetcher_version ascii, | fetcher_version ascii, | ||||
-- metadata itself | |||||
format ascii, | format ascii, | ||||
metadata blob, | metadata blob, | ||||
PRIMARY KEY ((origin), authority_type, authority_url, discovery_date, | |||||
fetcher_name, fetcher_version), | -- context | ||||
-- for now, authority_url could be in the partition key; but leaving it | origin text, | ||||
-- in the clustering key allows listing authorities with metadata on an | visit bigint, | ||||
-- origin if we ever need to do it. | snapshot text, | ||||
release text, | |||||
revision text, | |||||
path blob, | |||||
directory text, | |||||
PRIMARY KEY ((id), authority_type, authority_url, discovery_date, | |||||
fetcher_name, fetcher_version) | |||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS object_count ( | CREATE TABLE IF NOT EXISTS object_count ( | ||||
partition_key smallint, -- Constant, must always be 0 | partition_key smallint, -- Constant, must always be 0 | ||||
object_type ascii, | object_type ascii, | ||||
count counter, | count counter, | ||||
PRIMARY KEY ((partition_key), object_type) | PRIMARY KEY ((partition_key), object_type) | ||||
); | ); | ||||
""".split( | """.split( | ||||
"\n\n" | "\n\n\n" | ||||
) | ) | ||||
CONTENT_INDEX_TEMPLATE = """ | CONTENT_INDEX_TEMPLATE = """ | ||||
-- Secondary table, used for looking up "content" from a single hash | -- Secondary table, used for looking up "content" from a single hash | ||||
CREATE TABLE IF NOT EXISTS content_by_{main_algo} ( | CREATE TABLE IF NOT EXISTS content_by_{main_algo} ( | ||||
{main_algo} blob, | {main_algo} blob, | ||||
target_token bigint, -- value of token(pk) on the "primary" table | target_token bigint, -- value of token(pk) on the "primary" table | ||||
PRIMARY KEY (({main_algo}), target_token) | PRIMARY KEY (({main_algo}), target_token) | ||||
); | ); | ||||
CREATE TABLE IF NOT EXISTS skipped_content_by_{main_algo} ( | CREATE TABLE IF NOT EXISTS skipped_content_by_{main_algo} ( | ||||
{main_algo} blob, | {main_algo} blob, | ||||
target_token bigint, -- value of token(pk) on the "primary" table | target_token bigint, -- value of token(pk) on the "primary" table | ||||
PRIMARY KEY (({main_algo}), target_token) | PRIMARY KEY (({main_algo}), target_token) | ||||
); | ); | ||||
""" | """ | ||||
TABLES = ( | TABLES = ( | ||||
"skipped_content content revision revision_parent release " | "skipped_content content revision revision_parent release " | ||||
"directory directory_entry snapshot snapshot_branch " | "directory directory_entry snapshot snapshot_branch " | ||||
"origin_visit origin origin_metadata object_count " | "origin_visit origin object_metadata object_count " | ||||
"origin_visit_status metadata_authority " | "origin_visit_status metadata_authority " | ||||
"metadata_fetcher" | "metadata_fetcher" | ||||
).split() | ).split() | ||||
HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"] | HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"] | ||||
for main_algo in HASH_ALGORITHMS: | for main_algo in HASH_ALGORITHMS: | ||||
CREATE_TABLES_QUERIES.extend( | CREATE_TABLES_QUERIES.extend( | ||||
Show All 10 Lines |