# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information CREATE_TABLES_QUERIES = [ """ CREATE OR REPLACE FUNCTION ascii_bins_count_sfunc ( state tuple>, -- (nb_none, map) bin_name ascii ) CALLED ON NULL INPUT RETURNS tuple> LANGUAGE java AS $$ if (bin_name == null) { state.setInt(0, state.getInt(0) + 1); } else { Map counters = state.getMap( 1, String.class, Integer.class); Integer nb = counters.get(bin_name); if (nb == null) { nb = 0; } counters.put(bin_name, nb + 1); state.setMap(1, counters, String.class, Integer.class); } return state; $$ ;""", """ CREATE OR REPLACE AGGREGATE ascii_bins_count ( ascii ) SFUNC ascii_bins_count_sfunc STYPE tuple> INITCOND (0, {}) ;""", """ CREATE TYPE IF NOT EXISTS microtimestamp ( seconds bigint, microseconds int );""", """ CREATE TYPE IF NOT EXISTS microtimestamp_with_timezone ( timestamp frozen, offset smallint, negative_utc boolean );""", """ CREATE TYPE IF NOT EXISTS person ( fullname blob, name blob, email blob );""", """ CREATE TABLE IF NOT EXISTS content ( sha1 blob, sha1_git blob, sha256 blob, blake2s256 blob, length bigint, ctime timestamp, -- creation time, i.e. time of (first) injection into the storage status ascii, PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256)) );""", """ CREATE TABLE IF NOT EXISTS skipped_content ( sha1 blob, sha1_git blob, sha256 blob, blake2s256 blob, length bigint, ctime timestamp, -- creation time, i.e. time of (first) injection into the storage status ascii, reason text, origin text, PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256)) );""", """ CREATE TABLE IF NOT EXISTS revision ( id blob PRIMARY KEY, date microtimestamp_with_timezone, committer_date microtimestamp_with_timezone, type ascii, directory blob, -- source code "root" directory message blob, author person, committer person, synthetic boolean, -- true iff revision has been created by Software Heritage metadata text, -- extra metadata as JSON(tarball checksums, etc...) extra_headers frozen> > -- extra commit information as (tuple(key, value), ...) );""", """ CREATE TABLE IF NOT EXISTS revision_parent ( id blob, parent_rank int, -- parent position in merge commits, 0-based parent_id blob, PRIMARY KEY ((id), parent_rank) );""", """ CREATE TABLE IF NOT EXISTS release ( id blob PRIMARY KEY, target_type ascii, target blob, date microtimestamp_with_timezone, name blob, message blob, author person, synthetic boolean, -- true iff release has been created by Software Heritage );""", """ CREATE TABLE IF NOT EXISTS directory ( id blob PRIMARY KEY, );""", """ CREATE TABLE IF NOT EXISTS directory_entry ( directory_id blob, name blob, -- path name, relative to containing dir target blob, perms int, -- unix-like permissions type ascii, -- target type PRIMARY KEY ((directory_id), name) );""", """ CREATE TABLE IF NOT EXISTS snapshot ( id blob PRIMARY KEY, );""", """ -- For a given snapshot_id, branches are sorted by their name, -- allowing easy pagination. CREATE TABLE IF NOT EXISTS snapshot_branch ( snapshot_id blob, name blob, target_type ascii, target blob, PRIMARY KEY ((snapshot_id), name) );""", """ CREATE TABLE IF NOT EXISTS origin_visit ( origin text, visit bigint, date timestamp, type text, PRIMARY KEY ((origin), visit) );""", """ CREATE TABLE IF NOT EXISTS origin_visit_status ( origin text, visit bigint, date timestamp, type text, status ascii, metadata text, snapshot blob, PRIMARY KEY ((origin), visit, date) );""", """ CREATE TABLE IF NOT EXISTS origin ( sha1 blob PRIMARY KEY, url text, next_visit_id int, -- We need integer visit ids for compatibility with the pgsql -- storage, so we're using lightweight transactions with this trick: -- https://stackoverflow.com/a/29391877/539465 );""", """ CREATE TABLE IF NOT EXISTS metadata_authority ( url text, type ascii, metadata text, PRIMARY KEY ((url), type) );""", """ CREATE TABLE IF NOT EXISTS metadata_fetcher ( name ascii, version ascii, metadata text, PRIMARY KEY ((name), version) );""", """ CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata ( id blob, type text, target text, -- metadata source authority_type text, authority_url text, discovery_date timestamp, fetcher_name ascii, fetcher_version ascii, -- metadata itself format ascii, metadata blob, -- context origin text, visit bigint, snapshot text, release text, revision text, path blob, directory text, PRIMARY KEY ((target), authority_type, authority_url, discovery_date, id) -- An explanation is in order for this primary key: -- -- Intuitively, the primary key should only be 'id', because two metadata -- entries are the same iff the id is the same; and 'id' is used for -- deduplication. -- -- However, we also want to query by -- (target, authority_type, authority_url, discovery_date) -- The naive solution to this would be an extra table, to use as index; -- but it means 1. extra code to keep them in sync 2. overhead when writing -- 3. overhead + random reads (instead of linear) when reading. -- -- Therefore, we use a single table for both, by adding the column -- we want to query with before the id. -- It solves both a) the query/order issues and b) the uniqueness issue because: -- -- a) adding the id at the end of the primary key does not change the rows' order: -- for two different rows, id1 != id2, so -- (target1, ..., date1) < (target2, ..., date2) -- <=> (target1, ..., date1, id1) < (target2, ..., date2, id2) -- -- b) the id is a hash of all the columns, so: -- rows are the same -- <=> id1 == id2 -- <=> (target1, ..., date1, id1) == (target2, ..., date2, id2) );""", """ CREATE TABLE IF NOT EXISTS object_count ( partition_key smallint, -- Constant, must always be 0 object_type ascii, count counter, PRIMARY KEY ((partition_key), object_type) );""", """ CREATE TABLE IF NOT EXISTS extid ( extid_type ascii, extid blob, target_type ascii, target blob, PRIMARY KEY ((extid_type, extid), target_type, target) );""", """ CREATE TABLE IF NOT EXISTS extid_by_target ( target_type ascii, target blob, target_token bigint, -- value of token(pk) on the "primary" table PRIMARY KEY ((target_type, target), target_token) );""", ] CONTENT_INDEX_TEMPLATE = """ -- Secondary table, used for looking up "content" from a single hash CREATE TABLE IF NOT EXISTS content_by_{main_algo} ( {main_algo} blob, target_token bigint, -- value of token(pk) on the "primary" table PRIMARY KEY (({main_algo}), target_token) ); CREATE TABLE IF NOT EXISTS skipped_content_by_{main_algo} ( {main_algo} blob, target_token bigint, -- value of token(pk) on the "primary" table PRIMARY KEY (({main_algo}), target_token) ); """ TABLES = [ "skipped_content", "content", "revision", "revision_parent", "release", "directory", "directory_entry", "snapshot", "snapshot_branch", "origin_visit", "origin", "raw_extrinsic_metadata", "object_count", "origin_visit_status", "metadata_authority", "metadata_fetcher", "extid", "extid_by_target", ] HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"] for main_algo in HASH_ALGORITHMS: CREATE_TABLES_QUERIES.extend( CONTENT_INDEX_TEMPLATE.format( main_algo=main_algo, other_algos=", ".join( [algo for algo in HASH_ALGORITHMS if algo != main_algo] ), ).split("\n\n") ) TABLES.append("content_by_%s" % main_algo) TABLES.append("skipped_content_by_%s" % main_algo)