Page MenuHomeSoftware Heritage

schema.py
No OneTemporary

schema.py

# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# CQL statements declaring the schema: one function, one aggregate, three
# user-defined types, then the tables. Each statement uses either
# "OR REPLACE" or "IF NOT EXISTS". User-defined types are listed before the
# statements that reference them. Further statements are appended to this
# list by the loop over HASH_ALGORITHMS later in this module.
CREATE_TABLES_QUERIES = [
# State function used by the ascii_bins_count aggregate: a NULL input
# increments the tuple's first field; any other input increments that
# value's counter in the tuple's map.
"""
CREATE OR REPLACE FUNCTION ascii_bins_count_sfunc (
state tuple<int, map<ascii, int>>, -- (nb_none, map<target_type, nb>)
bin_name ascii
)
CALLED ON NULL INPUT
RETURNS tuple<int, map<ascii, int>>
LANGUAGE java AS
$$
if (bin_name == null) {
state.setInt(0, state.getInt(0) + 1);
}
else {
Map<String, Integer> counters = state.getMap(
1, String.class, Integer.class);
Integer nb = counters.get(bin_name);
if (nb == null) {
nb = 0;
}
counters.put(bin_name, nb + 1);
state.setMap(1, counters, String.class, Integer.class);
}
return state;
$$
;""",
# Aggregate over ascii values built on the state function above; its
# initial state is (0, {}).
"""
CREATE OR REPLACE AGGREGATE ascii_bins_count ( ascii )
SFUNC ascii_bins_count_sfunc
STYPE tuple<int, map<ascii, int>>
INITCOND (0, {})
;""",
# User-defined type: a seconds field plus a microseconds field.
"""
CREATE TYPE IF NOT EXISTS microtimestamp (
seconds bigint,
microseconds int
);""",
# User-defined type: a microtimestamp with an offset and a negative_utc flag.
"""
CREATE TYPE IF NOT EXISTS microtimestamp_with_timezone (
timestamp frozen<microtimestamp>,
offset smallint,
negative_utc boolean
);""",
# User-defined type: fullname, name and email, all stored as blobs.
"""
CREATE TYPE IF NOT EXISTS person (
fullname blob,
name blob,
email blob
);""",
# content: the primary key is the combination of all four hashes.
"""
CREATE TABLE IF NOT EXISTS content (
sha1 blob,
sha1_git blob,
sha256 blob,
blake2s256 blob,
length bigint,
ctime timestamp,
-- creation time, i.e. time of (first) injection into the storage
status ascii,
PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256))
);""",
# skipped_content: same columns and key as content, plus reason and origin.
"""
CREATE TABLE IF NOT EXISTS skipped_content (
sha1 blob,
sha1_git blob,
sha256 blob,
blake2s256 blob,
length bigint,
ctime timestamp,
-- creation time, i.e. time of (first) injection into the storage
status ascii,
reason text,
origin text,
PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256))
);""",
# revision: keyed by id; parents are stored separately in revision_parent.
"""
CREATE TABLE IF NOT EXISTS revision (
id blob PRIMARY KEY,
date microtimestamp_with_timezone,
committer_date microtimestamp_with_timezone,
type ascii,
directory blob, -- source code "root" directory
message blob,
author person,
committer person,
synthetic boolean,
-- true iff revision has been created by Software Heritage
metadata text,
-- extra metadata as JSON(tarball checksums, etc...)
extra_headers frozen<list <list<blob>> >
-- extra commit information as (tuple(key, value), ...)
);""",
# revision_parent: one row per (id, parent_rank) pair.
"""
CREATE TABLE IF NOT EXISTS revision_parent (
id blob,
parent_rank int,
-- parent position in merge commits, 0-based
parent_id blob,
PRIMARY KEY ((id), parent_rank)
);""",
# release: keyed by id.
"""
CREATE TABLE IF NOT EXISTS release
(
id blob PRIMARY KEY,
target_type ascii,
target blob,
date microtimestamp_with_timezone,
name blob,
message blob,
author person,
synthetic boolean,
-- true iff release has been created by Software Heritage
);""",
# directory: holds only the id; entries live in directory_entry.
"""
CREATE TABLE IF NOT EXISTS directory (
id blob PRIMARY KEY,
);""",
# directory_entry: one row per (directory_id, name) pair.
"""
CREATE TABLE IF NOT EXISTS directory_entry (
directory_id blob,
name blob, -- path name, relative to containing dir
target blob,
perms int, -- unix-like permissions
type ascii, -- target type
PRIMARY KEY ((directory_id), name)
);""",
# snapshot: holds only the id; branches live in snapshot_branch.
"""
CREATE TABLE IF NOT EXISTS snapshot (
id blob PRIMARY KEY,
);""",
# snapshot_branch: one row per (snapshot_id, name) pair.
"""
-- For a given snapshot_id, branches are sorted by their name,
-- allowing easy pagination.
CREATE TABLE IF NOT EXISTS snapshot_branch (
snapshot_id blob,
name blob,
target_type ascii,
target blob,
PRIMARY KEY ((snapshot_id), name)
);""",
# origin_visit: one row per (origin, visit) pair.
"""
CREATE TABLE IF NOT EXISTS origin_visit (
origin text,
visit bigint,
date timestamp,
type text,
PRIMARY KEY ((origin), visit)
);""",
# origin_visit_status: one row per (origin, visit, date) triple.
"""
CREATE TABLE IF NOT EXISTS origin_visit_status (
origin text,
visit bigint,
date timestamp,
type text,
status ascii,
metadata text,
snapshot blob,
PRIMARY KEY ((origin), visit, date)
);""",
# origin: keyed by sha1; also stores the url and the next_visit_id counter
# described in the CQL comment below.
"""
CREATE TABLE IF NOT EXISTS origin (
sha1 blob PRIMARY KEY,
url text,
next_visit_id int,
-- We need integer visit ids for compatibility with the pgsql
-- storage, so we're using lightweight transactions with this trick:
-- https://stackoverflow.com/a/29391877/539465
);""",
# metadata_authority: one row per (url, type) pair.
"""
CREATE TABLE IF NOT EXISTS metadata_authority (
url text,
type ascii,
PRIMARY KEY ((url), type)
);""",
# metadata_fetcher: one row per (name, version) pair.
"""
CREATE TABLE IF NOT EXISTS metadata_fetcher (
name ascii,
version ascii,
PRIMARY KEY ((name), version)
);""",
# raw_extrinsic_metadata: the rationale for its composite primary key is
# given in the CQL comment embedded in the statement itself.
"""
CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata (
id blob,
type text,
target text,
-- metadata source
authority_type text,
authority_url text,
discovery_date timestamp,
fetcher_name ascii,
fetcher_version ascii,
-- metadata itself
format ascii,
metadata blob,
-- context
origin text,
visit bigint,
snapshot text,
release text,
revision text,
path blob,
directory text,
PRIMARY KEY ((target), authority_type, authority_url, discovery_date, id)
-- An explanation is in order for this primary key:
--
-- Intuitively, the primary key should only be 'id', because two metadata
-- entries are the same iff the id is the same; and 'id' is used for
-- deduplication.
--
-- However, we also want to query by
-- (target, authority_type, authority_url, discovery_date)
-- The naive solution to this would be an extra table, to use as index;
-- but it means 1. extra code to keep them in sync 2. overhead when writing
-- 3. overhead + random reads (instead of linear) when reading.
--
-- Therefore, we use a single table for both, by adding the column
-- we want to query with before the id.
-- It solves both a) the query/order issues and b) the uniqueness issue because:
--
-- a) adding the id at the end of the primary key does not change the rows' order:
-- for two different rows, id1 != id2, so
-- (target1, ..., date1) < (target2, ..., date2)
-- <=> (target1, ..., date1, id1) < (target2, ..., date2, id2)
--
-- b) the id is a hash of all the columns, so:
-- rows are the same
-- <=> id1 == id2
-- <=> (target1, ..., date1, id1) == (target2, ..., date2, id2)
);""",
# object_count: a counter column per object_type, all under a single
# partition_key value (documented below as always 0).
"""
CREATE TABLE IF NOT EXISTS object_count (
partition_key smallint, -- Constant, must always be 0
object_type ascii,
count counter,
PRIMARY KEY ((partition_key), object_type)
);""",
# extid: keyed by (extid_type, extid), then target_type and target.
"""
CREATE TABLE IF NOT EXISTS extid (
extid_type ascii,
extid blob,
target_type ascii,
target blob,
PRIMARY KEY ((extid_type, extid), target_type, target)
);""",
# extid_by_target: keyed by (target_type, target), storing a token that
# refers back to the "primary" table.
"""
CREATE TABLE IF NOT EXISTS extid_by_target (
target_type ascii,
target blob,
target_token bigint, -- value of token(pk) on the "primary" table
PRIMARY KEY ((target_type, target), target_token)
);""",
]
# Template for the single-hash lookup tables. "{main_algo}" is the only
# placeholder and is filled in with a hash algorithm name via str.format.
# It declares content_by_<algo> and skipped_content_by_<algo>, each keyed
# by that hash followed by target_token.
# NOTE(review): the loop over HASH_ALGORITHMS splits the rendered text on
# "\n\n" to obtain separate statements -- confirm that a blank line
# separates the two CREATE TABLE statements in this literal.
CONTENT_INDEX_TEMPLATE = """
-- Secondary table, used for looking up "content" from a single hash
CREATE TABLE IF NOT EXISTS content_by_{main_algo} (
{main_algo} blob,
target_token bigint, -- value of token(pk) on the "primary" table
PRIMARY KEY (({main_algo}), target_token)
);
CREATE TABLE IF NOT EXISTS skipped_content_by_{main_algo} (
{main_algo} blob,
target_token bigint, -- value of token(pk) on the "primary" table
PRIMARY KEY (({main_algo}), target_token)
);
"""
# Names of the tables declared in CREATE_TABLES_QUERIES. This must stay a
# mutable list: the per-hash lookup table names are appended to it further
# down, once per entry of HASH_ALGORITHMS.
TABLES = [
    "skipped_content",
    "content",
    "revision",
    "revision_parent",
    "release",
    "directory",
    "directory_entry",
    "snapshot",
    "snapshot_branch",
    "origin_visit",
    "origin",
    "raw_extrinsic_metadata",
    "object_count",
    "origin_visit_status",
    "metadata_authority",
    "metadata_fetcher",
    "extid",
    "extid_by_target",
]
# Hash algorithms for which single-hash lookup tables are generated.
HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"]

# For every algorithm: render CONTENT_INDEX_TEMPLATE, add the resulting
# statements to CREATE_TABLES_QUERIES, and register both generated table
# names in TABLES.
for main_algo in HASH_ALGORITHMS:
    # The template's only placeholder is {main_algo}, so nothing else is
    # passed to format(). The rendered text is cut on blank lines, giving
    # one list entry per blank-line-separated chunk.
    CREATE_TABLES_QUERIES.extend(
        CONTENT_INDEX_TEMPLATE.format(main_algo=main_algo).split("\n\n")
    )
    TABLES.append("content_by_%s" % main_algo)
    TABLES.append("skipped_content_by_%s" % main_algo)

File Metadata

Mime Type
text/plain
Expires
Fri, Jul 4, 11:17 AM (3 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3419689

Event Timeline