schema.py
No OneTemporary
Actions

Size

9 KB

Subscribers

None

schema.py
View Options

	# Copyright (C) 2019-2021 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information


	CREATE_TABLES_QUERIES = [
	"""
	CREATE OR REPLACE FUNCTION ascii_bins_count_sfunc (
	state tuple<int, map<ascii, int>>, -- (nb_none, map<target_type, nb>)
	bin_name ascii
	)
	CALLED ON NULL INPUT
	RETURNS tuple<int, map<ascii, int>>
	LANGUAGE java AS
	$$
	if (bin_name == null) {
	state.setInt(0, state.getInt(0) + 1);
	}
	else {
	Map<String, Integer> counters = state.getMap(
	1, String.class, Integer.class);
	Integer nb = counters.get(bin_name);
	if (nb == null) {
	nb = 0;
	}
	counters.put(bin_name, nb + 1);
	state.setMap(1, counters, String.class, Integer.class);
	}
	return state;
	$$
	;""",
	"""
	CREATE OR REPLACE AGGREGATE ascii_bins_count ( ascii )
	SFUNC ascii_bins_count_sfunc
	STYPE tuple<int, map<ascii, int>>
	INITCOND (0, {})
	;""",
	"""
	CREATE TYPE IF NOT EXISTS microtimestamp (
	seconds bigint,
	microseconds int
	);""",
	"""
	CREATE TYPE IF NOT EXISTS microtimestamp_with_timezone (
	timestamp frozen<microtimestamp>,
	offset smallint,
	negative_utc boolean
	);""",
	"""
	CREATE TYPE IF NOT EXISTS person (
	fullname blob,
	name blob,
	email blob
	);""",
	"""
	CREATE TABLE IF NOT EXISTS content (
	sha1 blob,
	sha1_git blob,
	sha256 blob,
	blake2s256 blob,
	length bigint,
	ctime timestamp,
	-- creation time, i.e. time of (first) injection into the storage
	status ascii,
	PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256))
	);""",
	"""
	CREATE TABLE IF NOT EXISTS skipped_content (
	sha1 blob,
	sha1_git blob,
	sha256 blob,
	blake2s256 blob,
	length bigint,
	ctime timestamp,
	-- creation time, i.e. time of (first) injection into the storage
	status ascii,
	reason text,
	origin text,
	PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256))
	);""",
	"""
	CREATE TABLE IF NOT EXISTS revision (
	id blob PRIMARY KEY,
	date microtimestamp_with_timezone,
	committer_date microtimestamp_with_timezone,
	type ascii,
	directory blob, -- source code "root" directory
	message blob,
	author person,
	committer person,
	synthetic boolean,
	-- true iff revision has been created by Software Heritage
	metadata text,
	-- extra metadata as JSON(tarball checksums, etc...)
	extra_headers frozen<list <list<blob>> >
	-- extra commit information as (tuple(key, value), ...)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS revision_parent (
	id blob,
	parent_rank int,
	-- parent position in merge commits, 0-based
	parent_id blob,
	PRIMARY KEY ((id), parent_rank)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS release
	(
	id blob PRIMARY KEY,
	target_type ascii,
	target blob,
	date microtimestamp_with_timezone,
	name blob,
	message blob,
	author person,
	synthetic boolean,
	-- true iff release has been created by Software Heritage
	);""",
	"""
	CREATE TABLE IF NOT EXISTS directory (
	id blob PRIMARY KEY,
	);""",
	"""
	CREATE TABLE IF NOT EXISTS directory_entry (
	directory_id blob,
	name blob, -- path name, relative to containing dir
	target blob,
	perms int, -- unix-like permissions
	type ascii, -- target type
	PRIMARY KEY ((directory_id), name)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS snapshot (
	id blob PRIMARY KEY,
	);""",
	"""
	-- For a given snapshot_id, branches are sorted by their name,
	-- allowing easy pagination.
	CREATE TABLE IF NOT EXISTS snapshot_branch (
	snapshot_id blob,
	name blob,
	target_type ascii,
	target blob,
	PRIMARY KEY ((snapshot_id), name)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS origin_visit (
	origin text,
	visit bigint,
	date timestamp,
	type text,
	PRIMARY KEY ((origin), visit)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS origin_visit_status (
	origin text,
	visit bigint,
	date timestamp,
	type text,
	status ascii,
	metadata text,
	snapshot blob,
	PRIMARY KEY ((origin), visit, date)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS origin (
	sha1 blob PRIMARY KEY,
	url text,
	next_visit_id int,
	-- We need integer visit ids for compatibility with the pgsql
	-- storage, so we're using lightweight transactions with this trick:
	-- https://stackoverflow.com/a/29391877/539465
	);""",
	"""
	CREATE TABLE IF NOT EXISTS metadata_authority (
	url text,
	type ascii,
	metadata text,
	PRIMARY KEY ((url), type)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS metadata_fetcher (
	name ascii,
	version ascii,
	metadata text,
	PRIMARY KEY ((name), version)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata (
	id blob,

	type text,
	target text,

	-- metadata source
	authority_type text,
	authority_url text,
	discovery_date timestamp,
	fetcher_name ascii,
	fetcher_version ascii,

	-- metadata itself
	format ascii,
	metadata blob,

	-- context
	origin text,
	visit bigint,
	snapshot text,
	release text,
	revision text,
	path blob,
	directory text,

	PRIMARY KEY ((target), authority_type, authority_url, discovery_date, id)

	-- An explanation is in order for this primary key:
	--
	-- Intuitively, the primary key should only be 'id', because two metadata
	-- entries are the same iff the id is the same; and 'id' is used for
	-- deduplication.
	--
	-- However, we also want to query by
	-- (target, authority_type, authority_url, discovery_date)
	-- The naive solution to this would be an extra table, to use as index;
	-- but it means 1. extra code to keep them in sync 2. overhead when writing
	-- 3. overhead + random reads (instead of linear) when reading.
	--
	-- Therefore, we use a single table for both, by adding the column
	-- we want to query with before the id.
	-- It solves both a) the query/order issues and b) the uniqueness issue because:
	--
	-- a) adding the id at the end of the primary key does not change the rows' order:
	-- for two different rows, id1 != id2, so
	-- (target1, ..., date1) < (target2, ..., date2)
	-- <=> (target1, ..., date1, id1) < (target2, ..., date2, id2)
	--
	-- b) the id is a hash of all the columns, so:
	-- rows are the same
	-- <=> id1 == id2
	-- <=> (target1, ..., date1, id1) == (target2, ..., date2, id2)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS object_count (
	partition_key smallint, -- Constant, must always be 0
	object_type ascii,
	count counter,
	PRIMARY KEY ((partition_key), object_type)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS extid (
	extid_type ascii,
	extid blob,
	target_type ascii,
	target blob,
	PRIMARY KEY ((extid_type, extid), target_type, target)
	);""",
	"""
	CREATE TABLE IF NOT EXISTS extid_by_target (
	target_type ascii,
	target blob,
	target_token bigint, -- value of token(pk) on the "primary" table
	PRIMARY KEY ((target_type, target), target_token)
	);""",
	]

	CONTENT_INDEX_TEMPLATE = """
	-- Secondary table, used for looking up "content" from a single hash
	CREATE TABLE IF NOT EXISTS content_by_{main_algo} (
	{main_algo} blob,
	target_token bigint, -- value of token(pk) on the "primary" table
	PRIMARY KEY (({main_algo}), target_token)
	);

	CREATE TABLE IF NOT EXISTS skipped_content_by_{main_algo} (
	{main_algo} blob,
	target_token bigint, -- value of token(pk) on the "primary" table
	PRIMARY KEY (({main_algo}), target_token)
	);
	"""

	TABLES = [
	"skipped_content",
	"content",
	"revision",
	"revision_parent",
	"release",
	"directory",
	"directory_entry",
	"snapshot",
	"snapshot_branch",
	"origin_visit",
	"origin",
	"raw_extrinsic_metadata",
	"object_count",
	"origin_visit_status",
	"metadata_authority",
	"metadata_fetcher",
	"extid",
	"extid_by_target",
	]

	HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"]

	for main_algo in HASH_ALGORITHMS:
	CREATE_TABLES_QUERIES.extend(
	CONTENT_INDEX_TEMPLATE.format(
	main_algo=main_algo,
	other_algos=", ".join(
	[algo for algo in HASH_ALGORITHMS if algo != main_algo]
	),
	).split("\n\n")
	)

	TABLES.append("content_by_%s" % main_algo)
	TABLES.append("skipped_content_by_%s" % main_algo)

File Metadata

Mime Type: text/plain
Expires: Fri, Jul 4, 11:56 AM (3 w, 2 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3321887

schema.pyNo OneTemporaryActions

schema.pyView Options

File Metadata

Event Timeline

schema.py
No OneTemporary
Actions

schema.py
View Options