Differential D5030 Diff 18979 swh/storage/cassandra/schema.py

Changeset View

Standalone View

swh/storage/cassandra/schema.py

	Show First 20 Lines • Show All 182 Lines • ▼ Show 20 Lines
	CREATE TABLE IF NOT EXISTS metadata_fetcher (			CREATE TABLE IF NOT EXISTS metadata_fetcher (
	name ascii,			name ascii,
	version ascii,			version ascii,
	metadata text,			metadata text,
	PRIMARY KEY ((name), version)			PRIMARY KEY ((name), version)
	);""",			);""",
	"""			"""
	CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata (			CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata (
				id blob,

	type text,			type text,
	target text,			target text,

	-- metadata source			-- metadata source
	authority_type text,			authority_type text,
	authority_url text,			authority_url text,
	discovery_date timestamp,			discovery_date timestamp,
	fetcher_name ascii,			fetcher_name ascii,
	fetcher_version ascii,			fetcher_version ascii,

	-- metadata itself			-- metadata itself
	format ascii,			format ascii,
	metadata blob,			metadata blob,

	-- context			-- context
	origin text,			origin text,
	visit bigint,			visit bigint,
	snapshot text,			snapshot text,
	release text,			release text,
	revision text,			revision text,
	path blob,			path blob,
	directory text,			directory text,

	PRIMARY KEY ((target), authority_type, authority_url, discovery_date,			PRIMARY KEY ((target), authority_type, authority_url, discovery_date, id)
	fetcher_name, fetcher_version)
				-- An explanation is in order for this primary key:
				--
				-- Intuitively, the primary key should only be 'id', because two metadata
				-- entries are the same iff the id is the same; and 'id' is used for
				-- deduplication.
				--
				-- However, we also want to query by
				-- (target, authority_type, authority_url, discovery_date)
				-- The naive solution to this would be an extra table, to use as an index;
				-- but it means: 1. extra code to keep them in sync; 2. overhead when writing;
				-- 3. overhead + random reads (instead of linear) when reading.
				--
				-- Therefore, we use a single table for both, by adding the columns
				-- we want to query with before the id.
				-- It solves both a) the query/order issues and b) the uniqueness issue because:
				--
				-- a) adding the id at the end of the primary key does not change the rows' order:
				-- for two different rows, id1 != id2, so
				-- (target1, ..., date1) < (target2, ..., date2)
				-- => (target1, ..., date1, id1) < (target2, ..., date2, id2)
				-- (rows whose queried columns are all equal are only tie-broken by id)
				--
				-- b) the id is a hash of all the columns, so:
				-- rows are the same
				-- <=> id1 == id2
				-- <=> (target1, ..., date1, id1) == (target2, ..., date2, id2)
	);""",			);""",
	"""			"""
	CREATE TABLE IF NOT EXISTS object_count (			CREATE TABLE IF NOT EXISTS object_count (
	partition_key smallint, -- Constant, must always be 0			partition_key smallint, -- Constant, must always be 0
	object_type ascii,			object_type ascii,
	count counter,			count counter,
	PRIMARY KEY ((partition_key), object_type)			PRIMARY KEY ((partition_key), object_type)
	);""",			);""",
	▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines