Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/cassandra/schema.py
Show First 20 Lines • Show All 182 Lines • ▼ Show 20 Lines | |||||
CREATE TABLE IF NOT EXISTS metadata_fetcher ( | CREATE TABLE IF NOT EXISTS metadata_fetcher ( | ||||
name ascii, | name ascii, | ||||
version ascii, | version ascii, | ||||
metadata text, | metadata text, | ||||
PRIMARY KEY ((name), version) | PRIMARY KEY ((name), version) | ||||
);""", | );""", | ||||
""" | """ | ||||
CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata ( | CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata ( | ||||
id blob, | |||||
type text, | type text, | ||||
target text, | target text, | ||||
-- metadata source | -- metadata source | ||||
authority_type text, | authority_type text, | ||||
authority_url text, | authority_url text, | ||||
discovery_date timestamp, | discovery_date timestamp, | ||||
fetcher_name ascii, | fetcher_name ascii, | ||||
fetcher_version ascii, | fetcher_version ascii, | ||||
-- metadata itself | -- metadata itself | ||||
format ascii, | format ascii, | ||||
metadata blob, | metadata blob, | ||||
-- context | -- context | ||||
origin text, | origin text, | ||||
visit bigint, | visit bigint, | ||||
snapshot text, | snapshot text, | ||||
release text, | release text, | ||||
revision text, | revision text, | ||||
path blob, | path blob, | ||||
directory text, | directory text, | ||||
PRIMARY KEY ((target), authority_type, authority_url, discovery_date, | PRIMARY KEY ((target), authority_type, authority_url, discovery_date, id) | ||||
fetcher_name, fetcher_version) | |||||
-- An explanation is in order for this primary key: | |||||
-- | |||||
-- Intuitively, the primary key should only be 'id', because two metadata | |||||
-- entries are the same iff the id is the same; and 'id' is used for | |||||
-- deduplication. | |||||
-- | |||||
-- However, we also want to query by | |||||
-- (target, authority_type, authority_url, discovery_date) | |||||
-- The naive solution to this would be an extra table, to use as index; | |||||
-- but it means 1. extra code to keep them in sync 2. overhead when writing | |||||
-- 3. overhead + random reads (instead of linear) when reading. | |||||
-- | |||||
-- Therefore, we use a single table for both, by putting the columns | |||||
-- we want to query by before the id in the primary key. | |||||
-- It solves both a) the query/order issues and b) the uniqueness issue because: | |||||
-- | |||||
-- a) adding the id at the end of the primary key does not change the rows' order: | |||||
-- for two different rows, id1 != id2, so | |||||
-- (target1, ..., date1) < (target2, ..., date2) | |||||
-- => (target1, ..., date1, id1) < (target2, ..., date2, id2) | |||||
-- (rows that tie on all the query columns are simply ordered by id) | |||||
-- | |||||
-- b) the id is a hash of all the columns, so: | |||||
-- rows are the same | |||||
-- <=> id1 == id2 | |||||
-- <=> (target1, ..., date1, id1) == (target2, ..., date2, id2) | |||||
);""", | );""", | ||||
""" | """ | ||||
CREATE TABLE IF NOT EXISTS object_count ( | CREATE TABLE IF NOT EXISTS object_count ( | ||||
partition_key smallint, -- Constant, must always be 0 | partition_key smallint, -- Constant, must always be 0 | ||||
object_type ascii, | object_type ascii, | ||||
count counter, | count counter, | ||||
PRIMARY KEY ((partition_key), object_type) | PRIMARY KEY ((partition_key), object_type) | ||||
);""", | );""", | ||||
▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines |