Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/cassandra/schema.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
douardda: It's not clear to me: who is responsible for setting this variable? Is there a big "feature switch" somewhere else that handles this? | |||||
vlorentz: No, for now you need to manually change the code to use the schema. (Inline comment marked as done.) | |||||
import os | |||||
_use_scylla = bool(os.environ.get("SWH_USE_SCYLLADB", "")) | |||||
UDF_LANGUAGE = "lua" if _use_scylla else "java" | |||||
if UDF_LANGUAGE == "java": | |||||
# For Cassandra | |||||
CREATE_TABLES_QUERIES = [ | CREATE_TABLES_QUERIES = [ | ||||
""" | """ | ||||
CREATE OR REPLACE FUNCTION ascii_bins_count_sfunc ( | CREATE OR REPLACE FUNCTION ascii_bins_count_sfunc ( | ||||
state tuple<int, map<ascii, int>>, -- (nb_none, map<target_type, nb>) | state tuple<int, map<ascii, int>>, -- (nb_none, map<target_type, nb>) | ||||
bin_name ascii | bin_name ascii | ||||
) | ) | ||||
CALLED ON NULL INPUT | CALLED ON NULL INPUT | ||||
RETURNS tuple<int, map<ascii, int>> | RETURNS tuple<int, map<ascii, int>> | ||||
LANGUAGE java AS | LANGUAGE java AS | ||||
$$ | $$ | ||||
if (bin_name == null) { | if (bin_name == null) { | ||||
state.setInt(0, state.getInt(0) + 1); | state.setInt(0, state.getInt(0) + 1); | ||||
} | } | ||||
else { | else { | ||||
Map<String, Integer> counters = state.getMap( | Map<String, Integer> counters = state.getMap( | ||||
1, String.class, Integer.class); | 1, String.class, Integer.class); | ||||
Integer nb = counters.get(bin_name); | Integer nb = counters.get(bin_name); | ||||
if (nb == null) { | if (nb == null) { | ||||
nb = 0; | nb = 0; | ||||
} | } | ||||
counters.put(bin_name, nb + 1); | counters.put(bin_name, nb + 1); | ||||
state.setMap(1, counters, String.class, Integer.class); | state.setMap(1, counters, String.class, Integer.class); | ||||
} | } | ||||
return state; | return state; | ||||
$$ | $$;""", | ||||
;""", | |||||
""" | """ | ||||
CREATE OR REPLACE AGGREGATE ascii_bins_count ( ascii ) | CREATE OR REPLACE AGGREGATE ascii_bins_count ( ascii ) | ||||
SFUNC ascii_bins_count_sfunc | SFUNC ascii_bins_count_sfunc | ||||
STYPE tuple<int, map<ascii, int>> | STYPE tuple<int, map<ascii, int>> | ||||
INITCOND (0, {}) | INITCOND (0, {}) | ||||
;""", | ;""", | ||||
] | |||||
elif UDF_LANGUAGE == "lua": | |||||
# For ScyllaDB | |||||
# TODO: this is not implementable yet, because ScyllaDB does not support | |||||
# user-defined aggregates. https://github.com/scylladb/scylla/issues/7201 | |||||
CREATE_TABLES_QUERIES = [] | |||||
else: | |||||
assert False, f"{UDF_LANGUAGE} must be 'lua' or 'java'" | |||||
CREATE_TABLES_QUERIES = [ | |||||
*CREATE_TABLES_QUERIES, | |||||
""" | """ | ||||
CREATE TYPE IF NOT EXISTS microtimestamp ( | CREATE TYPE IF NOT EXISTS microtimestamp ( | ||||
seconds bigint, | seconds bigint, | ||||
microseconds int | microseconds int | ||||
);""", | );""", | ||||
""" | """ | ||||
CREATE TYPE IF NOT EXISTS microtimestamp_with_timezone ( | CREATE TYPE IF NOT EXISTS microtimestamp_with_timezone ( | ||||
timestamp frozen<microtimestamp>, | timestamp frozen<microtimestamp>, | ||||
▲ Show 20 Lines • Show All 110 Lines • ▼ Show 20 Lines | CREATE TABLE IF NOT EXISTS origin_visit_status ( | ||||
origin text, | origin text, | ||||
visit bigint, | visit bigint, | ||||
date timestamp, | date timestamp, | ||||
type text, | type text, | ||||
status ascii, | status ascii, | ||||
metadata text, | metadata text, | ||||
snapshot blob, | snapshot blob, | ||||
PRIMARY KEY ((origin), visit, date) | PRIMARY KEY ((origin), visit, date) | ||||
);""", | ) | ||||
WITH CLUSTERING ORDER BY (visit DESC, date DESC) | |||||
;""", # 'WITH CLUSTERING ORDER BY' is optional with Cassandra 4, but ScyllaDB needs it | |||||
""" | """ | ||||
CREATE TABLE IF NOT EXISTS origin ( | CREATE TABLE IF NOT EXISTS origin ( | ||||
sha1 blob PRIMARY KEY, | sha1 blob PRIMARY KEY, | ||||
url text, | url text, | ||||
next_visit_id int, | next_visit_id int, | ||||
-- We need integer visit ids for compatibility with the pgsql | -- We need integer visit ids for compatibility with the pgsql | ||||
-- storage, so we're using lightweight transactions with this trick: | -- storage, so we're using lightweight transactions with this trick: | ||||
-- https://stackoverflow.com/a/29391877/539465 | -- https://stackoverflow.com/a/29391877/539465 | ||||
▲ Show 20 Lines • Show All 142 Lines • Show Last 20 Lines |
It's not clear to me: who is responsible for setting this variable? Is there a big "feature switch" somewhere else that handles this?