diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -35,6 +35,7 @@ wait_random_exponential, ) +from swh.core.utils import grouper from swh.model.identifiers import CoreSWHID from swh.model.model import ( Content, @@ -74,6 +75,17 @@ ) from .schema import CREATE_TABLES_QUERIES, HASH_ALGORITHMS +PARTITION_KEY_RESTRICTION_MAX_SIZE = 100 +"""Maximum number of restrictions in a single query. +Usually this is a very low number (eg. SELECT ... FROM ... WHERE x=?), +but some queries can request arbitrarily many (eg. SELECT ... FROM ... WHERE x IN ?). + +This can cause performance issues, as the node getting the query needs to +coordinate with other nodes to get the complete results. +See for details and rationale. +""" + + logger = logging.getLogger(__name__) @@ -302,8 +314,10 @@ return None def _missing(self, statement, ids): - rows = self._execute_with_retries(statement, [ids]) - found_ids = {row["id"] for row in rows} + found_ids = set() + for id_group in grouper(ids, PARTITION_KEY_RESTRICTION_MAX_SIZE): + rows = self._execute_with_retries(statement, [list(id_group)]) + found_ids.update(row["id"] for row in rows) return [id_ for id_ in ids if id_ not in found_ids] ########################## diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -1120,6 +1120,22 @@ revision3.id, } + def test_revision_missing_many(self, swh_storage, sample_data): + """Large number of revision ids to check can cause ScyllaDB to reject + queries.""" + revision = sample_data.revision + ids = [bytes([b1, b2]) * 10 for b1 in range(256) for b2 in range(10)] + ids.append(revision.id) + ids.sort() + init_missing = swh_storage.revision_missing(ids) + assert set(init_missing) == set(ids) + + actual_result = swh_storage.revision_add([revision]) + assert actual_result == {"revision:add": 1} + + 
end_missing = swh_storage.revision_missing(ids) + assert set(end_missing) == set(ids) - {revision.id} + def test_extid_add_git(self, swh_storage, sample_data): gitids = [