diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -243,8 +243,6 @@ # 'content' table ########################## - _content_pk = ["sha1", "sha1_git", "sha256", "blake2s256"] - def _content_add_finalize(self, statement: BoundStatement) -> None: """Returned currified by content_add_prepare, to be called when the content row should be added to the primary table.""" @@ -265,7 +263,7 @@ token_class = self._cluster.metadata.token_map.token_class # Token of the row when it will be inserted. This is equivalent to - # "SELECT token({', '.join(self._content_pk)}) FROM content WHERE ..." + # "SELECT token({', '.join(ContentRow.PARTITION_KEY)}) FROM content WHERE ..." # after the row is inserted; but we need the token to insert in the # index tables *before* inserting to the main 'content' table token = token_class.from_key(statement.routing_key).value @@ -295,13 +293,13 @@ return None @_prepared_select_statement( - ContentRow, f"WHERE token({', '.join(_content_pk)}) = ?" + ContentRow, f"WHERE token({', '.join(ContentRow.PARTITION_KEY)}) = ?" ) def content_get_from_token(self, token, *, statement) -> Iterable[ContentRow]: return map(ContentRow.from_dict, self._execute_with_retries(statement, [token])) @_prepared_select_statement( - ContentRow, f"WHERE token({', '.join(_content_pk)}) > ? LIMIT 1" + ContentRow, f"WHERE token({', '.join(ContentRow.PARTITION_KEY)}) > ? LIMIT 1" ) def content_get_random(self, *, statement) -> Optional[ContentRow]: return self._get_random_row(ContentRow, statement) @@ -310,7 +308,7 @@ ( "SELECT token({0}) AS tok, {1} FROM content " "WHERE token({0}) >= ? AND token({0}) <= ? LIMIT ?" - ).format(", ".join(_content_pk), ", ".join(ContentRow.cols())) + ).format(", ".join(ContentRow.PARTITION_KEY), ", ".join(ContentRow.cols())) ) def content_get_token_range( self, start: int, end: int, limit: int, *, statement @@ -354,7 +352,6 @@ # 'skipped_content' table ########################## - _skipped_content_pk = ["sha1", "sha1_git", "sha256", "blake2s256"] _magic_null_pk = b"" """ NULLs (or all-empty blobs) are not allowed in primary keys; instead use a @@ -377,7 +374,7 @@ # Replace NULLs (which are not allowed in the partition key) with # an empty byte string - for key in self._skipped_content_pk: + for key in SkippedContentRow.PARTITION_KEY: if getattr(content, key) is None: setattr(content, key, self._magic_null_pk) @@ -388,7 +385,7 @@ token_class = self._cluster.metadata.token_map.token_class # Token of the row when it will be inserted. This is equivalent to - # "SELECT token({', '.join(self._content_pk)}) + # "SELECT token({', '.join(SkippedContentRow.PARTITION_KEY)}) # FROM skipped_content WHERE ..." # after the row is inserted; but we need the token to insert in the # index tables *before* inserting to the main 'skipped_content' table diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py --- a/swh/storage/cassandra/model.py +++ b/swh/storage/cassandra/model.py @@ -21,7 +21,7 @@ import dataclasses import datetime -from typing import Any, ClassVar, Dict, List, Optional, Type, TypeVar +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, TypeVar from swh.model.model import Person, TimestampWithTimezone @@ -31,6 +31,8 @@ class BaseRow: TABLE: ClassVar[str] + PARTITION_KEY: ClassVar[Tuple[str, ...]] + CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = () @classmethod def from_dict(cls: Type[T], d: Dict[str, Any]) -> T: @@ -47,6 +49,7 @@ @dataclasses.dataclass class ContentRow(BaseRow): TABLE = "content" + PARTITION_KEY = ("sha1", "sha1_git", "sha256", "blake2s256") sha1: bytes sha1_git: bytes @@ -60,6 +63,7 @@ @dataclasses.dataclass class SkippedContentRow(BaseRow): TABLE = "skipped_content" + PARTITION_KEY = ("sha1", "sha1_git", "sha256", "blake2s256") sha1: Optional[bytes] sha1_git: Optional[bytes] @@ -75,6 +79,7 @@ @dataclasses.dataclass class DirectoryRow(BaseRow): TABLE = "directory" + PARTITION_KEY = ("id",) id: bytes @@ -82,6 +87,8 @@ @dataclasses.dataclass class DirectoryEntryRow(BaseRow): TABLE = "directory_entry" + PARTITION_KEY = ("directory_id",) + CLUSTERING_KEY = ("name",) directory_id: bytes name: bytes @@ -93,6 +100,7 @@ @dataclasses.dataclass class RevisionRow(BaseRow): TABLE = "revision" + PARTITION_KEY = ("id",) id: bytes date: Optional[TimestampWithTimezone] @@ -110,6 +118,8 @@ @dataclasses.dataclass class RevisionParentRow(BaseRow): TABLE = "revision_parent" + PARTITION_KEY = ("id",) + CLUSTERING_KEY = ("parent_rank",) id: bytes parent_rank: int @@ -119,6 +129,7 @@ @dataclasses.dataclass class ReleaseRow(BaseRow): TABLE = "release" + PARTITION_KEY = ("id",) id: bytes target_type: str @@ -133,6 +144,7 @@ @dataclasses.dataclass class SnapshotRow(BaseRow): TABLE = "snapshot" + PARTITION_KEY = ("id",) id: bytes @@ -140,6 +152,8 @@ @dataclasses.dataclass class SnapshotBranchRow(BaseRow): TABLE = "snapshot_branch" + PARTITION_KEY = ("snapshot_id",) + CLUSTERING_KEY = ("name",) snapshot_id: bytes name: bytes @@ -150,6 +164,8 @@ @dataclasses.dataclass class OriginVisitRow(BaseRow): TABLE = "origin_visit" + PARTITION_KEY = ("origin",) + CLUSTERING_KEY = ("visit",) origin: str visit: int @@ -160,6 +176,8 @@ @dataclasses.dataclass class OriginVisitStatusRow(BaseRow): TABLE = "origin_visit_status" + PARTITION_KEY = ("origin",) + CLUSTERING_KEY = ("visit", "date") origin: str visit: int @@ -172,6 +190,7 @@ @dataclasses.dataclass class OriginRow(BaseRow): TABLE = "origin" + PARTITION_KEY = ("sha1",) sha1: bytes url: str @@ -181,6 +200,8 @@ @dataclasses.dataclass class MetadataAuthorityRow(BaseRow): TABLE = "metadata_authority" + PARTITION_KEY = ("url",) + CLUSTERING_KEY = ("type",) url: str type: str @@ -190,6 +211,8 @@ @dataclasses.dataclass class MetadataFetcherRow(BaseRow): TABLE = "metadata_fetcher" + PARTITION_KEY = ("name",) + CLUSTERING_KEY = ("version",) name: str version: str @@ -199,6 +222,14 @@ @dataclasses.dataclass class RawExtrinsicMetadataRow(BaseRow): TABLE = "raw_extrinsic_metadata" + PARTITION_KEY = ("id",) + CLUSTERING_KEY = ( + "authority_type", + "authority_url", + "discovery_date", + "fetcher_name", + "fetcher_version", + ) type: str id: str @@ -224,6 +255,8 @@ @dataclasses.dataclass class ObjectCountRow(BaseRow): TABLE = "object_count" + PARTITION_KEY = ("partition_key",) + CLUSTERING_KEY = ("object_type",) partition_key: int object_type: str