diff --git a/sql/upgrades/162.sql b/sql/upgrades/162.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/162.sql
@@ -0,0 +1,54 @@
+-- SWH DB schema upgrade
+-- from_version: 161
+-- to_version: 162
+-- description: Make swh_directory_walk_one join skipped_content in addition to content
+
+-- latest schema version
+insert into dbversion(version, release, description)
+      values(162, now(), 'Work Still In Progress');
+
+create or replace function swh_directory_walk_one(walked_dir_id sha1_git)
+    returns setof directory_entry
+    language sql
+    stable
+as $$
+    with dir as (
+        select id as dir_id, dir_entries, file_entries, rev_entries
+        from directory
+        where id = walked_dir_id),
+    ls_d as (select dir_id, unnest(dir_entries) as entry_id from dir),
+    ls_f as (select dir_id, unnest(file_entries) as entry_id from dir),
+    ls_r as (select dir_id, unnest(rev_entries) as entry_id from dir)
+    (select dir_id, 'dir'::directory_entry_type as type,
+            e.target, e.name, e.perms, NULL::content_status,
+            NULL::sha1, NULL::sha1_git, NULL::sha256, NULL::bigint
+     from ls_d
+     left join directory_entry_dir e on ls_d.entry_id = e.id)
+    union
+    -- File entries: first the ones whose target is in "content", then the
+    -- remaining ones joined with "skipped_content" (NULL columns if in neither).
+    (with known_contents as
+        (select dir_id, 'file'::directory_entry_type as type,
+            e.target, e.name, e.perms, c.status,
+            c.sha1, c.sha1_git, c.sha256, c.length
+         from ls_f
+         left join directory_entry_file e on ls_f.entry_id = e.id
+         inner join content c on e.target = c.sha1_git)
+        select * from known_contents
+        union
+        (select dir_id, 'file'::directory_entry_type as type,
+            e.target, e.name, e.perms, c.status,
+            c.sha1, c.sha1_git, c.sha256, c.length
+         from ls_f
+         left join directory_entry_file e on ls_f.entry_id = e.id
+         left join skipped_content c on e.target = c.sha1_git
+         where not exists (select 1 from known_contents where known_contents.sha1_git=e.target)))
+    union
+    (select dir_id, 'rev'::directory_entry_type as type,
+            e.target, e.name, e.perms, NULL::content_status,
+            NULL::sha1, NULL::sha1_git, NULL::sha256, NULL::bigint
+     from ls_r
+     left join directory_entry_rev e on ls_r.entry_id = e.id)
+    order by name;
+$$;
+
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -51,6 +51,7 @@
     ContentRow,
     DirectoryEntryRow,
     DirectoryRow,
+    MAGIC_NULL_PK,
     MetadataAuthorityRow,
     MetadataFetcherRow,
     ObjectCountRow,
@@ -385,12 +386,6 @@
     # 'skipped_content' table
     ##########################
 
-    _magic_null_pk = b""
-    """
-    NULLs (or all-empty blobs) are not allowed in primary keys; instead use a
-    special value that can't possibly be a valid hash.
-    """
-
     def _skipped_content_add_finalize(self, statement: BoundStatement) -> None:
         """Returned currified by skipped_content_add_prepare, to be called when the
         content row should be added to the primary table."""
@@ -409,7 +404,7 @@
         # an empty byte string
         for key in SkippedContentRow.PARTITION_KEY:
             if getattr(content, key) is None:
-                setattr(content, key, self._magic_null_pk)
+                setattr(content, key, MAGIC_NULL_PK)
 
         statement = statement.bind(dataclasses.astuple(content))
 
@@ -441,19 +436,28 @@
         rows = list(
             self._execute_with_retries(
                 statement,
-                [
-                    content_hashes[algo] or self._magic_null_pk
-                    for algo in HASH_ALGORITHMS
-                ],
+                [content_hashes[algo] or MAGIC_NULL_PK for algo in HASH_ALGORITHMS],
             )
         )
         assert len(rows) <= 1
         if rows:
-            # TODO: convert _magic_null_pk back to None?
             return SkippedContentRow.from_dict(rows[0])
         else:
             return None
 
+    @_prepared_select_statement(
+        SkippedContentRow,
+        f"WHERE token({', '.join(SkippedContentRow.PARTITION_KEY)}) = ?",
+    )
+    def skipped_content_get_from_token(
+        self, token, *, statement
+    ) -> Iterable[SkippedContentRow]:
+        """Returns the rows of the 'skipped_content' table whose partition key
+        hashes to the given token."""
+        return map(
+            SkippedContentRow.from_dict, self._execute_with_retries(statement, [token])
+        )
+
     ##########################
     # 'skipped_content_by_*' tables
     ##########################
@@ -468,7 +472,18 @@
             f"VALUES (%s, %s)"
         )
         self._execute_with_retries(
-            query, [content.get_hash(algo) or self._magic_null_pk, token]
+            query, [content.get_hash(algo) or MAGIC_NULL_PK, token]
+        )
+
+    def skipped_content_get_tokens_from_single_hash(
+        self, algo: str, hash_: bytes
+    ) -> Iterable[int]:
+        """Returns the tokens recorded in the 'skipped_content_by_{algo}' table
+        for the given hash."""
+        assert algo in HASH_ALGORITHMS
+        query = f"SELECT target_token FROM skipped_content_by_{algo} WHERE {algo} = %s"
+        return (
+            row["target_token"] for row in self._execute_with_retries(query, [hash_])
         )
 
     ##########################
diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py
--- a/swh/storage/cassandra/model.py
+++ b/swh/storage/cassandra/model.py
@@ -26,6 +26,13 @@
 from swh.model.model import Person, TimestampWithTimezone
 
 
+MAGIC_NULL_PK = b""
+"""
+NULLs (or all-empty blobs) are not allowed in primary keys; instead we use a
+special value that can't possibly be a valid hash.
+"""
+
+
 T = TypeVar("T", bound="BaseRow")
 
 
@@ -75,6 +82,18 @@
     reason: str
     origin: str
 
+    @classmethod
+    def from_dict(cls, d: Dict[str, Any]) -> "SkippedContentRow":
+        """Builds a row from a dict, turning :data:`MAGIC_NULL_PK` placeholders
+        found in partition key columns back into ``None``."""
+        d = d.copy()
+        # Mirror of the write path, which stores MAGIC_NULL_PK in place of None
+        # for every column of the partition key.
+        for k in cls.PARTITION_KEY:
+            if d[k] == MAGIC_NULL_PK:
+                d[k] = None
+        return super().from_dict(d)
+
 
 @dataclasses.dataclass
 class DirectoryRow(BaseRow):
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -362,6 +362,7 @@
         return self._cql_runner.directory_missing(directories)
 
     def _join_dentry_to_content(self, dentry: DirectoryEntry) -> Dict[str, Any]:
+        contents: Union[List[Content], List[SkippedContentRow]]
         keys = (
             "status",
             "sha1",
@@ -373,6 +374,19 @@
         ret.update(dentry.to_dict())
         if ret["type"] == "file":
             contents = self.content_find({"sha1_git": ret["target"]})
+            if not contents:
+                # Not a known content: look it up among skipped contents.
+                # Distinct partition keys can share a token, so rows fetched
+                # by token are re-checked against the requested sha1_git.
+                tokens = self._cql_runner.skipped_content_get_tokens_from_single_hash(
+                    "sha1_git", ret["target"]
+                )
+                contents = [
+                    row
+                    for token in tokens
+                    for row in self._cql_runner.skipped_content_get_from_token(token)
+                    if row.sha1_git == ret["target"]
+                ]
             if contents:
                 content = contents[0]
                 for key in keys:
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -256,6 +256,9 @@
         primary_key = self._skipped_contents.primary_key_from_dict(content_hashes)
         return self._skipped_contents.get_from_primary_key(primary_key)
 
+    def skipped_content_get_from_token(self, token: int) -> Iterable[SkippedContentRow]:
+        return self._skipped_contents.get_from_token(token)
+
     ##########################
     # 'skipped_content_by_*' tables
     ##########################
@@ -265,6 +268,11 @@
     ) -> None:
         self._skipped_content_indexes[algo][content.get_hash(algo)].add(token)
 
+    def skipped_content_get_tokens_from_single_hash(
+        self, algo: str, hash_: bytes
+    ) -> Iterable[int]:
+        return self._skipped_content_indexes[algo][hash_]
+
     ##########################
     # 'directory' table
     ##########################
diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
--- a/swh/storage/postgresql/db.py
+++ b/swh/storage/postgresql/db.py
@@ -24,7 +24,7 @@
 
     """
 
-    current_version = 161
+    current_version = 162
 
     def mktemp_dir_entry(self, entry_type, cur=None):
         self._cursor(cur).execute(
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
 
 -- latest schema version
 insert into dbversion(version, release, description)
-      values(161, now(), 'Work In Progress');
+      values(162, now(), 'Work In Progress');
 
 -- a SHA1 checksum
 create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -314,12 +314,24 @@
      from ls_d
      left join directory_entry_dir e on ls_d.entry_id = e.id)
     union
-    (select dir_id, 'file'::directory_entry_type as type,
+    -- File entries: first the ones whose target is in "content", then the
+    -- remaining ones joined with "skipped_content" (NULL columns if in neither).
+    (with known_contents as
+        (select dir_id, 'file'::directory_entry_type as type,
             e.target, e.name, e.perms, c.status,
             c.sha1, c.sha1_git, c.sha256, c.length
-     from ls_f
-     left join directory_entry_file e on ls_f.entry_id = e.id
-     left join content c on e.target = c.sha1_git)
+         from ls_f
+         left join directory_entry_file e on ls_f.entry_id = e.id
+         inner join content c on e.target = c.sha1_git)
+        select * from known_contents
+        union
+        (select dir_id, 'file'::directory_entry_type as type,
+            e.target, e.name, e.perms, c.status,
+            c.sha1, c.sha1_git, c.sha256, c.length
+         from ls_f
+         left join directory_entry_file e on ls_f.entry_id = e.id
+         left join skipped_content c on e.target = c.sha1_git
+         where not exists (select 1 from known_contents where known_contents.sha1_git=e.target)))
     union
     (select dir_id, 'rev'::directory_entry_type as type,
             e.target, e.name, e.perms, NULL::content_status,
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -30,6 +30,7 @@
     OriginVisitStatus,
     Person,
     Revision,
+    SkippedContent,
     Snapshot,
     TargetType,
 )
@@ -766,6 +767,37 @@
             },
         ]
 
+    def test_directory_ls_skipped_content(self, swh_storage, sample_data):
+        """directory_ls fills a file entry from the matching skipped content,
+        with hashes that were not provided reported as None."""
+        swh_storage.directory_add([sample_data.directory2])
+
+        cont = SkippedContent(
+            sha1_git=sample_data.directory2.entries[0].target,
+            sha1=b"c" * 20,
+            sha256=None,
+            blake2s256=None,
+            length=42,
+            status="absent",
+            reason="You need a premium subscription to access this content",
+        )
+        swh_storage.skipped_content_add([cont])
+
+        assert list(swh_storage.directory_ls(sample_data.directory2.id)) == [
+            {
+                "dir_id": sample_data.directory2.id,
+                "length": 42,
+                "name": b"oof",
+                "perms": 33188,
+                "sha1": b"c" * 20,
+                "sha1_git": sample_data.directory2.entries[0].target,
+                "sha256": None,
+                "status": "absent",
+                "target": sample_data.directory2.entries[0].target,
+                "type": "file",
+            },
+        ]
+
     def test_directory_entry_get_by_path(self, swh_storage, sample_data):
         cont, content2 = sample_data.contents[:2]
         dir1, dir2, dir3, dir4, dir5 = sample_data.directories[:5]