diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -656,7 +656,9 @@
         self._add_one(statement, release)
 
     @_prepared_select_statement(ReleaseRow, "WHERE id in ?")
-    def release_get(self, release_ids: List[str], *, statement) -> Iterable[ReleaseRow]:
+    def release_get(
+        self, release_ids: List[Sha1Git], *, statement
+    ) -> Iterable[ReleaseRow]:
         return map(
             ReleaseRow.from_dict, self._execute_with_retries(statement, [release_ids])
         )
@@ -683,6 +685,17 @@
     def directory_get_random(self, *, statement) -> Optional[DirectoryRow]:
         return self._get_random_row(DirectoryRow, statement)
 
+    @_prepared_select_statement(DirectoryRow, "WHERE id in ?")
+    def directory_get(
+        self, directory_ids: List[Sha1Git], *, statement
+    ) -> Iterable[DirectoryRow]:
+        """Return fields from the main directory table (e.g. raw_manifest, but not
+        entries)"""
+        return map(
+            DirectoryRow.from_dict,
+            self._execute_with_retries(statement, [directory_ids]),
+        )
+
     ##########################
     # 'directory_entry' table
     ##########################
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -691,6 +691,15 @@
             next_page_token = None
         return PagedResult(results=entries, next_page_token=next_page_token)
 
+    @timed
+    def directory_get_raw_manifest(
+        self, directory_ids: List[Sha1Git]
+    ) -> Dict[Sha1Git, Optional[bytes]]:
+        return {
+            dir_.id: dir_.raw_manifest
+            for dir_ in self._cql_runner.directory_get(directory_ids)
+        }
+
     @timed
     def directory_get_random(self) -> Sha1Git:
         directory = self._cql_runner.directory_get_random()
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -299,6 +299,14 @@
     def directory_get_random(self) -> Optional[DirectoryRow]:
         return self._directories.get_random()
 
+    def directory_get(self, directory_ids: List[Sha1Git]) -> Iterable[DirectoryRow]:
+        """Yield the row stored for each of ``directory_ids``, skipping ids
+        whose primary-key lookup returns a falsy value."""
+        for id_ in directory_ids:
+            row = self._directories.get_from_primary_key((id_,))
+            if row:
+                yield row
+
     ##########################
     # 'directory_entry' table
     ##########################
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -456,6 +456,19 @@
         """
         ...
 
+    @remote_api_endpoint("directory/get_raw_manifest")
+    def directory_get_raw_manifest(
+        self, directory_ids: List[Sha1Git]
+    ) -> Dict[Sha1Git, Optional[bytes]]:
+        """Returns the raw manifest of directories that do not fit the SWH data model,
+        or None if they do.
+        Directories missing from the archive are not returned at all.
+
+        Args:
+            directory_ids: List of directory ids to query
+        """
+        ...
+
     @remote_api_endpoint("directory/get_random")
     def directory_get_random(self) -> Sha1Git:
         """Finds a random directory id.
diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
--- a/swh/storage/postgresql/db.py
+++ b/swh/storage/postgresql/db.py
@@ -415,6 +415,26 @@
         )
         return list(cur)
 
+    def directory_get_raw_manifest(
+        self, directory_ids: List[Sha1Git], cur=None
+    ) -> Iterable[Tuple[Sha1Git, bytes]]:
+        """Yield an ``(id, raw_manifest)`` pair for each id in ``directory_ids``
+        that has a row in the ``directory`` table; the INNER JOIN drops the rest.
+
+        NOTE(review): the storage method wrapping this is typed as returning
+        ``Optional[bytes]`` values, so raw_manifest is presumably nullable and
+        the ``bytes`` annotation above narrower than reality -- to be confirmed.
+        """
+        cur = self._cursor(cur)
+        yield from execute_values_generator(
+            cur,
+            """
+            SELECT t.id, raw_manifest FROM (VALUES %s) as t(id)
+            INNER JOIN directory ON (t.id=directory.id)
+            """,
+            ((id_,) for id_ in directory_ids),
+        )
+
     def directory_get_random(self, cur=None):
         return self._get_random_row_from_table("directory", ["id"], "id", cur)
 
diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py
--- a/swh/storage/postgresql/storage.py
+++ b/swh/storage/postgresql/storage.py
@@ -521,8 +521,16 @@
             dir_ for dir_ in directories if dir_.id in dirs_missing
         )
 
-        # Copy directory ids
-        dirs_missing_dict = ({"id": dir} for dir in dirs_missing)
+        # Copy directory metadata, keyed by id: ``directories`` may list the
+        # same directory twice, so that tmp_directory gets one row per id
+        dirs_missing_dict = (
+            {"id": id_, "raw_manifest": raw_manifest}
+            for id_, raw_manifest in {
+                dir_.id: dir_.raw_manifest
+                for dir_ in directories
+                if dir_.id in dirs_missing
+            }.items()
+        )
         db.mktemp("directory", cur)
         db.copy_to(dirs_missing_dict, "tmp_directory", ["id", "raw_manifest"], cur)
 
@@ -609,6 +617,13 @@
             next_page_token=None,
         )
 
+    @timed
+    @db_transaction()
+    def directory_get_raw_manifest(
+        self, directory_ids: List[Sha1Git], *, db: Db, cur=None
+    ) -> Dict[Sha1Git, Optional[bytes]]:
+        return dict(db.directory_get_raw_manifest(directory_ids, cur=cur))
+
     @timed
     @process_metrics
     @db_transaction()
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -733,6 +733,7 @@
         init_missing = list(swh_storage.directory_missing([directory.id]))
         assert [directory.id] == init_missing
 
+        assert swh_storage.directory_get_raw_manifest([directory.id]) == {}
         actual_result = swh_storage.directory_add([directory])
         assert actual_result == {"directory:add": 1}
 
@@ -750,7 +751,17 @@
         after_missing = list(swh_storage.directory_missing([directory.id]))
         assert after_missing == []
 
-        # TODO: check the recorded manifest
+        assert swh_storage.directory_get_raw_manifest([directory.id]) == {
+            directory.id: b"foo"
+        }
+
+        directory2 = attr.evolve(directory, raw_manifest=b"bar")
+        directory2 = attr.evolve(directory2, id=directory2.compute_hash())
+        swh_storage.directory_add([directory2])
+
+        assert swh_storage.directory_get_raw_manifest(
+            [directory.id, directory2.id]
+        ) == {directory.id: b"foo", directory2.id: b"bar"}
 
     @settings(
         suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large]
@@ -763,16 +774,15 @@
         swh_storage.directory_add(directories)
 
         for directory in directories:
-            if directory.raw_manifest is None:
-                assert swh_storage.directory_get_entries(directory.id) == PagedResult(
-                    results=list(directory.entries), next_page_token=None,
-                )
-            else:
-                # TODO: compare the manifests are the same (currently, we can't
-                # because there is no way to get the raw_manifest of a directory)
-                # we can't compare the other fields, because they become non-intrinsic,
-                # so they may clash between hypothesis runs
-                pass
+            assert directory == Directory(
+                id=directory.id,
+                entries=tuple(
+                    stream_results(swh_storage.directory_get_entries, directory.id)
+                ),
+                raw_manifest=swh_storage.directory_get_raw_manifest([directory.id])[
+                    directory.id
+                ],
+            )
 
     def test_directory_add_twice(self, swh_storage, sample_data):
         directory = sample_data.directories[1]