Page Menu | Home | Software Heritage

D7024.diff
No One | Temporary

D7024.diff

diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -656,7 +656,9 @@
self._add_one(statement, release)
@_prepared_select_statement(ReleaseRow, "WHERE id in ?")
- def release_get(self, release_ids: List[str], *, statement) -> Iterable[ReleaseRow]:
+ def release_get(
+ self, release_ids: List[Sha1Git], *, statement
+ ) -> Iterable[ReleaseRow]:
return map(
ReleaseRow.from_dict, self._execute_with_retries(statement, [release_ids])
)
@@ -683,6 +685,17 @@
def directory_get_random(self, *, statement) -> Optional[DirectoryRow]:
return self._get_random_row(DirectoryRow, statement)
+ @_prepared_select_statement(DirectoryRow, "WHERE id in ?")
+ def directory_get(
+ self, directory_ids: List[Sha1Git], *, statement
+ ) -> Iterable[DirectoryRow]:
+ """Return fields from the main directory table (e.g. raw_manifest, but not
+ entries)"""
+ return map(
+ DirectoryRow.from_dict,
+ self._execute_with_retries(statement, [directory_ids]),
+ )
+
##########################
# 'directory_entry' table
##########################
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -691,6 +691,15 @@
next_page_token = None
return PagedResult(results=entries, next_page_token=next_page_token)
+ @timed
+ def directory_get_raw_manifest(
+ self, directory_ids: List[Sha1Git]
+ ) -> Dict[Sha1Git, Optional[bytes]]:
+ return {
+ dir_.id: dir_.raw_manifest
+ for dir_ in self._cql_runner.directory_get(directory_ids)
+ }
+
@timed
def directory_get_random(self) -> Sha1Git:
directory = self._cql_runner.directory_get_random()
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -299,6 +299,12 @@
def directory_get_random(self) -> Optional[DirectoryRow]:
return self._directories.get_random()
+ def directory_get(self, directory_ids: List[Sha1Git]) -> Iterable[DirectoryRow]:
+ for id_ in directory_ids:
+ row = self._directories.get_from_primary_key((id_,))
+ if row:
+ yield row
+
##########################
# 'directory_entry' table
##########################
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -456,6 +456,19 @@
"""
...
+ @remote_api_endpoint("directory/get_raw_manifest")
+ def directory_get_raw_manifest(
+ self, directory_ids: List[Sha1Git]
+ ) -> Dict[Sha1Git, Optional[bytes]]:
+ """Returns the raw manifest of directories that do not fit the SWH data model,
+ or None if they do.
+ Directories missing from the archive are not returned at all.
+
+ Args:
+ directory_ids: List of directory ids to query
+ """
+ ...
+
@remote_api_endpoint("directory/get_random")
def directory_get_random(self) -> Sha1Git:
"""Finds a random directory id.
diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
--- a/swh/storage/postgresql/db.py
+++ b/swh/storage/postgresql/db.py
@@ -415,6 +415,19 @@
)
return list(cur)
+ def directory_get_raw_manifest(
+ self, directory_ids: List[Sha1Git], cur=None
+ ) -> Iterable[Tuple[Sha1Git, bytes]]:
+ cur = self._cursor(cur)
+ yield from execute_values_generator(
+ cur,
+ """
+ SELECT t.id, raw_manifest FROM (VALUES %s) as t(id)
+ INNER JOIN directory ON (t.id=directory.id)
+ """,
+ ((id_,) for id_ in directory_ids),
+ )
+
def directory_get_random(self, cur=None):
return self._get_random_row_from_table("directory", ["id"], "id", cur)
diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py
--- a/swh/storage/postgresql/storage.py
+++ b/swh/storage/postgresql/storage.py
@@ -521,8 +521,12 @@
dir_ for dir_ in directories if dir_.id in dirs_missing
)
- # Copy directory ids
- dirs_missing_dict = ({"id": dir} for dir in dirs_missing)
+ # Copy directory metadata
+ dirs_missing_dict = (
+ {"id": dir_.id, "raw_manifest": dir_.raw_manifest}
+ for dir_ in directories
+ if dir_.id in dirs_missing
+ )
db.mktemp("directory", cur)
db.copy_to(dirs_missing_dict, "tmp_directory", ["id", "raw_manifest"], cur)
@@ -609,6 +613,13 @@
next_page_token=None,
)
+ @timed
+ @db_transaction()
+ def directory_get_raw_manifest(
+ self, directory_ids: List[Sha1Git], *, db: Db, cur=None
+ ) -> Dict[Sha1Git, Optional[bytes]]:
+ return dict(db.directory_get_raw_manifest(directory_ids, cur=cur))
+
@timed
@process_metrics
@db_transaction()
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -733,6 +733,7 @@
init_missing = list(swh_storage.directory_missing([directory.id]))
assert [directory.id] == init_missing
+ assert swh_storage.directory_get_raw_manifest([directory.id]) == {}
actual_result = swh_storage.directory_add([directory])
assert actual_result == {"directory:add": 1}
@@ -750,7 +751,17 @@
after_missing = list(swh_storage.directory_missing([directory.id]))
assert after_missing == []
- # TODO: check the recorded manifest
+ assert swh_storage.directory_get_raw_manifest([directory.id]) == {
+ directory.id: b"foo"
+ }
+
+ directory2 = attr.evolve(directory, raw_manifest=b"bar")
+ directory2 = attr.evolve(directory2, id=directory2.compute_hash())
+ swh_storage.directory_add([directory2])
+
+ assert swh_storage.directory_get_raw_manifest(
+ [directory.id, directory2.id]
+ ) == {directory.id: b"foo", directory2.id: b"bar"}
@settings(
suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large]
@@ -763,16 +774,15 @@
swh_storage.directory_add(directories)
for directory in directories:
- if directory.raw_manifest is None:
- assert swh_storage.directory_get_entries(directory.id) == PagedResult(
- results=list(directory.entries), next_page_token=None,
- )
- else:
- # TODO: compare the manifests are the same (currently, we can't
- # because there is no way to get the raw_manifest of a directory)
- # we can't compare the other fields, because they become non-intrinsic,
- # so they may clash between hypothesis runs
- pass
+ assert directory == Directory(
+ id=directory.id,
+ entries=tuple(
+ stream_results(swh_storage.directory_get_entries, directory.id)
+ ),
+ raw_manifest=swh_storage.directory_get_raw_manifest([directory.id])[
+ directory.id
+ ],
+ )
def test_directory_add_twice(self, swh_storage, sample_data):
directory = sample_data.directories[1]

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 2:09 PM (1 w, 11 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222775

Event Timeline