Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163731
D7024.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
D7024.diff
View Options
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -656,7 +656,9 @@
self._add_one(statement, release)
@_prepared_select_statement(ReleaseRow, "WHERE id in ?")
- def release_get(self, release_ids: List[str], *, statement) -> Iterable[ReleaseRow]:
+ def release_get(
+ self, release_ids: List[Sha1Git], *, statement
+ ) -> Iterable[ReleaseRow]:
return map(
ReleaseRow.from_dict, self._execute_with_retries(statement, [release_ids])
)
@@ -683,6 +685,17 @@
def directory_get_random(self, *, statement) -> Optional[DirectoryRow]:
return self._get_random_row(DirectoryRow, statement)
+ @_prepared_select_statement(DirectoryRow, "WHERE id in ?")
+ def directory_get(
+ self, directory_ids: List[Sha1Git], *, statement
+ ) -> Iterable[DirectoryRow]:
+ """Return fields from the main directory table (e.g. raw_manifest, but not
+ entries)"""
+ return map(
+ DirectoryRow.from_dict,
+ self._execute_with_retries(statement, [directory_ids]),
+ )
+
##########################
# 'directory_entry' table
##########################
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -691,6 +691,15 @@
next_page_token = None
return PagedResult(results=entries, next_page_token=next_page_token)
+ @timed
+ def directory_get_raw_manifest(
+ self, directory_ids: List[Sha1Git]
+ ) -> Dict[Sha1Git, Optional[bytes]]:
+ return {
+ dir_.id: dir_.raw_manifest
+ for dir_ in self._cql_runner.directory_get(directory_ids)
+ }
+
@timed
def directory_get_random(self) -> Sha1Git:
directory = self._cql_runner.directory_get_random()
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -299,6 +299,12 @@
def directory_get_random(self) -> Optional[DirectoryRow]:
return self._directories.get_random()
+ def directory_get(self, directory_ids: List[Sha1Git]) -> Iterable[DirectoryRow]:
+ for id_ in directory_ids:
+ row = self._directories.get_from_primary_key((id_,))
+ if row:
+ yield row
+
##########################
# 'directory_entry' table
##########################
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -456,6 +456,19 @@
"""
...
+ @remote_api_endpoint("directory/get_raw_manifest")
+ def directory_get_raw_manifest(
+ self, directory_ids: List[Sha1Git]
+ ) -> Dict[Sha1Git, Optional[bytes]]:
+ """Maps each directory id to its raw manifest if the directory does not fit
+ the SWH data model, or to None if it does.
+ Directories missing from the archive are omitted from the returned dict.
+
+ Args:
+ directory_ids: List of directory ids to query
+ """
+ ...
+
@remote_api_endpoint("directory/get_random")
def directory_get_random(self) -> Sha1Git:
"""Finds a random directory id.
diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
--- a/swh/storage/postgresql/db.py
+++ b/swh/storage/postgresql/db.py
@@ -415,6 +415,19 @@
)
return list(cur)
+ def directory_get_raw_manifest(
+ self, directory_ids: List[Sha1Git], cur=None
+ ) -> Iterable[Tuple[Sha1Git, bytes]]:
+ cur = self._cursor(cur)
+ yield from execute_values_generator(
+ cur,
+ """
+ SELECT t.id, raw_manifest FROM (VALUES %s) as t(id)
+ INNER JOIN directory ON (t.id=directory.id)
+ """,
+ ((id_,) for id_ in directory_ids),
+ )
+
def directory_get_random(self, cur=None):
return self._get_random_row_from_table("directory", ["id"], "id", cur)
diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py
--- a/swh/storage/postgresql/storage.py
+++ b/swh/storage/postgresql/storage.py
@@ -521,8 +521,12 @@
dir_ for dir_ in directories if dir_.id in dirs_missing
)
- # Copy directory ids
- dirs_missing_dict = ({"id": dir} for dir in dirs_missing)
+ # Copy directory metadata
+ dirs_missing_dict = (
+ {"id": dir_.id, "raw_manifest": dir_.raw_manifest}
+ for dir_ in directories
+ if dir_.id in dirs_missing
+ )
db.mktemp("directory", cur)
db.copy_to(dirs_missing_dict, "tmp_directory", ["id", "raw_manifest"], cur)
@@ -609,6 +613,13 @@
next_page_token=None,
)
+ @timed
+ @db_transaction()
+ def directory_get_raw_manifest(
+ self, directory_ids: List[Sha1Git], *, db: Db, cur=None
+ ) -> Dict[Sha1Git, Optional[bytes]]:
+ return dict(db.directory_get_raw_manifest(directory_ids, cur=cur))
+
@timed
@process_metrics
@db_transaction()
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -733,6 +733,7 @@
init_missing = list(swh_storage.directory_missing([directory.id]))
assert [directory.id] == init_missing
+ assert swh_storage.directory_get_raw_manifest([directory.id]) == {}
actual_result = swh_storage.directory_add([directory])
assert actual_result == {"directory:add": 1}
@@ -750,7 +751,17 @@
after_missing = list(swh_storage.directory_missing([directory.id]))
assert after_missing == []
- # TODO: check the recorded manifest
+ assert swh_storage.directory_get_raw_manifest([directory.id]) == {
+ directory.id: b"foo"
+ }
+
+ directory2 = attr.evolve(directory, raw_manifest=b"bar")
+ directory2 = attr.evolve(directory2, id=directory2.compute_hash())
+ swh_storage.directory_add([directory2])
+
+ assert swh_storage.directory_get_raw_manifest(
+ [directory.id, directory2.id]
+ ) == {directory.id: b"foo", directory2.id: b"bar"}
@settings(
suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large]
@@ -763,16 +774,15 @@
swh_storage.directory_add(directories)
for directory in directories:
- if directory.raw_manifest is None:
- assert swh_storage.directory_get_entries(directory.id) == PagedResult(
- results=list(directory.entries), next_page_token=None,
- )
- else:
- # TODO: compare the manifests are the same (currently, we can't
- # because there is no way to get the raw_manifest of a directory)
- # we can't compare the other fields, because they become non-intrinsic,
- # so they may clash between hypothesis runs
- pass
+ assert directory == Directory(
+ id=directory.id,
+ entries=tuple(
+ stream_results(swh_storage.directory_get_entries, directory.id)
+ ),
+ raw_manifest=swh_storage.directory_get_raw_manifest([directory.id])[
+ directory.id
+ ],
+ )
def test_directory_add_twice(self, swh_storage, sample_data):
directory = sample_data.directories[1]
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 2:09 PM (1 w, 11 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222775
Attached To
D7024: Fix directory_add to actually insert the manifest + add directory_get_raw_manifest
Event Timeline
Log In to Comment