diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py
--- a/swh/vault/cookers/git_bare.py
+++ b/swh/vault/cookers/git_bare.py
@@ -535,7 +535,7 @@
 
     def write_revision_node(self, revision: Revision) -> bool:
         """Writes a revision object to disk"""
-        git_object = git_objects.revision_git_object(revision)
+        git_object = revision.raw_manifest or git_objects.revision_git_object(revision)
         return self.write_object(revision.id, git_object)
 
     def load_releases(self, obj_ids: List[Sha1Git]) -> List[Release]:
@@ -577,18 +577,23 @@
 
     def write_release_node(self, release: Release) -> bool:
         """Writes a release object to disk"""
-        git_object = git_objects.release_git_object(release)
+        git_object = release.raw_manifest or git_objects.release_git_object(release)
         return self.write_object(release.id, git_object)
 
     def load_directories(self, obj_ids: List[Sha1Git]) -> None:
         if not obj_ids:
             return
 
+        raw_manifests = self.storage.directory_get_raw_manifest(obj_ids)
+
         with multiprocessing.dummy.Pool(min(self.thread_pool_size, len(obj_ids))) as p:
-            for _ in p.imap_unordered(self.load_directory, obj_ids):
+            for _ in p.imap_unordered(
+                lambda obj_id: self.load_directory(obj_id, raw_manifests.get(obj_id)),
+                obj_ids,
+            ):
                 pass
 
-    def load_directory(self, obj_id: Sha1Git) -> None:
+    def load_directory(self, obj_id: Sha1Git, raw_manifest: Optional[bytes]) -> None:
         # Load the directory
         entries_it: Optional[Iterable[DirectoryEntry]] = stream_results_optional(
             self.storage.directory_get_entries, obj_id
@@ -598,8 +603,10 @@
             logger.error("Missing swh:1:dir:%s, ignoring.", hash_to_hex(obj_id))
             return
 
-        directory = Directory(id=obj_id, entries=tuple(entries_it))
-        git_object = git_objects.directory_git_object(directory)
+        directory = Directory(
+            id=obj_id, entries=tuple(entries_it), raw_manifest=raw_manifest
+        )
+        git_object = raw_manifest or git_objects.directory_git_object(directory)
         self.write_object(obj_id, git_object)
 
         # Add children to the stack
diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py
--- a/swh/vault/tests/test_cookers.py
+++ b/swh/vault/tests/test_cookers.py
@@ -27,13 +27,18 @@
 from swh.loader.git.from_disk import GitLoaderFromDisk
 from swh.model import from_disk, hashutil
 from swh.model.model import (
-    Directory,
-    DirectoryEntry,
     Person,
+    Release,
     Revision,
     RevisionType,
+    Snapshot,
+    SnapshotBranch,
+    TargetType,
+    Timestamp,
     TimestampWithTimezone,
 )
+from swh.model.model import Content, Directory, DirectoryEntry
+from swh.model.model import ObjectType as ModelObjectType
 from swh.model.swhids import CoreSWHID, ObjectType
 from swh.vault.cookers import DirectoryCooker, GitBareCooker, RevisionGitfastCooker
 from swh.vault.tests.vault_testing import hash_content
@@ -1076,3 +1081,109 @@
         swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)
         with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
             self.check_snapshot_tags(ert, p, main_rev_id)
+
+    def test_original_malformed_objects(self, swh_storage, cook_extract_snapshot):
+        """Tests that objects that were originally malformed:
+
+        * are still interpreted somewhat correctly (if the loader could make sense of
+          them), especially that they still have links to children
+        * have their original manifest in the bundle
+        """
+        date = TimestampWithTimezone.from_numeric_offset(
+            Timestamp(1643819927, 0), 0, False
+        )
+
+        content = Content.from_data(b"foo")
+        swh_storage.content_add([content])
+
+        # disordered
+        # fmt: off
+        malformed_dir_manifest = (
+            b""
+            + b"100644 file2\x00" + content.sha1_git
+            + b"100644 file1\x00" + content.sha1_git
+        )
+        # fmt: on
+        directory = Directory(
+            entries=(
+                DirectoryEntry(
+                    name=b"file1", type="file", perms=0o100644, target=content.sha1_git
+                ),
+                DirectoryEntry(
+                    name=b"file2", type="file", perms=0o100644, target=content.sha1_git
+                ),
+            ),
+            raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode()
+            + malformed_dir_manifest,
+        )
+        swh_storage.directory_add([directory])
+
+        # 'committer' and 'author' swapped
+        # fmt: off
+        malformed_rev_manifest = (
+            b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n"
+            + b"committer me 1643819927 +0000\n"
+            + b"author me 1643819927 +0000\n"
+            + b"\n"
+            + b"rev"
+        )
+        # fmt: on
+        revision = Revision(
+            message=b"rev",
+            author=Person.from_fullname(b"me "),
+            date=date,
+            committer=Person.from_fullname(b"me "),
+            committer_date=date,
+            parents=(),
+            type=RevisionType.GIT,
+            directory=directory.id,
+            synthetic=True,
+            raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode()
+            + malformed_rev_manifest,
+        )
+        swh_storage.revision_add([revision])
+
+        # 'tag' and 'tagger' swapped
+        # fmt: off
+        malformed_rel_manifest = (
+            b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n"
+            + b"type commit\n"
+            + b"tagger me 1643819927 +0000\n"
+            + b"tag v1.1.0\n"
+        )
+        # fmt: on
+
+        release = Release(
+            name=b"v1.1.0",
+            message=None,
+            author=Person.from_fullname(b"me "),
+            date=date,
+            target=revision.id,
+            target_type=ModelObjectType.REVISION,
+            synthetic=True,
+            raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode()
+            + malformed_rel_manifest,
+        )
+        swh_storage.release_add([release])
+
+        snapshot = Snapshot(
+            branches={
+                b"refs/tags/v1.1.0": SnapshotBranch(
+                    target=release.id, target_type=TargetType.RELEASE
+                ),
+                b"HEAD": SnapshotBranch(
+                    target=revision.id, target_type=TargetType.REVISION
+                ),
+            }
+        )
+        swh_storage.snapshot_add([snapshot])
+
+        with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p):
+            tag = ert.repo[b"refs/tags/v1.1.0"]
+            assert tag.as_raw_string() == malformed_rel_manifest
+
+            commit = ert.repo[tag.object[1]]
+            assert commit.as_raw_string() == malformed_rev_manifest
+
+            tree = ert.repo[commit.tree]
+            assert tree.as_raw_string() == malformed_dir_manifest