diff --git a/swh/model/model.py b/swh/model/model.py --- a/swh/model/model.py +++ b/swh/model/model.py @@ -211,7 +211,6 @@ id: Sha1Git - @abstractmethod def compute_hash(self) -> bytes: """Derived model classes must implement this to compute the object hash. @@ -219,7 +218,11 @@ This method is called by the object initialization if the `id` attribute is set to an empty value. """ - pass + return self._compute_hash_from_attributes() + + @abstractmethod + def _compute_hash_from_attributes(self) -> Sha1Git: + raise NotImplementedError(f"_compute_hash_from_attributes for {self}") def __attrs_post_init__(self): if not self.id: @@ -236,6 +239,41 @@ raise ValueError("'id' does not match recomputed hash.") +class HashableObjectWithManifest(HashableObject): + """Derived class of HashableObject, for objects that may need to store + verbatim git objects as ``raw_manifest`` to preserve original hashes.""" + + raw_manifest: Optional[bytes] = None + """Stores the original content of git objects when they cannot be faithfully + represented using only the other attributes. + + This should only be used as a last resort, and only set in the Git loader, + for objects too corrupt to fit the data model.""" + + def compute_hash(self) -> bytes: + """Derived model classes must implement this to compute + the object hash. + + This method is called by the object initialization if the `id` + attribute is set to an empty value. + """ + if self.raw_manifest is None: + return super().compute_hash() + else: + return _compute_hash_from_manifest(self.raw_manifest) + + def check(self) -> None: + super().check() + + if ( + self.raw_manifest is not None + and self.id == self._compute_hash_from_attributes() + ): + raise ValueError( + f"{self} has a non-none raw_manifest attribute, but does not need it." + ) + + @attr.s(frozen=True, slots=True) class Person(BaseModel): """Represents the author/committer of a revision or release.""" @@ -504,7 +542,7 @@ def unique_key(self) -> KeyType: return {"url": self.url} - def compute_hash(self) -> bytes: + def _compute_hash_from_attributes(self) -> bytes: return _compute_hash_from_manifest(self.url.encode("utf-8")) def swhid(self) -> ExtendedSWHID: @@ -648,7 +686,7 @@ ) id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"", repr=hash_repr) - def compute_hash(self) -> bytes: + def _compute_hash_from_attributes(self) -> bytes: return _compute_hash_from_manifest(git_objects.snapshot_git_object(self)) @classmethod @@ -668,7 +706,7 @@ @attr.s(frozen=True, slots=True) -class Release(HashableObject, BaseModel): +class Release(HashableObjectWithManifest, BaseModel): object_type: Final = "release" name = attr.ib(type=bytes, validator=type_validator()) @@ -687,8 +725,9 @@ default=None, ) id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"", repr=hash_repr) + raw_manifest = attr.ib(type=Optional[bytes], default=None) - def compute_hash(self) -> bytes: + def _compute_hash_from_attributes(self) -> bytes: return _compute_hash_from_manifest(git_objects.release_git_object(self)) @author.validator @@ -743,7 +782,7 @@ @attr.s(frozen=True, slots=True) -class Revision(HashableObject, BaseModel): +class Revision(HashableObjectWithManifest, BaseModel): object_type: Final = "revision" message = attr.ib(type=Optional[bytes], validator=type_validator()) @@ -770,6 +809,7 @@ converter=tuplify_extra_headers, default=(), ) + raw_manifest = attr.ib(type=Optional[bytes], default=None) def __attrs_post_init__(self): super().__attrs_post_init__() @@ -785,7 +825,7 @@ attr.validate(self) object.__setattr__(self, "metadata", metadata) - def compute_hash(self) -> bytes: + def _compute_hash_from_attributes(self) -> bytes: return _compute_hash_from_manifest(git_objects.revision_git_object(self)) @classmethod @@ -837,17 +877,18 @@ @name.validator def check_name(self, attribute, value): if b"/" in value: - raise ValueError("{value!r} is not a valid directory entry name.") + raise ValueError(f"{value!r} is not a valid directory entry name.") @attr.s(frozen=True, slots=True) -class Directory(HashableObject, BaseModel): +class Directory(HashableObjectWithManifest, BaseModel): object_type: Final = "directory" entries = attr.ib(type=Tuple[DirectoryEntry, ...], validator=type_validator()) id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"", repr=hash_repr) + raw_manifest = attr.ib(type=Optional[bytes], default=None) - def compute_hash(self) -> bytes: + def _compute_hash_from_attributes(self) -> bytes: return _compute_hash_from_manifest(git_objects.directory_git_object(self)) @entries.validator @@ -1193,7 +1234,7 @@ id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"", repr=hash_repr) - def compute_hash(self) -> bytes: + def _compute_hash_from_attributes(self) -> bytes: return _compute_hash_from_manifest( git_objects.raw_extrinsic_metadata_git_object(self) ) @@ -1395,5 +1436,5 @@ extid_version=d.get("extid_version", 0), ) - def compute_hash(self) -> bytes: + def _compute_hash_from_attributes(self) -> bytes: return _compute_hash_from_manifest(git_objects.extid_git_object(self)) diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -6,6 +6,7 @@ import collections import copy import datetime +import hashlib from typing import Any, List, Optional, Tuple, Union import attr @@ -760,6 +761,29 @@ with pytest.raises(ValueError, match="does not match recomputed hash"): directory2.check() + directory2 = attr.evolve( + directory, raw_manifest=swh.model.git_objects.directory_git_object(directory) + ) + with pytest.raises( + ValueError, match="non-none raw_manifest attribute, but does not need it." + ): + directory2.check() + + +@given(strategies.directories()) +def test_directory_raw_manifest(directory): + raw_manifest = b"foo" + id_ = hashlib.new("sha1", raw_manifest).digest() + + directory2 = attr.evolve(directory, raw_manifest=raw_manifest) + with pytest.raises(ValueError, match="does not match recomputed hash"): + directory2.check() + + directory2 = attr.evolve(directory, raw_manifest=raw_manifest, id=id_) + assert directory2.id is not None + assert directory2.id == id_ != directory.id + directory2.check() + def test_directory_entry_name_validation(): with pytest.raises(ValueError, match="valid directory entry name."): @@ -793,6 +817,29 @@ with pytest.raises(ValueError, match="does not match recomputed hash"): release2.check() + release2 = attr.evolve( + release, raw_manifest=swh.model.git_objects.release_git_object(release) + ) + with pytest.raises( + ValueError, match="non-none raw_manifest attribute, but does not need it." + ): + release2.check() + + +@given(strategies.releases()) +def test_release_raw_manifest(release): + raw_manifest = b"foo" + id_ = hashlib.new("sha1", raw_manifest).digest() + + release2 = attr.evolve(release, raw_manifest=raw_manifest) + with pytest.raises(ValueError, match="does not match recomputed hash"): + release2.check() + + release2 = attr.evolve(release, raw_manifest=raw_manifest, id=id_) + assert release2.id is not None + assert release2.id == id_ != release.id + release2.check() + # Revision @@ -805,6 +852,29 @@ with pytest.raises(ValueError, match="does not match recomputed hash"): revision2.check() + revision2 = attr.evolve( + revision, raw_manifest=swh.model.git_objects.revision_git_object(revision) + ) + with pytest.raises( + ValueError, match="non-none raw_manifest attribute, but does not need it." + ): + revision2.check() + + +@given(strategies.revisions()) +def test_revision_raw_manifest(revision): + raw_manifest = b"foo" + id_ = hashlib.new("sha1", raw_manifest).digest() + + revision2 = attr.evolve(revision, raw_manifest=raw_manifest) + with pytest.raises(ValueError, match="does not match recomputed hash"): + revision2.check() + + revision2 = attr.evolve(revision, raw_manifest=raw_manifest, id=id_) + assert revision2.id is not None + assert revision2.id == id_ != revision.id + revision2.check() + def test_revision_extra_headers_no_headers(): rev_dict = revision_example.copy()