Patch notes (free text ahead of the first "diff --git" header).

This patch was restored from a copy whose newlines had been collapsed. It is
laid out again as one patch record per line. The indentation of the Python
payload and the position of blank context lines were inferred from Python
structure and from each hunk's line counts; run `git apply --check` against
the target tree before relying on it.

What the patch changes:
  * swh/model/hashutil.py: adds "sha384" and "sha512" to ALGORITHMS.
  * swh/model/model.py: imports ALGORITHMS; _hash_data() and from_data() gain
    a hash_names parameter (default DEFAULT_ALGORITHMS) that is forwarded to
    MultiHash.from_data(); get_hash() now validates the name against
    ALGORITHMS and raises ValueError when the stored value is None; two
    attribute groups gain optional sha384 / sha512 / blake2s256 / blake2b512 /
    md5 fields defaulting to None.
  * swh/model/tests: tests follow the above (ALGORITHMS instead of
    DEFAULT_ALGORITHMS, new ValueError cases for get_hash()).

NOTE(review): the classes enclosing the two attribute hunks are not visible
here. Confirm that the added blake2s256 field does not shadow an attribute
those classes already declare.
NOTE(review): get_hash() now raises for a hash whose stored value is None.
Confirm no caller relied on receiving None.

diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -58,7 +58,18 @@
 import os
 from typing import Callable, Dict, Optional, Union
 
-ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5"])
+ALGORITHMS = set(
+    [
+        "sha1",
+        "sha256",
+        "sha1_git",
+        "blake2s256",
+        "blake2b512",
+        "sha384",
+        "sha512",
+        "md5",
+    ]
+)
 """Hashing algorithms supported by this module"""
 
 DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"])
diff --git a/swh/model/model.py b/swh/model/model.py
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -32,7 +32,13 @@
 
 from . import git_objects
 from .collections import ImmutableDict
-from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex, hash_to_hex
+from .hashutil import (
+    ALGORITHMS,
+    DEFAULT_ALGORITHMS,
+    MultiHash,
+    hash_to_bytehex,
+    hash_to_hex,
+)
 from .swhids import CoreSWHID
 from .swhids import ExtendedObjectType as SwhidExtendedObjectType
 from .swhids import ExtendedSWHID
@@ -1291,9 +1297,9 @@
     )
 
     @staticmethod
-    def _hash_data(data: bytes):
+    def _hash_data(data: bytes, hash_names=DEFAULT_ALGORITHMS):
         """Hash some data, returning most of the fields of a content object"""
-        d = MultiHash.from_data(data).digest()
+        d = MultiHash.from_data(data, hash_names).digest()
         d["data"] = data
         d["length"] = len(data)
 
@@ -1311,9 +1317,12 @@
         return super().from_dict(d)
 
     def get_hash(self, hash_name):
-        if hash_name not in DEFAULT_ALGORITHMS:
+        if hash_name not in ALGORITHMS:
             raise ValueError("{} is not a valid hash name.".format(hash_name))
-        return getattr(self, hash_name)
+        value = getattr(self, hash_name)
+        if value is None:
+            raise ValueError("Content objects do not store {} hashes".format(hash_name))
+        return value
 
     def hashes(self) -> Dict[str, bytes]:
         """Returns a dictionary {hash_name: hash_value}"""
@@ -1337,6 +1346,36 @@
         default="visible",
     )
 
+    sha384 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
+    sha512 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
+    blake2s256 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
+    blake2b512 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
+    md5 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
     data = attr.ib(type=Optional[bytes], validator=generic_type_validator, default=None)
 
     ctime = attr.ib(
@@ -1371,13 +1410,15 @@
         return content
 
     @classmethod
-    def from_data(cls, data, status="visible", ctime=None) -> "Content":
+    def from_data(
+        cls, data, status="visible", ctime=None, hash_names=DEFAULT_ALGORITHMS
+    ) -> "Content":
         """Generate a Content from a given `data` byte string.
 
         This populates the Content with the hashes and length for the data
         passed as argument, as well as the data itself.
         """
-        d = cls._hash_data(data)
+        d = cls._hash_data(data, hash_names)
         d["status"] = status
         d["ctime"] = ctime
         return cls(**d)
@@ -1437,6 +1478,36 @@
         default=None,
         eq=False,
     )
+    sha384 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
+    sha512 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
+    blake2s256 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
+    blake2b512 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
+    md5 = attr.ib(
+        type=Optional[bytes],
+        validator=generic_type_validator,
+        repr=hash_repr,
+        default=None,
+    )
 
     @reason.validator
     def check_reason(self, attribute, value):
diff --git a/swh/model/tests/test_hypothesis_strategies.py b/swh/model/tests/test_hypothesis_strategies.py
--- a/swh/model/tests/test_hypothesis_strategies.py
+++ b/swh/model/tests/test_hypothesis_strategies.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -9,7 +9,7 @@
 from hypothesis import given, settings
 import iso8601
 
-from swh.model.hashutil import DEFAULT_ALGORITHMS
+from swh.model.hashutil import ALGORITHMS, DEFAULT_ALGORITHMS
 from swh.model.hypothesis_strategies import (
     aware_datetimes,
     contents,
@@ -128,7 +128,7 @@
     obj_dict = object_.to_dict()
     assert_nested_dict(obj_dict)
     if object_type in ("content", "skipped_content"):
-        COMMON_KEYS = set(DEFAULT_ALGORITHMS) | {"length", "status"}
+        COMMON_KEYS = set(ALGORITHMS) | {"length", "status"}
         if object_.ctime is not None:
             COMMON_KEYS |= {"ctime"}
 
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -19,7 +19,7 @@
 from swh.model.collections import ImmutableDict
 from swh.model.from_disk import DentryPerms
 import swh.model.git_objects
-from swh.model.hashutil import MultiHash, hash_to_bytes
+from swh.model.hashutil import ALGORITHMS, MultiHash, hash_to_bytes
 import swh.model.hypothesis_strategies as strategies
 import swh.model.model
 from swh.model.model import (
@@ -748,6 +748,13 @@
     for (hash_name, hash_) in hashes.items():
         assert c.get_hash(hash_name) == hash_
 
+    with pytest.raises(ValueError, match="not a valid"):
+        c.get_hash("inexisting-hash-algo")
+
+    for hash_name in ALGORITHMS - set(hashes.keys()):
+        with pytest.raises(ValueError, match="do not store"):
+            c.get_hash(hash_name)
+
 
 def test_content_hashes():
     hashes = dict(sha1=b"foo", sha1_git=b"bar", sha256=b"baz", blake2s256=b"qux")
@@ -826,11 +833,11 @@
 
 @given(binary(max_size=4096))
 def test_content_from_data(data):
-    c = Content.from_data(data)
+    c = Content.from_data(data, hash_names=ALGORITHMS)
     assert c.data == data
     assert c.length == len(data)
     assert c.status == "visible"
-    for key, value in MultiHash.from_data(data).digest().items():
+    for key, value in MultiHash.from_data(data, hash_names=ALGORITHMS).digest().items():
         assert getattr(c, key) == value
 
 