diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -56,7 +56,7 @@ import hashlib from io import BytesIO import os -from typing import Callable, Dict +from typing import Callable, Dict, Optional ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512"]) """Hashing algorithms supported by this module""" @@ -212,12 +212,10 @@ return hashlib.new(algo) -def _new_git_hash(base_algo, git_type, length): - """Initialize a digest object (as returned by python's hashlib) for the - requested algorithm, and feed it with the header for a git object of the - given type and length. +def _git_header(git_type: str, length: int) -> bytes: + """Returns the header for a git object of the given type and length. - The header for hashing a git object consists of: + The header of a git object consists of: - The type of the object (encoded in ASCII) - One ASCII space (\x20) - The length of the object (decimal encoded in ASCII) @@ -232,15 +230,26 @@ Returns: a hashutil.hash object """ + git_object_types = { + "blob", + "tree", + "commit", + "tag", + "snapshot", + "raw_extrinsic_metadata", + "extid", + } - h = _new_hashlib_hash(base_algo) - git_header = "%s %d\0" % (git_type, length) - h.update(git_header.encode("ascii")) + if git_type not in git_object_types: + raise ValueError( + "Unexpected git object type %s, expected one of %s" + % (git_type, ", ".join(sorted(git_object_types))) + ) - return h + return ("%s %d\0" % (git_type, length)).encode("ascii") -def _new_hash(algo, length=None): +def _new_hash(algo: str, length: Optional[int] = None): """Initialize a digest object (as returned by python's hashlib) for the requested algorithm. See the constant ALGORITHMS for the list of supported algorithms. 
If a git-specific hashing algorithm is @@ -270,47 +279,13 @@ if length is None: raise ValueError("Missing length for git hashing algorithm") base_algo = algo[:-4] - return _new_git_hash(base_algo, "blob", length) + h = _new_hashlib_hash(base_algo) + h.update(_git_header("blob", length)) + return h return _new_hashlib_hash(algo) -def hash_git_data(data, git_type, base_algo="sha1"): - """Hash the given data as a git object of type git_type. - - Args: - data: a bytes object - git_type: the git object type - base_algo: the base hashing algorithm used (default: sha1) - - Returns: a dict mapping each algorithm to a bytes digest - - Raises: - ValueError if the git_type is unexpected. - """ - - git_object_types = { - "blob", - "tree", - "commit", - "tag", - "snapshot", - "raw_extrinsic_metadata", - "extid", - } - - if git_type not in git_object_types: - raise ValueError( - "Unexpected git object type %s, expected one of %s" - % (git_type, ", ".join(sorted(git_object_types))) - ) - - h = _new_git_hash(base_algo, git_type, len(data)) - h.update(data) - - return h.digest() - - @functools.lru_cache() def hash_to_hex(hash): """Converts a hash (in hex or bytes form) to its hexadecimal ascii form diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -29,7 +29,7 @@ from attrs_strict import type_validator from .exceptions import ValidationError -from .hashutil import MultiHash, hash_git_data, hash_to_bytes, hash_to_hex +from .hashutil import MultiHash, _git_header, _new_hash, hash_to_bytes, hash_to_hex class ObjectType(enum.Enum): @@ -156,6 +156,12 @@ ) +def _git_object_to_identifier_str(git_object: bytes) -> str: + h = _new_hash("sha1") + h.update(git_object) + return identifier_to_str(h.digest()) + + def content_identifier(content: Dict[str, Any]) -> Dict[str, bytes]: """Return the intrinsic identifier for a content. 
@@ -241,11 +247,11 @@ (Note that there is no separator between entries) """ - manifest = directory_manifest(directory) - return identifier_to_str(hash_git_data(manifest, "tree")) + git_object = directory_git_object(directory) + return _git_object_to_identifier_str(git_object) -def directory_manifest(directory: Dict[str, Any]) -> bytes: +def directory_git_object(directory: Dict[str, Any]) -> bytes: components = [] for entry in sorted(directory["entries"], key=directory_entry_sort_key): @@ -259,7 +265,7 @@ ] ) - return b"".join(components) + return format_list_git_object("tree", components) def format_date(date): @@ -416,12 +422,15 @@ return b" ".join(ret) -def format_manifest( - headers: Iterable[Tuple[bytes, bytes]], message: Optional[bytes] = None, +def format_git_object( + git_type: str, + headers: Iterable[Tuple[bytes, bytes]], + message: Optional[bytes] = None, ) -> bytes: - """Format a manifest comprised of a sequence of `headers` and an optional `message`. + """Format a git_object comprised of a git header and a manifest, + which is itself a sequence of `headers`, and an optional `message`. - The manifest format, compatible with the git format for tag and commit + The git_object format, compatible with the git format for tag and commit objects, is as follows: - for each `key`, `value` in `headers`, emit: @@ -441,7 +450,7 @@ message: an optional message used to trail the manifest. Returns: - the formatted manifest as bytes + the formatted git_object as bytes """ entries: List[bytes] = [] @@ -451,7 +460,19 @@ if message is not None: entries.extend((b"\n", message)) - return b"".join(entries) + concatenated_entries = b"".join(entries) + + header = _git_header(git_type, len(concatenated_entries)) + return header + concatenated_entries + + +def format_list_git_object(git_type: str, entries: Iterable[bytes]) -> bytes: + """Similar to :func:`format_git_object`, but for manifests made of a flat + list of entries, instead of key-value + message, ie. 
trees and snapshots.""" + concatenated_entries = b"".join(entries) + + header = _git_header(git_type, len(concatenated_entries)) + return header + concatenated_entries def format_author_data(author, date_offset) -> bytes: @@ -550,12 +571,12 @@ type. """ - manifest = revision_manifest(revision) - return identifier_to_str(hash_git_data(manifest, "commit")) + git_object = revision_git_object(revision) + return _git_object_to_identifier_str(git_object) -def revision_manifest(revision: Dict[str, Any]) -> bytes: - """Formats the manifest of a revision. See :func:`revision_identifier` for details +def revision_git_object(revision: Dict[str, Any]) -> bytes: + """Formats the git_object of a revision. See :func:`revision_identifier` for details on the format.""" headers = [(b"tree", identifier_to_str(revision["directory"]).encode())] for parent in revision["parents"]: @@ -580,7 +601,7 @@ headers.extend(extra_headers) - return format_manifest(headers, revision["message"]) + return format_git_object("commit", headers, revision["message"]) def target_type_to_git(target_type: str) -> bytes: @@ -596,11 +617,11 @@ def release_identifier(release: Dict[str, Any]) -> str: """Return the intrinsic identifier for a release.""" - manifest = release_manifest(release) - return identifier_to_str(hash_git_data(manifest, "tag")) + git_object = release_git_object(release) + return _git_object_to_identifier_str(git_object) -def release_manifest(release: Dict[str, Any]) -> bytes: +def release_git_object(release: Dict[str, Any]) -> bytes: headers = [ (b"object", identifier_to_str(release["target"]).encode()), (b"type", target_type_to_git(release["target_type"])), @@ -612,7 +633,7 @@ (b"tagger", format_author_data(release["author"], release["date"])) ) - return format_manifest(headers, release["message"]) + return format_git_object("tag", headers, release["message"]) def snapshot_identifier( @@ -672,14 +693,14 @@ str: the intrinsic identifier for `snapshot` """ - manifest = 
snapshot_manifest(snapshot, ignore_unresolved=ignore_unresolved) - return identifier_to_str(hash_git_data(manifest, "snapshot")) + git_object = snapshot_git_object(snapshot, ignore_unresolved=ignore_unresolved) + return _git_object_to_identifier_str(git_object) -def snapshot_manifest( +def snapshot_git_object( snapshot: Dict[str, Any], *, ignore_unresolved: bool = False ) -> bytes: - """Formats the manifest of a revision. See :func:`snapshot_identifier` for details + """Formats the git_object of a snapshot. See :func:`snapshot_identifier` for details on the format.""" unresolved = [] lines = [] @@ -715,7 +736,7 @@ unresolved, ) - return b"".join(lines) + return format_list_git_object("snapshot", lines) def origin_identifier(origin): @@ -773,12 +794,12 @@ str: the intrinsic identifier for ``metadata`` """ - manifest = raw_extrinsic_metadata_manifest(metadata) - return identifier_to_str(hash_git_data(manifest, "raw_extrinsic_metadata")) + git_object = raw_extrinsic_metadata_git_object(metadata) + return _git_object_to_identifier_str(git_object) -def raw_extrinsic_metadata_manifest(metadata: Dict[str, Any]) -> bytes: - """Formats the manifest of a raw_extrinsic_metadata object. +def raw_extrinsic_metadata_git_object(metadata: Dict[str, Any]) -> bytes: + """Formats the git_object of a raw_extrinsic_metadata object. 
See :func:`raw_extrinsic_metadata_identifier` for details on the format.""" # equivalent to using math.floor(dt.timestamp()) to round down, @@ -827,7 +848,7 @@ headers.append((key.encode("ascii"), value)) - return format_manifest(headers, metadata["metadata"]) + return format_git_object("raw_extrinsic_metadata", headers, metadata["metadata"]) def extid_identifier(extid: Dict[str, Any]) -> str: @@ -858,8 +879,8 @@ (b"target", str(extid["target"]).encode("ascii")), ] - manifest = format_manifest(headers) - return identifier_to_str(hash_git_data(manifest, "extid")) + git_object = format_git_object("extid", headers) + return _git_object_to_identifier_str(git_object) # type of the "object_type" attribute of the SWHID class; either diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -805,7 +805,7 @@ } def test_minimal(self): - manifest = ( + git_object = ( b"raw_extrinsic_metadata 210\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" b"discovery_date 1611574071\n" @@ -816,9 +816,12 @@ b'{"foo": "bar"}' ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_git_object(self.minimal), git_object, + ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.minimal), - hashlib.sha1(manifest).hexdigest(), + hashlib.sha1(git_object).hexdigest(), ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.minimal), @@ -826,7 +829,7 @@ ) def test_maximal(self): - manifest = ( + git_object = ( b"raw_extrinsic_metadata 533\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" b"discovery_date 1611574071\n" @@ -844,9 +847,12 @@ b'{"foo": "bar"}' ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_git_object(self.maximal), git_object, + ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.maximal), - hashlib.sha1(manifest).hexdigest(), + hashlib.sha1(git_object).hexdigest(), ) 
self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.maximal), @@ -858,7 +864,7 @@ **self.minimal, "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f", } - manifest = ( + git_object = ( b"raw_extrinsic_metadata 231\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" b"discovery_date 1611574071\n" @@ -871,9 +877,12 @@ b'{"foo": "bar"}' ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(manifest).hexdigest(), + hashlib.sha1(git_object).hexdigest(), ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), @@ -882,7 +891,7 @@ def test_timezone_insensitive(self): """Checks the timezone of the datetime.datetime does not affect the - hashed manifest.""" + hashed git_object.""" utc_plus_one = datetime.timezone(datetime.timedelta(hours=1)) metadata = { **self.minimal, @@ -891,6 +900,10 @@ ), } + self.assertEqual( + identifiers.raw_extrinsic_metadata_git_object(self.minimal), + identifiers.raw_extrinsic_metadata_git_object(metadata), + ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.minimal), identifiers.raw_extrinsic_metadata_identifier(metadata), @@ -910,6 +923,10 @@ ), } + self.assertEqual( + identifiers.raw_extrinsic_metadata_git_object(self.minimal), + identifiers.raw_extrinsic_metadata_git_object(metadata), + ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.minimal), identifiers.raw_extrinsic_metadata_identifier(metadata), @@ -930,6 +947,10 @@ ), } + self.assertEqual( + identifiers.raw_extrinsic_metadata_git_object(self.minimal), + identifiers.raw_extrinsic_metadata_git_object(metadata), + ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.minimal), identifiers.raw_extrinsic_metadata_identifier(metadata), @@ -947,7 +968,7 @@ ), } - manifest = ( + git_object = ( b"raw_extrinsic_metadata 210\0" b"target 
swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" b"discovery_date -313504329\n" @@ -958,9 +979,12 @@ b'{"foo": "bar"}' ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(manifest).hexdigest(), + hashlib.sha1(git_object).hexdigest(), ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), @@ -975,7 +999,7 @@ ), } - manifest = ( + git_object = ( b"raw_extrinsic_metadata 201\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" b"discovery_date 0\n" @@ -986,9 +1010,12 @@ b'{"foo": "bar"}' ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(manifest).hexdigest(), + hashlib.sha1(git_object).hexdigest(), ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), @@ -1003,7 +1030,7 @@ ), } - manifest = ( + git_object = ( b"raw_extrinsic_metadata 202\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" b"discovery_date -1\n" @@ -1014,9 +1041,12 @@ b'{"foo": "bar"}' ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(manifest).hexdigest(), + hashlib.sha1(git_object).hexdigest(), ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata),