diff --git a/PKG-INFO b/PKG-INFO index e14129d..92d127e 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,42 +1,42 @@ Metadata-Version: 2.1 Name: swh.model -Version: 6.5.0 +Version: 6.5.1 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-model Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-model/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: cli Provides-Extra: testing-minimal Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-model ========= Implementation of the Data model of the Software Heritage project, used to archive source code artifacts. This module defines the notion of SoftWare Heritage persistent IDentifiers (SWHIDs) and provides tools to compute them: ```sh $ swh-identify fork.c kmod.c sched/deadline.c swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 sched/deadline.c $ swh-identify --no-filename /usr/src/linux/kernel/ swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab ``` diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index e14129d..92d127e 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,42 +1,42 @@ Metadata-Version: 2.1 Name: swh.model -Version: 6.5.0 +Version: 6.5.1 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-model Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-model/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: cli Provides-Extra: testing-minimal Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-model ========= Implementation of the Data model of the Software Heritage project, used to archive source code artifacts. 
This module defines the notion of SoftWare Heritage persistent IDentifiers (SWHIDs) and provides tools to compute them: ```sh $ swh-identify fork.c kmod.c sched/deadline.c swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 sched/deadline.c $ swh-identify --no-filename /usr/src/linux/kernel/ swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab ``` diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py index 86ecf12..8795b1f 100644 --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -1,588 +1,592 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Conversion from filesystem tree to SWH objects. This module allows reading a tree of directories and files from a local filesystem, and converting them to in-memory data structures, which can then be exported to SWH data model objects, as defined in :mod:`swh.model.model`. """ import datetime import enum import fnmatch import glob import os import re import stat from typing import Any, Iterable, Iterator, List, Optional, Pattern, Tuple import attr from attrs_strict import type_validator from typing_extensions import Final from . import model from .exceptions import InvalidDirectoryPath from .git_objects import directory_entry_sort_key from .hashutil import MultiHash, hash_to_hex from .merkle import MerkleLeaf, MerkleNode from .swhids import CoreSWHID, ObjectType @attr.s(frozen=True, slots=True) class DiskBackedContent(model.BaseContent): """Content-like class, which allows lazy-loading data from the disk.""" object_type: Final = "content_file" sha1 = attr.ib(type=bytes, validator=type_validator()) sha1_git = attr.ib(type=model.Sha1Git, validator=type_validator()) sha256 = attr.ib(type=bytes, validator=type_validator()) blake2s256 = attr.ib(type=bytes, validator=type_validator()) length = attr.ib(type=int, validator=type_validator()) status = attr.ib( type=str, validator=attr.validators.in_(["visible", "hidden"]), default="visible", ) ctime = attr.ib( type=Optional[datetime.datetime], validator=type_validator(), default=None, eq=False, ) path = attr.ib(type=Optional[bytes], default=None) @classmethod def from_dict(cls, d): return cls(**d) def __attrs_post_init__(self): if self.path is None: raise TypeError("path must not be None.") def with_data(self) -> model.Content: args = self.to_dict() del args["path"] assert self.path is not None with open(self.path, "rb") as fd: return model.Content.from_dict({**args, "data": fd.read()}) class DentryPerms(enum.IntEnum): """Admissible permissions for directory entries.""" content = 0o100644 """Content""" executable_content = 0o100755 """Executable content (e.g. executable script)""" symlink = 0o120000 """Symbolic link""" directory = 0o040000 """Directory""" revision = 0o160000 """Revision (e.g.
submodule)""" def mode_to_perms(mode): """Convert a file mode to a permission compatible with Software Heritage directory entries Args: mode (int): a file mode as returned by :func:`os.stat` in :attr:`os.stat_result.st_mode` Returns: DentryPerms: one of the following values: :const:`DentryPerms.content`: plain file :const:`DentryPerms.executable_content`: executable file :const:`DentryPerms.symlink`: symbolic link :const:`DentryPerms.directory`: directory """ if stat.S_ISLNK(mode): return DentryPerms.symlink if stat.S_ISDIR(mode): return DentryPerms.directory else: # file is executable in any way if mode & (0o111): return DentryPerms.executable_content else: return DentryPerms.content class Content(MerkleLeaf): """Representation of a Software Heritage content as a node in a Merkle tree. The current Merkle hash for the Content nodes is the `sha1_git`, which makes it consistent with what :class:`Directory` uses for its own hash computation. """ __slots__ = [] # type: List[str] object_type: Final = "content" @classmethod def from_bytes(cls, *, mode, data): """Convert data (raw :class:`bytes`) to a Software Heritage content entry Args: mode (int): a file mode (passed to :func:`mode_to_perms`) data (bytes): raw contents of the file """ ret = MultiHash.from_data(data).digest() ret["length"] = len(data) ret["perms"] = mode_to_perms(mode) ret["data"] = data ret["status"] = "visible" return cls(ret) @classmethod def from_symlink(cls, *, path, mode): """Convert a symbolic link to a Software Heritage content entry""" return cls.from_bytes(mode=mode, data=os.readlink(path)) @classmethod def from_file(cls, *, path, max_content_length=None): """Compute the Software Heritage content entry corresponding to an on-disk file. The returned dictionary contains keys useful for both: - loading the content in the archive (hashes, `length`) - using the content as a directory entry in a directory Args: save_path (bool): add the file path to the entry max_content_length (Optional[int]): if given, all contents larger than this will be skipped. """ file_stat = os.lstat(path) mode = file_stat.st_mode length = file_stat.st_size too_large = max_content_length is not None and length > max_content_length if stat.S_ISLNK(mode): # Symbolic link: return a file whose contents are the link target if too_large: # Unlike large contents, we can't stream symlinks to # MultiHash, and we don't want to fit them in memory if # they exceed max_content_length either. # Thankfully, this should not happen for reasonable values of # max_content_length because of OS/filesystem limitations, # so let's just raise an error. 
raise Exception(f"Symlink too large ({length} bytes)") return cls.from_symlink(path=path, mode=mode) elif not stat.S_ISREG(mode): # not a regular file: return the empty file instead return cls.from_bytes(mode=mode, data=b"") if too_large: skip_reason = "Content too large" else: skip_reason = None hashes = MultiHash.from_path(path).digest() if skip_reason: ret = { **hashes, "status": "absent", "reason": skip_reason, } else: ret = { **hashes, "status": "visible", } ret["path"] = path ret["perms"] = mode_to_perms(mode) ret["length"] = length obj = cls(ret) return obj def swhid(self) -> CoreSWHID: """Return node identifier as a SWHID""" return CoreSWHID(object_type=ObjectType.CONTENT, object_id=self.hash) def __repr__(self): return "Content(id=%s)" % hash_to_hex(self.hash) def compute_hash(self): return self.data["sha1_git"] def to_model(self) -> model.BaseContent: """Builds a `model.BaseContent` object based on this leaf.""" data = self.get_data().copy() data.pop("perms", None) if data["status"] == "absent": data.pop("path", None) return model.SkippedContent.from_dict(data) elif "data" in data: return model.Content.from_dict(data) else: return DiskBackedContent.from_dict(data) def accept_all_directories(dirpath: str, dirname: str, entries: Iterable[Any]) -> bool: """Default filter for :func:`Directory.from_disk` accepting all directories Args: dirname (bytes): directory name entries (list): directory entries """ return True def ignore_empty_directories( dirpath: str, dirname: str, entries: Iterable[Any] ) -> bool: """Filter for :func:`directory_to_objects` ignoring empty directories Args: dirname (bytes): directory name entries (list): directory entries Returns: True if the directory is not empty, false if the directory is empty """ return bool(entries) def ignore_named_directories(names, *, case_sensitive=True): """Filter for :func:`directory_to_objects` to ignore directories named one of names. Args: names (list of bytes): names to ignore case_sensitive (bool): whether to do the filtering in a case sensitive way Returns: a directory filter for :func:`directory_to_objects` """ if not case_sensitive: names = [name.lower() for name in names] def named_filter( dirpath: str, dirname: str, entries: Iterable[Any], names: Iterable[Any] = names, case_sensitive: bool = case_sensitive, ): if case_sensitive: return dirname not in names else: return dirname.lower() not in names return named_filter # TODO: `extract_regex_objs` has been copied and adapted from `swh.scanner`. # In the future `swh.scanner` should use the `swh.model` version and remove its own. def extract_regex_objs( root_path: bytes, patterns: Iterable[bytes] ) -> Iterator[Pattern[bytes]]: """Generates a regex object for each pattern given in input and checks if the path is a subdirectory or relative to the root path. Args: root_path (bytes): path to the root directory - patterns (list of byte): patterns to match + patterns (list of byte): shell patterns to match Yields: an SRE_Pattern object """ absolute_root_path = os.path.abspath(root_path) for pattern in patterns: - for path in glob.glob(pattern): - absolute_path = os.path.abspath(path) - if not absolute_path.startswith(absolute_root_path): + if os.path.isabs(pattern): + pattern = os.path.relpath(pattern, root_path) + # python 3.10 has a `root_dir` argument for glob, but not the previous + # version. 
So we adjust the pattern + test_pattern = os.path.join(absolute_root_path, pattern) + for path in glob.glob(test_pattern): + if os.path.isabs(path) and not path.startswith(absolute_root_path): error_msg = ( b'The path "' + path + b'" is not a subdirectory or relative ' b'to the root directory path: "' + root_path + b'"' ) raise InvalidDirectoryPath(error_msg) regex = fnmatch.translate(pattern.decode()) yield re.compile(regex.encode()) def ignore_directories_patterns(root_path: bytes, patterns: Iterable[bytes]): """Filter for :func:`directory_to_objects` to ignore directories matching certain patterns. Args: root_path (bytes): path of the root directory - patterns (list of byte): patterns to ignore + patterns (list of bytes): patterns to ignore Returns: a directory filter for :func:`directory_to_objects` """ sre_patterns = set(extract_regex_objs(root_path, patterns)) def pattern_filter( dirpath: bytes, dirname: bytes, entries: Iterable[Any], patterns: Iterable[Any] = sre_patterns, root_path: bytes = os.path.abspath(root_path), ): full_path = os.path.abspath(dirpath) relative_path = os.path.relpath(full_path, root_path) return not any([pattern.match(relative_path) for pattern in patterns]) return pattern_filter def iter_directory( directory, ) -> Tuple[List[model.Content], List[model.SkippedContent], List[model.Directory]]: """Return the directory listing from a :class:`Directory` instance loaded from disk. Raises: TypeError in case an unexpected object type is listed. Returns: A tuple of lists of contents, skipped contents, and directories, respectively. """ contents: List[model.Content] = [] skipped_contents: List[model.SkippedContent] = [] directories: List[model.Directory] = [] for obj in directory.iter_tree(): obj = obj.to_model() obj_type = obj.object_type if obj_type in (model.Content.object_type, DiskBackedContent.object_type): # FIXME: read the data from disk later (when the # storage buffer is flushed). obj = obj.with_data() contents.append(obj) elif obj_type == model.SkippedContent.object_type: skipped_contents.append(obj) elif obj_type == model.Directory.object_type: directories.append(obj) else: raise TypeError(f"Unexpected object type from disk: {obj}") return contents, skipped_contents, directories class Directory(MerkleNode): """Representation of a Software Heritage directory as a node in a Merkle Tree. This class can be used to generate, from an on-disk directory, all the objects that need to be sent to the Software Heritage archive. The :func:`from_disk` constructor allows you to generate the data structure from a directory on disk. The resulting :class:`Directory` can then be manipulated as a dictionary, using the path as key. The :func:`collect` method is used to retrieve all the objects that need to be added to the Software Heritage archive since the last collection, by class (contents and directories). When using the dict-like methods to update the contents of the directory, the affected levels of hierarchy are reset and can be collected again using the same method. This enables the efficient collection of updated nodes, for instance when the client is applying diffs.
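A minimal usage sketch (the on-disk path here is hypothetical):

.. code-block:: python

    from swh.model.from_disk import Directory, iter_directory

    root = Directory.from_disk(path=b"/srv/src/project")
    print(root.swhid())  # swh:1:dir:...
    contents, skipped_contents, directories = iter_directory(root)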
""" __slots__ = ["__entries", "__model_object"] object_type: Final = "directory" @classmethod def from_disk( cls, *, path, dir_filter=accept_all_directories, max_content_length=None ): """Compute the Software Heritage objects for a given directory tree Args: path (bytes): the directory to traverse data (bool): whether to add the data to the content objects save_path (bool): whether to add the path to the content objects dir_filter (function): a filter to ignore some directories by name or contents. Takes two arguments: dirname and entries, and returns True if the directory should be added, False if the directory should be ignored. max_content_length (Optional[int]): if given, all contents larger than this will be skipped. """ top_path = path dirs = {} for root, dentries, fentries in os.walk(top_path, topdown=False): entries = {} # Join fentries and dentries in the same processing, as symbolic # links to directories appear in dentries... for name in fentries + dentries: path = os.path.join(root, name) if not os.path.isdir(path) or os.path.islink(path): content = Content.from_file( path=path, max_content_length=max_content_length ) entries[name] = content else: if dir_filter(path, name, dirs[path].entries): entries[name] = dirs[path] dirs[root] = cls({"name": os.path.basename(root), "path": root}) dirs[root].update(entries) return dirs[top_path] def __init__(self, data=None): super().__init__(data=data) self.__entries = None self.__model_object = None def invalidate_hash(self): self.__entries = None self.__model_object = None super().invalidate_hash() @staticmethod def child_to_directory_entry(name, child): if child.object_type == "directory": return { "type": "dir", "perms": DentryPerms.directory, "target": child.hash, "name": name, } elif child.object_type == "content": return { "type": "file", "perms": child.data["perms"], "target": child.hash, "name": name, } else: raise ValueError(f"unknown child {child}") def get_data(self, **kwargs): return { "id": self.hash, "entries": self.entries, } @property def entries(self): """Child nodes, sorted by name in the same way :func:`swh.model.git_objects.directory_git_object` does.""" if self.__entries is None: self.__entries = sorted( ( self.child_to_directory_entry(name, child) for name, child in self.items() ), key=directory_entry_sort_key, ) return self.__entries def swhid(self) -> CoreSWHID: """Return node identifier as a SWHID""" return CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=self.hash) def compute_hash(self): return self.to_model().id def to_model(self) -> model.Directory: """Builds a `model.Directory` object based on this node; ignoring its children.""" if self.__model_object is None: DirectoryEntry = model.DirectoryEntry entries = [] for name, child in self.items(): if child.object_type == "directory": e = DirectoryEntry( type="dir", perms=DentryPerms.directory, target=child.hash, name=name, ) elif child.object_type == "content": e = DirectoryEntry( type="file", perms=child.data["perms"], target=child.hash, name=name, ) else: raise ValueError(f"unknown child {child}") entries.append(e) entries.sort(key=directory_entry_sort_key) self.__model_object = model.Directory(entries=tuple(entries)) return self.__model_object def __getitem__(self, key): if not isinstance(key, bytes): raise ValueError("Can only get a bytes from Directory") # Convenience shortcut if key == b"": return self if b"/" not in key: return super().__getitem__(key) else: key1, key2 = key.split(b"/", 1) return self.__getitem__(key1)[key2] def __setitem__(self, key, 
value): if not isinstance(key, bytes): raise ValueError("Can only set a bytes Directory entry") if not isinstance(value, (Content, Directory)): raise ValueError( "Can only set a Directory entry to a Content or " "Directory" ) if key == b"": raise ValueError("Directory entry must have a name") if b"\x00" in key: raise ValueError("Directory entry name must not contain nul bytes") if b"/" not in key: return super().__setitem__(key, value) else: key1, key2 = key.rsplit(b"/", 1) self[key1].__setitem__(key2, value) def __delitem__(self, key): if not isinstance(key, bytes): raise ValueError("Can only delete a bytes Directory entry") if b"/" not in key: super().__delitem__(key) else: key1, key2 = key.rsplit(b"/", 1) del self[key1][key2] def __contains__(self, key): if b"/" not in key: return super().__contains__(key) else: key1, key2 = key.split(b"/", 1) return super().__contains__(key1) and self[key1].__contains__(key2) def __repr__(self): return "Directory(id=%s, entries=[%s])" % ( hash_to_hex(self.hash), ", ".join(str(entry) for entry in self), ) diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index 75d9f8b..8d2cb90 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -1,351 +1,353 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of hashing function definitions. This is the base module used to compute swh's hashes. Only a subset of hashing algorithms is supported as defined in the ALGORITHMS set. Any provided algorithms not in that list will result in a ValueError explaining the error. This module defines a MultiHash class to ease the computation of the Software Heritage hashes. It allows computing hashes from a file object, a path, or raw data, using an interface similar to the one the standard hashlib module provides. Basic usage examples: - file object: MultiHash.from_file( file_object, hash_names=DEFAULT_ALGORITHMS).digest() - path (filepath): MultiHash.from_path(b'foo').hexdigest() - data (bytes): MultiHash.from_data(b'foo').bytehexdigest() "Complex" usage, defining a swh hashlib instance first: - To compute the length as well, add 'length' to the set of algorithms, for example: .. code-block:: python h = MultiHash(hash_names=set({'length'}).union(DEFAULT_ALGORITHMS)) with open(filepath, 'rb') as f: h.update(f.read(HASH_BLOCK_SIZE)) hashes = h.digest() # returns a dict of {hash_algo_name: hash_in_bytes} - Compute hashes while writing a stream to disk, for example: .. code-block:: python h = MultiHash(length=length) with open(filepath, 'wb') as f: for chunk in r.iter_content(): # r is a stream of some sort h.update(chunk) f.write(chunk) hashes = h.hexdigest() # returns a dict of {hash_algo_name: hash_in_hex} """ import binascii import functools import hashlib from io import BytesIO import os from typing import Callable, Dict, Optional, Union -ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5"]) +ALGORITHMS = set( + ["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5", "sha512"] +) """Hashing algorithms supported by this module""" DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"]) """Algorithms computed by default when calling the functions from this module. Subset of :const:`ALGORITHMS`.
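Algorithms outside this default set (e.g. the newly supported sha512) can be requested explicitly, for example:

.. code-block:: python

    MultiHash.from_data(b'foo', hash_names=DEFAULT_ALGORITHMS | {'sha512'}).hexdigest()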
""" HASH_BLOCK_SIZE = 32768 """Block size for streaming hash computations made in this module""" _blake2_hash_cache = {} # type: Dict[str, Callable] class MultiHash: """Hashutil class to support multiple hashes computation. Args: hash_names (set): Set of hash algorithms (+ optionally length) to compute hashes (cf. DEFAULT_ALGORITHMS) length (int): Length of the total sum of chunks to read If the length is provided as algorithm, the length is also computed and returned. """ def __init__(self, hash_names=DEFAULT_ALGORITHMS, length=None): self.state = {} self.track_length = False for name in hash_names: if name == "length": self.state["length"] = 0 self.track_length = True else: self.state[name] = _new_hash(name, length) @classmethod def from_state(cls, state, track_length): ret = cls([]) ret.state = state ret.track_length = track_length @classmethod def from_file(cls, fobj, hash_names=DEFAULT_ALGORITHMS, length=None): ret = cls(length=length, hash_names=hash_names) while True: chunk = fobj.read(HASH_BLOCK_SIZE) if not chunk: break ret.update(chunk) return ret @classmethod def from_path(cls, path, hash_names=DEFAULT_ALGORITHMS): length = os.path.getsize(path) with open(path, "rb") as f: ret = cls.from_file(f, hash_names=hash_names, length=length) return ret @classmethod def from_data(cls, data, hash_names=DEFAULT_ALGORITHMS): length = len(data) fobj = BytesIO(data) return cls.from_file(fobj, hash_names=hash_names, length=length) def update(self, chunk): for name, h in self.state.items(): if name == "length": continue h.update(chunk) if self.track_length: self.state["length"] += len(chunk) def digest(self): return { name: h.digest() if name != "length" else h for name, h in self.state.items() } def hexdigest(self): return { name: h.hexdigest() if name != "length" else h for name, h in self.state.items() } def bytehexdigest(self): return { name: hash_to_bytehex(h.digest()) if name != "length" else h for name, h in self.state.items() } def copy(self): copied_state = { name: h.copy() if name != "length" else h for name, h in self.state.items() } return self.from_state(copied_state, self.track_length) def _new_blake2_hash(algo): """Return a function that initializes a blake2 hash.""" if algo in _blake2_hash_cache: return _blake2_hash_cache[algo]() lalgo = algo.lower() if not lalgo.startswith("blake2"): raise ValueError("Algorithm %s is not a blake2 hash" % algo) blake_family = lalgo[:7] digest_size = None if lalgo[7:]: try: digest_size, remainder = divmod(int(lalgo[7:]), 8) except ValueError: raise ValueError("Unknown digest size for algo %s" % algo) from None if remainder: raise ValueError( "Digest size for algorithm %s must be a multiple of 8" % algo ) blake2 = getattr(hashlib, blake_family) _blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size) return _blake2_hash_cache[algo]() def _new_hashlib_hash(algo): """Initialize a digest object from hashlib. Handle the swh-specific names for the blake2-related algorithms """ if algo.startswith("blake2"): return _new_blake2_hash(algo) else: return hashlib.new(algo) def git_object_header(git_type: str, length: int) -> bytes: """Returns the header for a git object of the given type and length. 
The header of a git object consists of: - The type of the object (encoded in ASCII) - One ASCII space (\x20) - The length of the object (decimal encoded in ASCII) - One NUL byte Args: git_type: the type of the git object (one of 'blob', 'tree', 'commit', 'tag', 'snapshot', 'raw_extrinsic_metadata', 'extid') length: the length of the git object you're encoding Returns: the git object header, as ascii-encoded bytes """ git_object_types = { "blob", "tree", "commit", "tag", "snapshot", "raw_extrinsic_metadata", "extid", } if git_type not in git_object_types: raise ValueError( "Unexpected git object type %s, expected one of %s" % (git_type, ", ".join(sorted(git_object_types))) ) return ("%s %d\0" % (git_type, length)).encode("ascii") def _new_hash(algo: str, length: Optional[int] = None): """Initialize a digest object (as returned by python's hashlib) for the requested algorithm. See the constant ALGORITHMS for the list of supported algorithms. If a git-specific hashing algorithm is requested (e.g., "sha1_git"), the hashing object will be pre-fed with the needed header; for this to work, length must be given. Args: algo (str): a hashing algorithm (one of ALGORITHMS) length (int): the length of the hashed payload (needed for git-specific algorithms) Returns: a hashutil.hash object Raises: ValueError if algo is unknown, or length is missing for a git-specific hash. """ if algo not in ALGORITHMS: raise ValueError( "Unexpected hashing algorithm %s, expected one of %s" % (algo, ", ".join(sorted(ALGORITHMS))) ) if algo.endswith("_git"): if length is None: raise ValueError("Missing length for git hashing algorithm") base_algo = algo[:-4] h = _new_hashlib_hash(base_algo) h.update(git_object_header("blob", length)) return h return _new_hashlib_hash(algo) def hash_git_data(data, git_type, base_algo="sha1"): """Hash the given data as a git object of type git_type. Args: data: a bytes object git_type: the git object type base_algo: the base hashing algorithm used (default: sha1) Returns: the bytes digest of the git object, computed with base_algo Raises: ValueError if the git_type is unexpected.
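For example, with the blob test data used by this patch's test suite:

.. code-block:: python

    # sha1 of b"blob 3\x00" + b"42\n", i.e. what `git hash-object` computes
    digest = hash_git_data(b"42\n", "blob")
    assert digest.hex() == "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"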
""" h = _new_hashlib_hash(base_algo) h.update(git_object_header(git_type, len(data))) h.update(data) return h.digest() @functools.lru_cache() def hash_to_hex(hash: Union[str, bytes]) -> str: """Converts a hash (in hex or bytes form) to its hexadecimal ascii form Args: hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing the hexadecimal form of the hash Returns: str: the hexadecimal form of the hash """ if isinstance(hash, str): return hash return binascii.hexlify(hash).decode("ascii") @functools.lru_cache() def hash_to_bytehex(hash: bytes) -> bytes: """Converts a hash to its hexadecimal bytes representation Args: hash (bytes): a :class:`bytes` hash Returns: bytes: the hexadecimal form of the hash, as :class:`bytes` """ return binascii.hexlify(hash) @functools.lru_cache() def hash_to_bytes(hash: Union[str, bytes]) -> bytes: """Converts a hash (in hex or bytes form) to its raw bytes form Args: hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing the hexadecimal form of the hash Returns: bytes: the :class:`bytes` form of the hash """ if isinstance(hash, bytes): return hash return bytes.fromhex(hash) @functools.lru_cache() def bytehex_to_hash(hex: bytes) -> bytes: """Converts a hexadecimal bytes representation of a hash to that hash Args: hash (bytes): a :class:`bytes` containing the hexadecimal form of the hash encoded in ascii Returns: bytes: the :class:`bytes` form of the hash """ return hash_to_bytes(hex.decode()) diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py index 1ab2812..b279f2f 100644 --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -1,324 +1,334 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import contextlib import hashlib import io import os import tempfile from unittest.mock import patch import pytest from swh.model import hashutil from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex @contextlib.contextmanager def patch_blake2(function_name): try: with patch(function_name) as mock: yield mock finally: # mocking blake2 inserts mock objects in the cache; we need # to clean it before the next test runs hashutil._blake2_hash_cache.clear() @pytest.fixture(autouse=True) def blake2_hash_cache_reset(): # Reset function cache hashutil._blake2_hash_cache = {} @pytest.fixture def hash_test_data(): class HashTestData: data = b"1984\n" hex_checksums = { "sha1": "62be35bf00ff0c624f4a621e2ea5595a049e0731", "sha1_git": "568aaf43d83b2c3df8067f3bedbb97d83260be6d", "sha256": "26602113b4b9afd9d55466b08580d3c2" "4a9b50ee5b5866c0d91fab0e65907311", "blake2s256": "63cfb259e1fdb485bc5c55749697a6b21ef31fb7445f6c78a" "c9422f9f2dc8906", } checksums = { type: bytes.fromhex(cksum) for type, cksum in hex_checksums.items() } bytehex_checksums = { type: hashutil.hash_to_bytehex(cksum) for type, cksum in checksums.items() } git_hex_checksums = { "blob": hex_checksums["sha1_git"], "tree": "5b2e883aa33d2efab98442693ea4dd5f1b8871b0", "commit": "79e4093542e72f0fcb7cbd75cb7d270f9254aa8f", "tag": "d6bf62466f287b4d986c545890716ce058bddf67", } git_checksums = { type: bytes.fromhex(cksum) for type, cksum in git_hex_checksums.items() } return HashTestData def test_multi_hash_data(hash_test_data): checksums = MultiHash.from_data(hash_test_data.data).digest() assert checksums == 
hash_test_data.checksums assert "length" not in checksums def test_multi_hash_data_with_length(hash_test_data): expected_checksums = hash_test_data.checksums.copy() expected_checksums["length"] = len(hash_test_data.data) algos = set(["length"]).union(hashutil.DEFAULT_ALGORITHMS) checksums = MultiHash.from_data(hash_test_data.data, hash_names=algos).digest() assert checksums == expected_checksums assert "length" in checksums def test_multi_hash_data_unknown_hash(hash_test_data): with pytest.raises(ValueError, match="Unexpected hashing algorithm.*unknown-hash"): MultiHash.from_data(hash_test_data.data, ["unknown-hash"]) def test_multi_hash_file(hash_test_data): fobj = io.BytesIO(hash_test_data.data) checksums = MultiHash.from_file(fobj, length=len(hash_test_data.data)).digest() assert checksums == hash_test_data.checksums def test_multi_hash_file_hexdigest(hash_test_data): fobj = io.BytesIO(hash_test_data.data) length = len(hash_test_data.data) checksums = MultiHash.from_file(fobj, length=length).hexdigest() assert checksums == hash_test_data.hex_checksums def test_multi_hash_file_bytehexdigest(hash_test_data): fobj = io.BytesIO(hash_test_data.data) length = len(hash_test_data.data) checksums = MultiHash.from_file(fobj, length=length).bytehexdigest() assert checksums == hash_test_data.bytehex_checksums -def test_multi_hash_file_with_md5(hash_test_data): +EXTRA_HASH_ALGOS = ["md5", "sha512"] + + +@pytest.mark.parametrize("hash_algo", EXTRA_HASH_ALGOS) +def test_multi_hash_file_with_extra_hash_algo(hash_test_data, hash_algo): fobj = io.BytesIO(hash_test_data.data) checksums = MultiHash.from_file( - fobj, hash_names=DEFAULT_ALGORITHMS | {"md5"}, length=len(hash_test_data.data) + fobj, + hash_names=DEFAULT_ALGORITHMS | {hash_algo}, + length=len(hash_test_data.data), ).digest() - md5sum = {"md5": hashlib.md5(hash_test_data.data).digest()} - assert checksums == {**hash_test_data.checksums, **md5sum} + checksum = {hash_algo: hashlib.new(hash_algo, hash_test_data.data).digest()} + assert checksums == {**hash_test_data.checksums, **checksum} -def test_multi_hash_file_hexdigest_with_md5(hash_test_data): +@pytest.mark.parametrize("hash_algo", EXTRA_HASH_ALGOS) +def test_multi_hash_file_hexdigest_with_extra_hash_algo(hash_test_data, hash_algo): fobj = io.BytesIO(hash_test_data.data) length = len(hash_test_data.data) checksums = MultiHash.from_file( - fobj, hash_names=DEFAULT_ALGORITHMS | {"md5"}, length=length + fobj, hash_names=DEFAULT_ALGORITHMS | {hash_algo}, length=length ).hexdigest() - md5sum = {"md5": hashlib.md5(hash_test_data.data).hexdigest()} - assert checksums == {**hash_test_data.hex_checksums, **md5sum} + checksum = {hash_algo: hashlib.new(hash_algo, hash_test_data.data).hexdigest()} + assert checksums == {**hash_test_data.hex_checksums, **checksum} -def test_multi_hash_file_bytehexdigest_with_md5(hash_test_data): +@pytest.mark.parametrize("hash_algo", EXTRA_HASH_ALGOS) +def test_multi_hash_file_bytehexdigest_with_extra_algo(hash_test_data, hash_algo): fobj = io.BytesIO(hash_test_data.data) length = len(hash_test_data.data) checksums = MultiHash.from_file( - fobj, hash_names=DEFAULT_ALGORITHMS | {"md5"}, length=length + fobj, hash_names=DEFAULT_ALGORITHMS | {hash_algo}, length=length ).bytehexdigest() - md5sum = {"md5": hash_to_bytehex(hashlib.md5(hash_test_data.data).digest())} - assert checksums == {**hash_test_data.bytehex_checksums, **md5sum} + checksum = { + hash_algo: hash_to_bytehex(hashlib.new(hash_algo, hash_test_data.data).digest()) + } + assert checksums == 
{**hash_test_data.bytehex_checksums, **checksum} def test_multi_hash_file_missing_length(hash_test_data): fobj = io.BytesIO(hash_test_data.data) with pytest.raises(ValueError, match="Missing length"): MultiHash.from_file(fobj, hash_names=["sha1_git"]) def test_multi_hash_path(hash_test_data): with tempfile.NamedTemporaryFile(delete=False) as f: f.write(hash_test_data.data) hashes = MultiHash.from_path(f.name).digest() os.remove(f.name) assert hash_test_data.checksums == hashes def test_hash_git_data(hash_test_data): checksums = { git_type: hashutil.hash_git_data(hash_test_data.data, git_type) for git_type in hash_test_data.git_checksums } assert checksums == hash_test_data.git_checksums def test_hash_git_data_unknown_git_type(hash_test_data): with pytest.raises( ValueError, match="Unexpected git object type.*unknown-git-type" ): hashutil.hash_git_data(hash_test_data.data, "unknown-git-type") def test_hash_to_hex(hash_test_data): for type in hash_test_data.checksums: hex = hash_test_data.hex_checksums[type] hash = hash_test_data.checksums[type] assert hashutil.hash_to_hex(hex) == hex assert hashutil.hash_to_hex(hash) == hex def test_hash_to_bytes(hash_test_data): for type in hash_test_data.checksums: hex = hash_test_data.hex_checksums[type] hash = hash_test_data.checksums[type] assert hashutil.hash_to_bytes(hex) == hash assert hashutil.hash_to_bytes(hash) == hash def test_hash_to_bytehex(hash_test_data): for algo in hash_test_data.checksums: hex_checksum = hash_test_data.hex_checksums[algo].encode("ascii") assert hex_checksum == hashutil.hash_to_bytehex(hash_test_data.checksums[algo]) def test_bytehex_to_hash(hash_test_data): for algo in hash_test_data.checksums: assert hash_test_data.checksums[algo] == hashutil.bytehex_to_hash( hash_test_data.hex_checksums[algo].encode() ) def test_new_hash_unsupported_hashing_algorithm(): expected_message = ( "Unexpected hashing algorithm blake2:10, " "expected one of blake2b512, blake2s256, " "md5, sha1, sha1_git, sha256, sha512" ) with pytest.raises(ValueError, match=expected_message): hashutil._new_hash("blake2:10") def test_new_hash_blake2b_builtin(): with patch_blake2("hashlib.blake2b") as mock_blake2b: mock_blake2b.return_value = sentinel = object() h = hashutil._new_hash("blake2b512") assert h is sentinel mock_blake2b.assert_called_with(digest_size=512 // 8) def test_new_hash_blake2s_builtin(): with patch_blake2("hashlib.blake2s") as mock_blake2s: mock_blake2s.return_value = sentinel = object() h = hashutil._new_hash("blake2s256") assert h is sentinel mock_blake2s.assert_called_with(digest_size=256 // 8) @pytest.fixture def hashgit_test_data(): class HashGitTestData: blob_data = b"42\n" tree_data = b"".join( [ b"40000 barfoo\0", bytes.fromhex("c3020f6bf135a38c6df" "3afeb5fb38232c5e07087"), b"100644 blah\0", bytes.fromhex("63756ef0df5e4f10b6efa" "33cfe5c758749615f20"), b"100644 hello\0", bytes.fromhex("907b308167f0880fb2a" "5c0e1614bb0c7620f9dc3"), ] ) commit_data = b"""\ tree 1c61f7259dcb770f46b194d941df4f08ff0a3970 author Antoine R. Dumont (@ardumont) 1444054085 +0200 committer Antoine R. Dumont (@ardumont) 1444054085 +0200 initial """ # noqa tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241 type commit tag 0.0.1 tagger Antoine R.
Dumont (@ardumont) 1444225145 +0200 blah """.encode( "utf-8" ) # NOQA checksums = { "blob_sha1_git": bytes.fromhex( "d81cc0710eb6cf9efd5b920a8453e1" "e07157b6cd" ), "tree_sha1_git": bytes.fromhex( "ac212302c45eada382b27bfda795db" "121dacdb1c" ), "commit_sha1_git": bytes.fromhex( "e960570b2e6e2798fa4cfb9af2c399" "d629189653" ), "tag_sha1_git": bytes.fromhex( "bc2b99ba469987bcf1272c189ed534" "e9e959f120" ), } return HashGitTestData def test_unknown_header_type(): with pytest.raises(ValueError, match="Unexpected git object type"): hashutil.hash_git_data(b"any-data", "some-unknown-type") def test_hashdata_content(hashgit_test_data): # when actual_hash = hashutil.hash_git_data(hashgit_test_data.blob_data, git_type="blob") # then assert actual_hash == hashgit_test_data.checksums["blob_sha1_git"] def test_hashdata_tree(hashgit_test_data): # when actual_hash = hashutil.hash_git_data(hashgit_test_data.tree_data, git_type="tree") # then assert actual_hash == hashgit_test_data.checksums["tree_sha1_git"] def test_hashdata_revision(hashgit_test_data): # when actual_hash = hashutil.hash_git_data( hashgit_test_data.commit_data, git_type="commit" ) # then assert actual_hash == hashgit_test_data.checksums["commit_sha1_git"] def test_hashdata_tag(hashgit_test_data): # when actual_hash = hashutil.hash_git_data(hashgit_test_data.tag_data, git_type="tag") # then assert actual_hash == hashgit_test_data.checksums["tag_sha1_git"]
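As a companion to the `from_disk.py` hunk above, here is a hedged sketch of how the adjusted pattern handling is meant to be used; the root path and the ignore pattern below are hypothetical, and relative patterns are now resolved against the scanned root:

```python
from swh.model.from_disk import Directory, ignore_directories_patterns

# Build a dir_filter that skips any directory matching */node_modules
# under the (hypothetical) root path
dir_filter = ignore_directories_patterns(b"/srv/src/project", [b"*/node_modules"])
root = Directory.from_disk(path=b"/srv/src/project", dir_filter=dir_filter)
```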