diff --git a/PKG-INFO b/PKG-INFO index 5686739..8cf842c 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.17 +Version: 0.0.18 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index f85b0fa..02653d2 100644 --- a/debian/control +++ b/debian/control @@ -1,18 +1,21 @@ Source: swh-model Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, + python3 (>= 3.5) | python3-pyblake2, python3-all, python3-nose, python3-setuptools, - python3 (>= 3.5) | python3-pyblake2, python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DMOD/ Package: python3-swh.model Architecture: all Depends: ${misc:Depends}, ${python3:Depends} +Breaks: python3-swh.loader.core (<< 0.0.16~), + python3-swh.loader.dir (<< 0.0.28~), + python3-swh.loader.svn (<< 0.0.28~) Description: Software Heritage data model diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..58a761e --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,3 @@ +_build/ +apidoc/ +*-stamp diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..c30c50a --- /dev/null +++ b/docs/Makefile @@ -0,0 +1 @@ +include ../../swh-docs/Makefile.sphinx diff --git a/docs/_static/.placeholder b/docs/_static/.placeholder new file mode 100644 index 0000000..e69de29 diff --git a/docs/_templates/.placeholder b/docs/_templates/.placeholder new file mode 100644 index 0000000..e69de29 diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..190deb7 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1 @@ +from swh.docs.sphinx.conf import * # NoQA diff --git a/docs/data-model.rst b/docs/data-model.rst new file mode 100644 index 0000000..f365f9f --- /dev/null +++ b/docs/data-model.rst @@ -0,0 +1,13 @@ +.. _data-model: + +Software Heritage data model +============================ + +TODO + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..22eccfd --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,20 @@ +Software Heritage - Development Documentation +============================================= + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + +Overview +-------- + +* :ref:`data-model` + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/setup.py b/setup.py index 793b8e7..8d2e843 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,40 @@ -import sys +import hashlib from setuptools import setup def parse_requirements(): requirements = [] for reqf in ('requirements.txt', 'requirements-swh.txt'): with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements extra_requirements = [] -if sys.version_info < (3, 5): - extra_requirements = ['pyblake2'] + + +pyblake2_hashes = {'blake2s256', 'blake2b512'} +if pyblake2_hashes - set(hashlib.algorithms_available): + extra_requirements.append('pyblake2') setup( name='swh.model', description='Software Heritage data model', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DMOD/', packages=[ 'swh.model', 'swh.model.fields', 'swh.model.tests', 'swh.model.tests.fields', ], # packages's modules scripts=[], # scripts to package install_requires=parse_requirements() + extra_requirements, setup_requires=['vcversioner'], vcversioner={}, include_package_data=True, ) diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 5686739..8cf842c 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.17 +Version: 0.0.18 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.model.egg-info/SOURCES.txt b/swh.model.egg-info/SOURCES.txt index 7927051..f88b36d 100644 --- a/swh.model.egg-info/SOURCES.txt +++ b/swh.model.egg-info/SOURCES.txt @@ -1,44 +1,53 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile Makefile.local README-dev.md requirements-swh.txt requirements.txt setup.py version.txt bin/git-revhash bin/swh-revhash debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format +docs/.gitignore +docs/Makefile +docs/conf.py +docs/data-model.rst +docs/index.rst +docs/_static/.placeholder +docs/_templates/.placeholder swh.model.egg-info/PKG-INFO swh.model.egg-info/SOURCES.txt swh.model.egg-info/dependency_links.txt swh.model.egg-info/requires.txt swh.model.egg-info/top_level.txt swh/model/__init__.py swh/model/exceptions.py -swh/model/git.py +swh/model/from_disk.py swh/model/hashutil.py swh/model/identifiers.py +swh/model/merkle.py swh/model/validators.py swh/model/fields/__init__.py swh/model/fields/compound.py swh/model/fields/hashes.py swh/model/fields/simple.py swh/model/tests/__init__.py -swh/model/tests/test_git.py -swh/model/tests/test_git_slow.py +swh/model/tests/generate_testdata_from_disk.py +swh/model/tests/test_from_disk.py swh/model/tests/test_hashutil.py swh/model/tests/test_identifiers.py +swh/model/tests/test_merkle.py swh/model/tests/test_validators.py swh/model/tests/fields/__init__.py swh/model/tests/fields/test_compound.py swh/model/tests/fields/test_hashes.py swh/model/tests/fields/test_simple.py \ No newline at end of file diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py new file mode 100644 index 0000000..f9f3729 --- /dev/null +++ b/swh/model/from_disk.py @@ -0,0 +1,346 @@ +# 
Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import enum +import os +import stat + +from . import hashutil +from .merkle import MerkleLeaf, MerkleNode +from .identifiers import ( + directory_identifier, + identifier_to_bytes as id_to_bytes, + identifier_to_str as id_to_str, +) + + +class DentryPerms(enum.IntEnum): + """Admissible permissions for directory entries.""" + content = 0o100644 + """Content""" + executable_content = 0o100755 + """Executable content (e.g. executable script)""" + symlink = 0o120000 + """Symbolic link""" + directory = 0o040000 + """Directory""" + revision = 0o160000 + """Revision (e.g. submodule)""" + + +def mode_to_perms(mode): + """Convert a file mode to a permission compatible with Software Heritage + directory entries + + Args: + mode (int): a file mode as returned by :func:`os.stat` in + :attr:`os.stat_result.st_mode` + + Returns: + DentryPerms: one of the following values: + :const:`DentryPerms.content`: plain file + :const:`DentryPerms.executable_content`: executable file + :const:`DentryPerms.symlink`: symbolic link + :const:`DentryPerms.directory`: directory + + """ + if stat.S_ISLNK(mode): + return DentryPerms.symlink + if stat.S_ISDIR(mode): + return DentryPerms.directory + else: + # file is executable in any way + if mode & (0o111): + return DentryPerms.executable_content + else: + return DentryPerms.content + + +class Content(MerkleLeaf): + """Representation of a Software Heritage content as a node in a Merkle tree. + + The current Merkle hash for the Content nodes is the `sha1_git`, which + makes it consistent with what :class:`Directory` uses for its own hash + computation. + + """ + __slots__ = [] + type = 'content' + + @classmethod + def from_bytes(cls, *, mode, data): + """Convert data (raw :class:`bytes`) to a Software Heritage content entry + + Args: + mode (int): a file mode (passed to :func:`mode_to_perms`) + data (bytes): raw contents of the file + """ + ret = hashutil.hash_data(data) + ret['length'] = len(data) + ret['perms'] = mode_to_perms(mode) + ret['data'] = data + + return cls(ret) + + @classmethod + def from_symlink(cls, *, path, mode): + """Convert a symbolic link to a Software Heritage content entry""" + return cls.from_bytes(mode=mode, data=os.readlink(path)) + + @classmethod + def from_file(cls, *, path, data=False, save_path=False): + """Compute the Software Heritage content entry corresponding to an on-disk + file. 
+ + The returned dictionary contains keys useful for both: + - loading the content in the archive (hashes, `length`) + - using the content as a directory entry in a directory + + Args: + path (bytes): path to the file for which we're computing the + content entry + data (bool): add the file data to the entry + save_path (bool): add the file path to the entry + """ + file_stat = os.lstat(path) + mode = file_stat.st_mode + + if stat.S_ISLNK(mode): + # Symbolic link: return a file whose contents are the link target + return cls.from_symlink(path=path, mode=mode) + elif not stat.S_ISREG(mode): + # not a regular file: return the empty file instead + return cls.from_bytes(mode=mode, data=b'') + + length = file_stat.st_size + + if not data: + ret = hashutil.hash_path(path) + else: + chunks = [] + + def append_chunk(x, chunks=chunks): + chunks.append(x) + + with open(path, 'rb') as fobj: + ret = hashutil.hash_file(fobj, length=length, + chunk_cb=append_chunk) + + ret['data'] = b''.join(chunks) + + if save_path: + ret['path'] = path + ret['perms'] = mode_to_perms(mode) + ret['length'] = length + + obj = cls(ret) + return obj + + def __repr__(self): + return 'Content(id=%s)' % id_to_str(self.hash) + + def compute_hash(self): + return self.data['sha1_git'] + + +def accept_all_directories(dirname, entries): + """Default filter for :func:`Directory.from_disk` accepting all + directories + + Args: + dirname (bytes): directory name + entries (list): directory entries + """ + return True + + +def ignore_empty_directories(dirname, entries): + """Filter for :func:`directory_to_objects` ignoring empty directories + + Args: + dirname (bytes): directory name + entries (list): directory entries + Returns: + True if the directory is not empty, false if the directory is empty + """ + return bool(entries) + + +def ignore_named_directories(names, *, case_sensitive=True): + """Filter for :func:`directory_to_objects` to ignore directories named one + of names. + + Args: + names (list of bytes): names to ignore + case_sensitive (bool): whether to do the filtering in a case sensitive + way + Returns: + a directory filter for :func:`directory_to_objects` + """ + if not case_sensitive: + names = [name.lower() for name in names] + + def named_filter(dirname, entries, + names=names, case_sensitive=case_sensitive): + if case_sensitive: + return dirname not in names + else: + return dirname.lower() not in names + + return named_filter + + +class Directory(MerkleNode): + """Representation of a Software Heritage directory as a node in a Merkle Tree. + + This class can be used to generate, from an on-disk directory, all the + objects that need to be sent to the Software Heritage archive. + + The :func:`from_disk` constructor allows you to generate the data structure + from a directory on disk. The resulting :class:`Directory` can then be + manipulated as a dictionary, using the path as key. + + The :func:`collect` method is used to retrieve all the objects that need to + be added to the Software Heritage archive since the last collection, by + class (contents and directories). + + When using the dict-like methods to update the contents of the directory, + the affected levels of hierarchy are reset and can be collected again using + the same method. This enables the efficient collection of updated nodes, + for instance when the client is applying diffs. 
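As an illustration of this API (a sketch that is not part of the commit; the path below is hypothetical)::

    from swh.model.from_disk import Directory
    from swh.model.hashutil import hash_to_hex

    # Any readable directory works here; this path is made up.
    directory = Directory.from_disk(path=b'/tmp/example-project')

    # The root hash is the directory's sha1_git identifier.
    print(hash_to_hex(directory.hash))

    # Objects to load into the archive, grouped by type
    # ('content' and 'directory'), keyed by hash.
    objects = directory.collect()
    print({ty: len(objs) for ty, objs in objects.items()})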
+ """ + __slots__ = ['__entries'] + type = 'directory' + + @classmethod + def from_disk(cls, *, path, data=False, save_path=False, + dir_filter=accept_all_directories): + """Compute the Software Heritage objects for a given directory tree + + Args: + path (bytes): the directory to traverse + data (bool): whether to add the data to the content objects + save_path (bool): whether to add the path to the content objects + dir_filter (function): a filter to ignore some directories by + name or contents. Takes two arguments: dirname and entries, and + returns True if the directory should be added, False if the + directory should be ignored. + """ + + top_path = path + dirs = {} + + for root, dentries, fentries in os.walk(top_path, topdown=False): + entries = {} + # Join fentries and dentries in the same processing, as symbolic + # links to directories appear in dentries... + for name in fentries + dentries: + path = os.path.join(root, name) + if not os.path.isdir(path) or os.path.islink(path): + content = Content.from_file(path=path, data=data, + save_path=save_path) + entries[name] = content + else: + if dir_filter(name, dirs[path].entries): + entries[name] = dirs[path] + + dirs[root] = cls({'name': os.path.basename(root)}) + dirs[root].update(entries) + + return dirs[top_path] + + def __init__(self, data=None): + super().__init__(data=data) + self.__entries = None + + def invalidate_hash(self): + self.__entries = None + super().invalidate_hash() + + @staticmethod + def child_to_directory_entry(name, child): + if isinstance(child, Directory): + return { + 'type': 'dir', + 'perms': DentryPerms.directory, + 'target': child.hash, + 'name': name, + } + elif isinstance(child, Content): + return { + 'type': 'file', + 'perms': child.data['perms'], + 'target': child.hash, + 'name': name, + } + else: + raise ValueError('unknown child') + + def get_data(self, **kwargs): + return { + 'id': self.hash, + 'entries': self.entries, + } + + @property + def entries(self): + if self.__entries is None: + self.__entries = [ + self.child_to_directory_entry(name, child) + for name, child in self.items() + ] + + return self.__entries + + def compute_hash(self): + return id_to_bytes(directory_identifier({'entries': self.entries})) + + def __getitem__(self, key): + if not isinstance(key, bytes): + raise ValueError('Can only get a bytes from Directory') + + # Convenience shortcut + if key == b'': + return self + + if b'/' not in key: + return super().__getitem__(key) + else: + key1, key2 = key.split(b'/', 1) + return self.__getitem__(key1)[key2] + + def __setitem__(self, key, value): + if not isinstance(key, bytes): + raise ValueError('Can only set a bytes Directory entry') + if not isinstance(value, (Content, Directory)): + raise ValueError('Can only set a Directory entry to a Content or ' + 'Directory') + + if key == b'': + raise ValueError('Directory entry must have a name') + if b'\x00' in key: + raise ValueError('Directory entry name must not contain nul bytes') + + if b'/' not in key: + return super().__setitem__(key, value) + else: + key1, key2 = key.rsplit(b'/', 1) + self[key1].__setitem__(key2, value) + + def __delitem__(self, key): + if not isinstance(key, bytes): + raise ValueError('Can only delete a bytes Directory entry') + + if b'/' not in key: + super().__delitem__(key) + else: + key1, key2 = key.rsplit(b'/', 1) + del self[key1][key2] + + def __repr__(self): + return 'Directory(id=%s, entries=[%s])' % ( + id_to_str(self.hash), + ', '.join(str(entry) for entry in self), + ) diff --git a/swh/model/git.py 
b/swh/model/git.py deleted file mode 100644 index ad5962f..0000000 --- a/swh/model/git.py +++ /dev/null @@ -1,587 +0,0 @@ -# Copyright (C) 2015-2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -import os -import stat - -from enum import Enum, IntEnum - -from swh.model import hashutil, identifiers - - -ROOT_TREE_KEY = b'' - - -class GitType(Enum): - BLOB = b'blob' - TREE = b'tree' - EXEC = b'exec' - LINK = b'link' - COMM = b'commit' - RELE = b'release' - REFS = b'ref' - - -class GitPerm(IntEnum): - BLOB = 0o100644 - TREE = 0o040000 - EXEC = 0o100755 - LINK = 0o120000 - - -def _compute_directory_git_sha1(hashes): - """Compute a directory git sha1 from hashes. - - Args: - hashes: list of tree entries with keys: - - sha1_git: the tree entry's sha1 - - name: file or subdir's name - - perms: the tree entry's sha1 permissions - - Returns: - the binary sha1 of the dictionary's identifier - - Assumes: - Every path exists in hashes. - - """ - directory = { - 'entries': - [ - { - 'name': entry['name'], - 'perms': entry['perms'], - 'target': entry['sha1_git'], - 'type': 'dir' if entry['perms'] == GitPerm.TREE else 'file', - } - for entry in hashes - ] - } - return hashutil.hash_to_bytes(identifiers.directory_identifier(directory)) - - -def compute_directory_git_sha1(dirpath, hashes): - """Compute a directory git sha1 for a dirpath. - - Args: - dirpath: the directory's absolute path - hashes: list of tree entries with keys: - - sha1_git: the tree entry's sha1 - - name: file or subdir's name - - perms: the tree entry's sha1 permissions - - Returns: - the binary sha1 of the dictionary's identifier - - Assumes: - Every path exists in hashes. - - """ - return _compute_directory_git_sha1(hashes[dirpath]) - - -def compute_revision_sha1_git(revision): - """Compute a revision sha1 git from its dict representation. - - Args: - revision: Additional dictionary information needed to compute a - synthetic - revision. Following keys are expected: - - author - - date - - committer - - committer_date - - message - - type - - directory: binary form of the tree hash - - Returns: - revision sha1 in bytes - - # FIXME: beware, bytes output from storage api - - """ - return hashutil.hash_to_bytes(identifiers.revision_identifier(revision)) - - -def compute_release_sha1_git(release): - """Compute a release sha1 git from its dict representation. - - Args: - release: Additional dictionary information needed to compute a - synthetic release. Following keys are expected: - - name - - message - - date - - author - - revision: binary form of the sha1_git revision targeted by this - - Returns: - release sha1 in bytes - - """ - return hashutil.hash_to_bytes(identifiers.release_identifier(release)) - - -def compute_link_metadata(linkpath): - """Given a linkpath, compute the git metadata. 
- - Args: - linkpath: absolute pathname of the link - - Returns: - Dictionary of values: - - data: link's content - - length: link's content length - - name: basename of the link - - perms: git permission for link - - type: git type for link - - path: absolute path to the link on filesystem - - """ - data = os.readlink(linkpath) - link_metadata = hashutil.hash_data(data) - link_metadata.update({ - 'data': data, - 'length': len(data), - 'name': os.path.basename(linkpath), - 'perms': GitPerm.LINK, - 'type': GitType.BLOB, - 'path': linkpath - }) - - return link_metadata - - -def compute_blob_metadata(filepath): - """Given a filepath resolving to a regular file, compute the metadata. - Other file types (fifo, character or block device, symlink) will - be considered empty regular file. To deal properly with symlinks, - use swh.model.git.compute_link_metadata. - - Args: - filepath: absolute pathname of the regular file. - - Returns: - Dictionary of values: - - name: basename of the file - - length: data length - - perms: git permission for file - - type: git type for file - - path: absolute filepath on filesystem - - """ - mode = os.lstat(filepath).st_mode - if not stat.S_ISREG(mode): # special (block or character device, fifo) - perms = GitPerm.BLOB - blob_metadata = hashutil.hash_data(b'') - blob_metadata['length'] = 0 - else: - perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB - blob_metadata = hashutil.hash_path(filepath) - - blob_metadata.update({ - 'name': os.path.basename(filepath), - 'perms': perms, - 'type': GitType.BLOB, - 'path': filepath - }) - - return blob_metadata - - -def _compute_tree_metadata(dirname, hashes): - """Given a dirname, compute the git metadata. - - Args: - dirname: absolute pathname of the directory. - hashes: list of tree dirname's entries with keys: - - sha1_git: the tree entry's sha1 - - name: file or subdir's name - - perms: the tree entry's sha1 permissions - - Returns: - Dictionary of values: - - sha1_git: tree's sha1 git - - name: basename of the directory - - perms: git permission for directory - - type: git type for directory - - path: absolute path to directory on filesystem - - """ - return { - 'sha1_git': _compute_directory_git_sha1(hashes), - 'name': os.path.basename(dirname), - 'perms': GitPerm.TREE, - 'type': GitType.TREE, - 'path': dirname - } - - -def compute_tree_metadata(dirname, ls_hashes): - """Given a dirname, compute the git metadata. - - Args: - dirname: absolute pathname of the directory. - ls_hashes: dictionary of path, hashes - - Returns: - Dictionary of values: - - sha1_git: tree's sha1 git - - name: basename of the directory - - perms: git permission for directory - - type: git type for directory - - path: absolute path to directory on filesystem - - """ - return _compute_tree_metadata(dirname, ls_hashes[dirname]) - - -def default_validation_dir(dirpath): - """Default validation function. - This is the equivalent of the identity function. - - Args: - dirpath: Path to validate - - Returns: True - - """ - return True - - -def _walk(rootdir, - dir_ok_fn=default_validation_dir, - remove_empty_folder=False): - """Walk the filesystem and yields a 3 tuples (dirpath, dirnames as set - of absolute paths, filenames as set of abslute paths) - - Ignore files which won't pass the dir_ok_fn validation. - - If remove_empty_folder is True, remove and ignore any - encountered empty folder. - - Args: - - rootdir: starting walk root directory path - - dir_ok_fn: validation function. 
if folder encountered are - not ok, they are ignored. Default to default_validation_dir - which does nothing. - - remove_empty_folder: Flag to remove and ignore any - encountered empty folders. - - Yields: - 3 tuples dirpath, set of absolute children dirname paths, set - of absolute filename paths. - - """ - def basic_gen_dir(rootdir): - for dp, dns, fns in os.walk(rootdir, topdown=False): - yield (dp, - set((os.path.join(dp, dn) for dn in dns)), - set((os.path.join(dp, fn) for fn in fns))) - - if dir_ok_fn == default_validation_dir: - if not remove_empty_folder: # os.walk - yield from basic_gen_dir(rootdir) - else: # os.walk + empty dir cleanup - empty_folders = set() - for dp, dns, fns in basic_gen_dir(rootdir): - if not dns and not fns: - empty_folders.add(dp) - # need to remove it because folder of empty folder - # is an empty folder!!! - if os.path.islink(dp): - os.remove(dp) - else: - os.rmdir(dp) - parent = os.path.dirname(dp) - # edge case about parent containing one empty - # folder which become an empty one - while not os.listdir(parent): - empty_folders.add(parent) - if os.path.islink(parent): - os.remove(parent) - else: - os.rmdir(parent) - parent = os.path.dirname(parent) - continue - yield (dp, dns - empty_folders, fns) - else: - def filtfn(dirnames): - return set(filter(dir_ok_fn, dirnames)) - - gen_dir = ((dp, dns, fns) for dp, dns, fns - in basic_gen_dir(rootdir) if dir_ok_fn(dp)) - - if not remove_empty_folder: # os.walk + filtering - for dp, dns, fns in gen_dir: - yield (dp, filtfn(dns), fns) - else: # os.walk + filtering + empty dir cleanup - empty_folders = set() - for dp, dns, fns in gen_dir: - dps = filtfn(dns) - - if not dps and not fns: - empty_folders.add(dp) - # need to remove it because folder of empty folder - # is an empty folder!!! - if os.path.islink(dp): - os.remove(dp) - else: - os.rmdir(dp) - parent = os.path.dirname(dp) - # edge case about parent containing one empty - # folder which become an empty one - while not os.listdir(parent): - empty_folders.add(parent) - if os.path.islink(parent): - os.remove(parent) - else: - os.rmdir(parent) - parent = os.path.dirname(parent) - continue - yield dp, dps - empty_folders, fns - - -def walk_and_compute_sha1_from_directory(rootdir, - dir_ok_fn=default_validation_dir, - with_root_tree=True, - remove_empty_folder=False): - """(Deprecated) TODO migrate the code to - compute_hashes_from_directory. - - Compute git sha1 from directory rootdir. - - Args: - - rootdir: Root directory from which beginning the git hash computation - - - dir_ok_fn: Filter function to filter directory according to rules - defined in the function. By default, all folders are ok. - Example override: dir_ok_fn = lambda dirpath: b'svn' not in dirpath - - - with_root_tree: Determine if we compute the upper root tree's - checksums. As a default, we want it. One possible use case where this - is not useful is the update (cf. `update_checksums_from`) - - Returns: - Dictionary of entries with keys and as values a list of - directory entries. - Those are list of dictionary with keys: - - 'perms' - - 'type' - - 'name' - - 'sha1_git' - - and specifically content: 'sha1', 'sha256', ... - - Note: - One special key is ROOT_TREE_KEY to indicate the upper root of the - directory (this is the revision's directory). - - Raises: - Nothing - If something is raised, this is a programmatic error. 
- - """ - ls_hashes = {} - all_links = set() - - if rootdir.endswith(b'/'): - rootdir = rootdir.rstrip(b'/') - - for dirpath, dirnames, filenames in _walk( - rootdir, dir_ok_fn, remove_empty_folder): - hashes = [] - - links = (file - for file in filenames.union(dirnames) - if os.path.islink(file)) - - for linkpath in links: - all_links.add(linkpath) - m_hashes = compute_link_metadata(linkpath) - hashes.append(m_hashes) - - for filepath in (file for file in filenames if file not in all_links): - m_hashes = compute_blob_metadata(filepath) - hashes.append(m_hashes) - - ls_hashes[dirpath] = hashes - - dir_hashes = [] - for fulldirname in (dir for dir in dirnames if dir not in all_links): - tree_hash = _compute_tree_metadata(fulldirname, - ls_hashes[fulldirname]) - dir_hashes.append(tree_hash) - - ls_hashes[dirpath].extend(dir_hashes) - - if with_root_tree: - # compute the current directory hashes - root_hash = { - 'sha1_git': _compute_directory_git_sha1(ls_hashes[rootdir]), - 'path': rootdir, - 'name': os.path.basename(rootdir), - 'perms': GitPerm.TREE, - 'type': GitType.TREE - } - ls_hashes[ROOT_TREE_KEY] = [root_hash] - - return ls_hashes - - -def compute_hashes_from_directory(rootdir, - dir_ok_fn=default_validation_dir, - remove_empty_folder=False): - """Compute git sha1 from directory rootdir. - - Args: - - rootdir: Root directory from which beginning the git hash - computation - - - dir_ok_fn: Filter function to filter directory according to rules - defined in the function. By default, all folders are ok. - Example override: dir_ok_fn = lambda dirpath: b'svn' not in dirpath - - Returns: - Dictionary of entries with keys absolute path name. - Path-name can be a file/link or directory. - The associated value is a dictionary with: - - checksums: the dictionary with the hashes for the link/file/dir - Those are list of dictionary with keys: - - 'perms' - - 'type' - - 'name' - - 'sha1_git' - - and specifically content: 'sha1', 'sha256', ... - - - children: Only for a directory, the set of children paths - - Note: - One special key is the / which indicates the upper root of - the directory (this is the revision's directory). - - Raises: - Nothing - If something is raised, this is a programmatic error. - - """ - def _get_dict_from_dirpath(_dict, path): - """Retrieve the default associated value for key path. - - """ - return _dict.get(path, dict(children=set(), checksums=None)) - - def _get_dict_from_filepath(_dict, path): - """Retrieve the default associated value for key path. 
- - """ - return _dict.get(path, dict(checksums=None)) - - ls_hashes = {} - all_links = set() - - if rootdir.endswith(b'/'): - rootdir = rootdir.rstrip(b'/') - - for dirpath, dirnames, filenames in _walk( - rootdir, dir_ok_fn, remove_empty_folder): - - dir_entry = _get_dict_from_dirpath(ls_hashes, dirpath) - children = dir_entry['children'] - - links = (file - for file in filenames.union(dirnames) - if os.path.islink(file)) - - for linkpath in links: - all_links.add(linkpath) - m_hashes = compute_link_metadata(linkpath) - d = _get_dict_from_filepath(ls_hashes, linkpath) - d['checksums'] = m_hashes - ls_hashes[linkpath] = d - children.add(linkpath) - - for filepath in (file for file in filenames if file not in all_links): - m_hashes = compute_blob_metadata(filepath) - d = _get_dict_from_filepath(ls_hashes, filepath) - d['checksums'] = m_hashes - ls_hashes[filepath] = d - children.add(filepath) - - for fulldirname in (dir for dir in dirnames if dir not in all_links): - d_hashes = _get_dict_from_dirpath(ls_hashes, fulldirname) - tree_hash = _compute_tree_metadata( - fulldirname, - (ls_hashes[p]['checksums'] for p in d_hashes['children']) - ) - d = _get_dict_from_dirpath(ls_hashes, fulldirname) - d['checksums'] = tree_hash - ls_hashes[fulldirname] = d - children.add(fulldirname) - - dir_entry['children'] = children - ls_hashes[dirpath] = dir_entry - - # compute the current directory hashes - d_hashes = _get_dict_from_dirpath(ls_hashes, rootdir) - root_hash = { - 'sha1_git': _compute_directory_git_sha1( - (ls_hashes[p]['checksums'] for p in d_hashes['children']) - ), - 'path': rootdir, - 'name': os.path.basename(rootdir), - 'perms': GitPerm.TREE, - 'type': GitType.TREE - } - d_hashes['checksums'] = root_hash - ls_hashes[rootdir] = d_hashes - - return ls_hashes - - -def children_hashes(children, objects): - """Given a collection of children path, yield the corresponding - hashes. - - Args: - objects: objects hash as returned by git.compute_hashes_from_directory. - children: collection of bytes path - - Yields: - Dictionary hashes - - """ - for p in children: - c = objects.get(p) - if c: - h = c.get('checksums') - if h: - yield h - - -def objects_per_type(filter_type, objects_per_path): - """Given an object dictionary returned by - `swh.model.git.compute_hashes_from_directory`, yields - corresponding element type's hashes - - Args: - filter_type: one of GitType enum - objects_per_path: - - Yields: - Elements of type filter_type's hashes - - """ - for path, obj in objects_per_path.items(): - o = obj['checksums'] - if o['type'] == filter_type: - if 'children' in obj: # for trees - if obj['children']: - o['children'] = children_hashes(obj['children'], - objects_per_path) - else: - o['children'] = [] - yield o diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index f9aca1b..4d6f9db 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -1,236 +1,269 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of hashing function definitions. This is the base module use to compute swh's hashes. Only a subset of hashing algorithms is supported as defined in the ALGORITHMS set. Any provided algorithms not in that list will result in a ValueError explaining the error. 
This modules defines the following hashing functions: - hash_file: Hash the contents of the given file object with the given algorithms (defaulting to DEFAULT_ALGORITHMS if none provided). - hash_data: Hash the given binary blob with the given algorithms (defaulting to DEFAULT_ALGORITHMS if none provided). - hash_path: Hash the contents of the file at the given path with the given algorithms (defaulting to DEFAULT_ALGORITHMS if none provided). """ import binascii import functools import hashlib import os -import sys from io import BytesIO -# Supported algorithms ALGORITHMS = set(['sha1', 'sha256', 'sha1_git', 'blake2s256', 'blake2b512']) +"""Hashing algorithms supported by this module""" -# Default algorithms used DEFAULT_ALGORITHMS = set(['sha1', 'sha256', 'sha1_git', 'blake2s256']) +"""Algorithms computed by default when calling the functions from this module. + +Subset of :const:`ALGORITHMS`. +""" -# should be a multiple of 64 (sha1/sha256's block size) -# FWIW coreutils' sha1sum uses 32768 HASH_BLOCK_SIZE = 32768 +"""Block size for streaming hash computations made in this module""" -# Prior to python3.4, only blake2 is available through pyblake2 module -# From 3.5 onwards, it's been integrated in python -if sys.version_info.major == 3 and sys.version_info.minor <= 4: - import pyblake2 - # register those hash algorithms in hashlib - __cache = hashlib.__builtin_constructor_cache - __cache['blake2s256'] = pyblake2.blake2s - __cache['blake2b512'] = pyblake2.blake2b +# Load blake2 hashes from pyblake2 if they are not available in the builtin +# hashlib +__pyblake2_hashes = {'blake2s256': 'blake2s', + 'blake2b512': 'blake2b'} +__cache = hashlib.__builtin_constructor_cache +for __hash, __pyblake2_fn in __pyblake2_hashes.items(): + if __hash not in hashlib.algorithms_available: + import pyblake2 + __cache[__hash] = getattr(pyblake2, __pyblake2_fn) def _new_git_hash(base_algo, git_type, length): """Initialize a digest object (as returned by python's hashlib) for the requested algorithm, and feed it with the header for a git object of the given type and length. The header for hashing a git object consists of: - The type of the object (encoded in ASCII) - One ASCII space (\x20) - The length of the object (decimal encoded in ASCII) - One NUL byte Args: - base_algo: a hashlib-supported algorithm + base_algo (str from :const:`ALGORITHMS`): a hashlib-supported algorithm git_type: the type of the git object (supposedly one of 'blob', 'commit', 'tag', 'tree') length: the length of the git object you're encoding Returns: a hashutil.hash object """ h = hashlib.new(base_algo) git_header = '%s %d\0' % (git_type, length) h.update(git_header.encode('ascii')) return h def _new_hash(algo, length=None): """Initialize a digest object (as returned by python's hashlib) for the requested algorithm. See the constant ALGORITHMS for the list of supported algorithms. If a git-specific hashing algorithm is requested (e.g., "sha1_git"), the hashing object will be pre-fed with the needed header; for this to work, length must be given. Args: algo (str): a hashing algorithm (one of ALGORITHMS) length (int): the length of the hashed payload (needed for - git-specific algorithms) + git-specific algorithms) Returns: a hashutil.hash object Raises: ValueError if algo is unknown, or length is missing for a git-specific hash. 
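To make the header description concrete (an illustrative check, not part of the diff): hashing data with the git-specific 'sha1_git' algorithm is equivalent to hashing the ``blob <length> NUL`` header followed by the data with plain sha1::

    import hashlib
    from swh.model.hashutil import hash_data

    data = b'hello'
    sha1_git = hash_data(data, algorithms={'sha1_git'})['sha1_git']

    header = ('blob %d\0' % len(data)).encode('ascii')
    assert sha1_git == hashlib.sha1(header + data).digest()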
""" if algo not in ALGORITHMS: raise ValueError( 'Unexpected hashing algorithm %s, expected one of %s' % (algo, ', '.join(sorted(ALGORITHMS)))) if algo.endswith('_git'): if length is None: raise ValueError('Missing length for git hashing algorithm') base_algo = algo[:-4] return _new_git_hash(base_algo, 'blob', length) return hashlib.new(algo) def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): """Hash the contents of the given file object with the given algorithms. Args: fobj: a file-like object length: the length of the contents of the file-like object (for the git-specific algorithms) algorithms: the hashing algorithms used Returns: a dict mapping each algorithm to a bytes digest. Raises: ValueError if algorithms contains an unknown hash algorithm. """ hashes = {algo: _new_hash(algo, length) for algo in algorithms} while True: chunk = fobj.read(HASH_BLOCK_SIZE) if not chunk: break for hash in hashes.values(): hash.update(chunk) if chunk_cb: chunk_cb(chunk) return {algo: hash.digest() for algo, hash in hashes.items()} def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): """Hash the contents of the file at the given path with the given algorithms. Args: path: the path of the file to hash algorithms: the hashing algorithms used chunk_cb: a callback Returns: a dict mapping each algorithm to a bytes digest. Raises: ValueError if algorithms contains an unknown hash algorithm. OSError on file access error """ length = os.path.getsize(path) with open(path, 'rb') as fobj: hash = hash_file(fobj, length, algorithms, chunk_cb) hash['length'] = length return hash def hash_data(data, algorithms=DEFAULT_ALGORITHMS): """Hash the given binary blob with the given algorithms. Args: data: a bytes object algorithms: the hashing algorithms used Returns: a dict mapping each algorithm to a bytes digest Raises: TypeError if data does not support the buffer interface. ValueError if algorithms contains an unknown hash algorithm. """ fobj = BytesIO(data) return hash_file(fobj, len(data), algorithms) def hash_git_data(data, git_type, base_algo='sha1'): """Hash the given data as a git object of type git_type. Args: data: a bytes object git_type: the git object type base_algo: the base hashing algorithm used (default: sha1) Returns: a dict mapping each algorithm to a bytes digest Raises: ValueError if the git_type is unexpected. 
""" git_object_types = {'blob', 'tree', 'commit', 'tag'} if git_type not in git_object_types: raise ValueError('Unexpected git object type %s, expected one of %s' % (git_type, ', '.join(sorted(git_object_types)))) h = _new_git_hash(base_algo, git_type, len(data)) h.update(data) return h.digest() @functools.lru_cache() def hash_to_hex(hash): - """Converts a hash (in hex or bytes form) to its hexadecimal ascii form""" + """Converts a hash (in hex or bytes form) to its hexadecimal ascii form + + Args: + hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing + the hexadecimal form of the hash + + Returns: + str: the hexadecimal form of the hash + """ if isinstance(hash, str): return hash return binascii.hexlify(hash).decode('ascii') @functools.lru_cache() def hash_to_bytehex(hash): - """Converts a hash to its hexadecimal bytes representation""" + """Converts a hash to its hexadecimal bytes representation + + Args: + hash (bytes): a :class:`bytes` hash + + Returns: + bytes: the hexadecimal form of the hash, as :class:`bytes` + """ return binascii.hexlify(hash) @functools.lru_cache() def hash_to_bytes(hash): - """Converts a hash (in hex or bytes form) to its raw bytes form""" + """Converts a hash (in hex or bytes form) to its raw bytes form + + Args: + hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing + the hexadecimal form of the hash + + Returns: + bytes: the :class:`bytes` form of the hash + """ if isinstance(hash, bytes): return hash return bytes.fromhex(hash) @functools.lru_cache() def bytehex_to_hash(hex): - """Converts a hexadecimal bytes representation of a hash to that hash""" + """Converts a hexadecimal bytes representation of a hash to that hash + + Args: + hash (bytes): a :class:`bytes` containing the hexadecimal form of the + hash encoded in ascii + + Returns: + bytes: the :class:`bytes` form of the hash + """ return hash_to_bytes(hex.decode()) diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index d51304e..c7a6ce9 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -1,490 +1,501 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import binascii import datetime from functools import lru_cache from .hashutil import hash_data, hash_git_data, DEFAULT_ALGORITHMS @lru_cache() def identifier_to_bytes(identifier): """Convert a text identifier to bytes. Args: identifier: an identifier, either a 40-char hexadecimal string or a - bytes object of length 20 + bytes object of length 20 Returns: The length 20 bytestring corresponding to the given identifier Raises: - ValueError if the identifier is of an unexpected type or length. + ValueError: if the identifier is of an unexpected type or length. """ if isinstance(identifier, bytes): if len(identifier) != 20: raise ValueError( 'Wrong length for bytes identifier %s, expected 20' % len(identifier)) return identifier if isinstance(identifier, str): if len(identifier) != 40: raise ValueError( 'Wrong length for str identifier %s, expected 40' % len(identifier)) return bytes.fromhex(identifier) raise ValueError('Wrong type for identifier %s, expected bytes or str' % identifier.__class__.__name__) @lru_cache() def identifier_to_str(identifier): """Convert an identifier to an hexadecimal string. 
Args: identifier: an identifier, either a 40-char hexadecimal string or a - bytes object of length 20 + bytes object of length 20 + Returns: The length 40 string corresponding to the given identifier, hex encoded Raises: ValueError if the identifier is of an unexpected type or length. """ if isinstance(identifier, str): if len(identifier) != 40: raise ValueError( 'Wrong length for str identifier %s, expected 40' % len(identifier)) return identifier if isinstance(identifier, bytes): if len(identifier) != 20: raise ValueError( 'Wrong length for bytes identifier %s, expected 20' % len(identifier)) return binascii.hexlify(identifier).decode() raise ValueError('Wrong type for identifier %s, expected bytes or str' % identifier.__class__.__name__) def content_identifier(content): """Return the intrinsic identifier for a content. A content's identifier is the sha1, sha1_git and sha256 checksums of its data. Args: content: a content conforming to the Software Heritage schema Returns: A dictionary with all the hashes for the data Raises: - KeyError if the content doesn't have a data member. + KeyError: if the content doesn't have a data member. """ return hash_data(content['data'], DEFAULT_ALGORITHMS) def _sort_key(entry): """The sorting key for tree entries""" if entry['type'] == 'dir': return entry['name'] + b'/' else: return entry['name'] @lru_cache() def _perms_to_bytes(perms): """Convert the perms value to its bytes representation""" oc = oct(perms)[2:] return oc.encode('ascii') def escape_newlines(snippet): """Escape the newlines present in snippet according to git rules. New lines in git manifests are escaped by indenting the next line by one - space.""" + space. + + """ if b'\n' in snippet: return b'\n '.join(snippet.split(b'\n')) else: return snippet def directory_identifier(directory): """Return the intrinsic identifier for a directory. A directory's identifier is the tree sha1 à la git of a directory listing, using the following algorithm, which is equivalent to the git algorithm for trees: 1. Entries of the directory are sorted using the name (or the name with '/' - appended for directory entries) as key, in bytes order. + appended for directory entries) as key, in bytes order. 2. 
For each entry of the directory, the following bytes are output: - - the octal representation of the permissions for the entry - (stored in the 'perms' member), which is a representation of the - entry type: - b'100644' (int 33188) for files - b'100755' (int 33261) for executable files - b'120000' (int 40960) for symbolic links - b'40000' (int 16384) for directories - b'160000' (int 57344) for references to revisions - - an ascii space (b'\x20') - - the entry's name (as raw bytes), stored in the 'name' member - - a null byte (b'\x00') - - the 20 byte long identifier of the object pointed at by the entry, - stored in the 'target' member: - for files or executable files: their blob sha1_git - for symbolic links: the blob sha1_git of a file containing the - link destination - for directories: their intrinsic identifier - for revisions: their intrinsic identifier + + - the octal representation of the permissions for the entry (stored in + the 'perms' member), which is a representation of the entry type: + + - b'100644' (int 33188) for files + - b'100755' (int 33261) for executable files + - b'120000' (int 40960) for symbolic links + - b'40000' (int 16384) for directories + - b'160000' (int 57344) for references to revisions + + - an ascii space (b'\x20') + - the entry's name (as raw bytes), stored in the 'name' member + - a null byte (b'\x00') + - the 20 byte long identifier of the object pointed at by the entry, + stored in the 'target' member: + + - for files or executable files: their blob sha1_git + - for symbolic links: the blob sha1_git of a file containing the link + destination + - for directories: their intrinsic identifier + - for revisions: their intrinsic identifier (Note that there is no separator between entries) """ components = [] for entry in sorted(directory['entries'], key=_sort_key): components.extend([ _perms_to_bytes(entry['perms']), b'\x20', entry['name'], b'\x00', identifier_to_bytes(entry['target']), ]) return identifier_to_str(hash_git_data(b''.join(components), 'tree')) def format_date(date): """Convert a date object into an UTC timestamp encoded as ascii bytes. Git stores timestamps as an integer number of seconds since the UNIX epoch. However, Software Heritage stores timestamps as an integer number of microseconds (postgres type "datetime with timezone"). Therefore, we print timestamps with no microseconds as integers, and timestamps with microseconds as floating point values. We elide the trailing zeroes from microsecond values, to "future-proof" our representation if we ever need more precision in timestamps. """ if not isinstance(date, dict): raise ValueError('format_date only supports dicts, %r received' % date) seconds = date.get('seconds', 0) microseconds = date.get('microseconds', 0) if not microseconds: return str(seconds).encode() else: float_value = ('%d.%06d' % (seconds, microseconds)) return float_value.rstrip('0').encode() @lru_cache() def format_offset(offset, negative_utc=None): """Convert an integer number of minutes into an offset representation. The offset representation is [+-]hhmm where: - hh is the number of hours; - mm is the number of minutes. + + - hh is the number of hours; + - mm is the number of minutes. A null offset is represented as +0000. 
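Putting the entry format together (an illustrative sketch, not part of the diff; the resulting identifier is simply printed, not asserted)::

    from swh.model.hashutil import hash_git_data
    from swh.model.identifiers import directory_identifier

    # A directory containing a single empty regular file named 'hello'.
    empty_blob_id = hash_git_data(b'', 'blob')
    dir_id = directory_identifier({'entries': [{
        'type': 'file',
        'perms': 0o100644,     # int 33188, a regular file
        'name': b'hello',
        'target': empty_blob_id,
    }]})
    print(dir_id)  # a 40-character hexadecimal string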
""" if offset < 0 or offset == 0 and negative_utc: sign = '-' else: sign = '+' hours = abs(offset) // 60 minutes = abs(offset) % 60 t = '%s%02d%02d' % (sign, hours, minutes) return t.encode() def normalize_timestamp(time_representation): """Normalize a time representation for processing by Software Heritage This function supports a numeric timestamp (representing a number of - seconds since the UNIX epoch, 1970-01-01 at 00:00 UTC), a datetime.datetime - object (with timezone information), or a normalized Software - Heritage time representation (idempotency). + seconds since the UNIX epoch, 1970-01-01 at 00:00 UTC), a + :obj:`datetime.datetime` object (with timezone information), or a + normalized Software Heritage time representation (idempotency). Args: time_representation: the representation of a timestamp - Returns: a normalized dictionary with three keys + Returns: + dict: a normalized dictionary with three keys: + + - timestamp: a dict with two optional keys: + + - seconds: the integral number of seconds since the UNIX epoch + - microseconds: the integral number of microseconds - - timestamp: a dict with two optional keys: - - seconds: the integral number of seconds since the UNIX epoch - - microseconds: the integral number of microseconds - - offset: the timezone offset as a number of minutes relative to UTC - - negative_utc: a boolean representing whether the offset is -0000 when - offset = 0. + - offset: the timezone offset as a number of minutes relative to + UTC + - negative_utc: a boolean representing whether the offset is -0000 + when offset = 0. """ if time_representation is None: return None negative_utc = False if isinstance(time_representation, dict): ts = time_representation['timestamp'] if isinstance(ts, dict): seconds = ts.get('seconds', 0) microseconds = ts.get('microseconds', 0) elif isinstance(ts, int): seconds = ts microseconds = 0 else: raise ValueError( 'normalize_timestamp received non-integer timestamp member:' ' %r' % ts) offset = time_representation['offset'] if 'negative_utc' in time_representation: negative_utc = time_representation['negative_utc'] elif isinstance(time_representation, datetime.datetime): seconds = int(time_representation.timestamp()) microseconds = time_representation.microsecond utcoffset = time_representation.utcoffset() if utcoffset is None: raise ValueError( 'normalize_timestamp received datetime without timezone: %s' % time_representation) # utcoffset is an integer number of minutes seconds_offset = utcoffset.total_seconds() offset = int(seconds_offset) // 60 elif isinstance(time_representation, int): seconds = time_representation microseconds = 0 offset = 0 else: raise ValueError( 'normalize_timestamp received non-integer timestamp:' ' %r' % time_representation) return { 'timestamp': { 'seconds': seconds, 'microseconds': microseconds, }, 'offset': offset, 'negative_utc': negative_utc, } def format_author(author): """Format the specification of an author. An author is either a byte string (passed unchanged), or a dict with three keys, fullname, name and email. If the fullname exists, return it; if it doesn't, we construct a fullname using the following heuristics: if the name value is None, we return the email in angle brackets, else, we return the name, a space, and the email in angle brackets. 
""" if isinstance(author, bytes) or author is None: return author if 'fullname' in author: return author['fullname'] ret = [] if author['name'] is not None: ret.append(author['name']) if author['email'] is not None: ret.append(b''.join([b'<', author['email'], b'>'])) return b' '.join(ret) def format_author_line(header, author, date_offset): """Format a an author line according to git standards. An author line has three components: - - a header, describing the type of author (author, committer, tagger) - - a name and email, which is an arbitrary bytestring - - optionally, a timestamp with UTC offset specification - The author line is formatted thus: + - a header, describing the type of author (author, committer, tagger) + - a name and email, which is an arbitrary bytestring + - optionally, a timestamp with UTC offset specification + + The author line is formatted thus:: `header` `name and email`[ `timestamp` `utc_offset`] The timestamp is encoded as a (decimal) number of seconds since the UNIX epoch (1970-01-01 at 00:00 UTC). As an extension to the git format, we support fractional timestamps, using a dot as the separator for the decimal part. The utc offset is a number of minutes encoded as '[+-]HHMM'. Note some tools can pass a negative offset corresponding to the UTC timezone ('-0000'), which is valid and is encoded as such. For convenience, this function returns the whole line with its trailing newline. Args: header: the header of the author line (one of 'author', 'committer', - 'tagger') + 'tagger') author: an author specification (dict with two bytes values: name and - email, or byte value) + email, or byte value) date_offset: a normalized date/time representation as returned by - `normalize_timestamp`. + :func:`normalize_timestamp`. Returns: the newline-terminated byte string containing the author line """ ret = [header.encode(), b' ', escape_newlines(format_author(author))] date_offset = normalize_timestamp(date_offset) if date_offset is not None: date_f = format_date(date_offset['timestamp']) offset_f = format_offset(date_offset['offset'], date_offset['negative_utc']) ret.extend([b' ', date_f, b' ', offset_f]) ret.append(b'\n') return b''.join(ret) def revision_identifier(revision): """Return the intrinsic identifier for a revision. The fields used for the revision identifier computation are: - - directory - - parents - - author - - author_date - - committer - - committer_date - - metadata -> extra_headers - - message + + - directory + - parents + - author + - author_date + - committer + - committer_date + - metadata -> extra_headers + - message A revision's identifier is the 'git'-checksum of a commit manifest - constructed as follows (newlines are a single ASCII newline character): - - ``` - tree - [for each parent in parents] - parent - [end for each parents] - author - committer - [for each key, value in extra_headers] - - [end for each extra_headers] - - - ``` + constructed as follows (newlines are a single ASCII newline character):: + + tree + [for each parent in parents] + parent + [end for each parents] + author + committer + [for each key, value in extra_headers] + + [end for each extra_headers] + + The directory identifier is the ascii representation of its hexadecimal encoding. - Author and committer are formatted with the `format_author` function. - Dates are formatted with the `format_date_offset` function. + Author and committer are formatted with the :func:`format_author` function. + Dates are formatted with the :func:`format_offset` function. 
Extra headers are an ordered list of [key, value] pairs. Keys are strings and get encoded to utf-8 for identifier computation. Values are either byte strings, unicode strings (that get encoded to utf-8), or integers (that get encoded to their utf-8 decimal representation). Multiline extra header values are escaped by indenting the continuation lines with one ascii space. If the message is None, the manifest ends with the last header. Else, the message is appended to the headers after an empty line. The checksum of the full manifest is computed using the 'commit' git object type. """ components = [ b'tree ', identifier_to_str(revision['directory']).encode(), b'\n', ] for parent in revision['parents']: if parent: components.extend([ b'parent ', identifier_to_str(parent).encode(), b'\n', ]) components.extend([ format_author_line('author', revision['author'], revision['date']), format_author_line('committer', revision['committer'], revision['committer_date']), ]) # Handle extra headers metadata = revision.get('metadata') if not metadata: metadata = {} for key, value in metadata.get('extra_headers', []): # Integer values: decimal representation if isinstance(value, int): value = str(value).encode('utf-8') # Unicode string values: utf-8 encoding if isinstance(value, str): value = value.encode('utf-8') # encode the key to utf-8 components.extend([key.encode('utf-8'), b' ', escape_newlines(value), b'\n']) if revision['message'] is not None: components.extend([b'\n', revision['message']]) commit_raw = b''.join(components) return identifier_to_str(hash_git_data(commit_raw, 'commit')) def target_type_to_git(target_type): """Convert a software heritage target type to a git object type""" return { 'content': b'blob', 'directory': b'tree', 'revision': b'commit', 'release': b'tag', }[target_type] def release_identifier(release): """Return the intrinsic identifier for a release.""" components = [ b'object ', identifier_to_str(release['target']).encode(), b'\n', b'type ', target_type_to_git(release['target_type']), b'\n', b'tag ', release['name'], b'\n', ] if 'author' in release and release['author']: components.append( format_author_line('tagger', release['author'], release['date']) ) if release['message'] is not None: components.extend([b'\n', release['message']]) return identifier_to_str(hash_git_data(b''.join(components), 'tag')) diff --git a/swh/model/merkle.py b/swh/model/merkle.py new file mode 100644 index 0000000..c75cc2c --- /dev/null +++ b/swh/model/merkle.py @@ -0,0 +1,286 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""Merkle tree data structure""" + +import abc +import collections + + +def deep_update(left, right): + """Recursively update the left mapping with deeply nested values from the right + mapping. + + This function is useful to merge the results of several calls to + :func:`MerkleNode.collect`. + + Arguments: + left: a mapping (modified by the update operation) + right: a mapping + + Returns: + the left mapping, updated with nested values from the right mapping + + Example: + >>> a = { + ... 'key1': { + ... 'key2': { + ... 'key3': 'value1/2/3', + ... }, + ... }, + ... } + >>> deep_update(a, { + ... 'key1': { + ... 'key2': { + ... 'key4': 'value1/2/4', + ... }, + ... }, + ... }) == { + ... 'key1': { + ... 'key2': { + ... 'key3': 'value1/2/3', + ... 'key4': 'value1/2/4', + ... 
}, + ... }, + ... } + True + >>> deep_update(a, { + ... 'key1': { + ... 'key2': { + ... 'key3': 'newvalue1/2/3', + ... }, + ... }, + ... }) == { + ... 'key1': { + ... 'key2': { + ... 'key3': 'newvalue1/2/3', + ... 'key4': 'value1/2/4', + ... }, + ... }, + ... } + True + + """ + for key, rvalue in right.items(): + if isinstance(rvalue, collections.Mapping): + new_lvalue = deep_update(left.get(key, {}), rvalue) + left[key] = new_lvalue + else: + left[key] = rvalue + return left + + +class MerkleNode(dict, metaclass=abc.ABCMeta): + """Representation of a node in a Merkle Tree. + + A (generalized) `Merkle Tree`_ is a tree in which every node is labeled + with a hash of its own data and the hash of its children. + + .. _Merkle Tree: https://en.wikipedia.org/wiki/Merkle_tree + + In pseudocode:: + + node.hash = hash(node.data + + sum(child.hash for child in node.children)) + + This class efficiently implements the Merkle Tree data structure on top of + a Python :class:`dict`, minimizing hash computations and new data + collections when updating nodes. + + Node data is stored in the :attr:`data` attribute, while (named) children + are stored as items of the underlying dictionary. + + Addition, update and removal of objects are instrumented to automatically + invalidate the hashes of the current node as well as its registered + parents; It also resets the collection status of the objects so the updated + objects can be collected. + + The collection of updated data from the tree is implemented through the + :func:`collect` function and associated helpers. + + Attributes: + data (dict): data associated to the current node + parents (list): known parents of the current node + collected (bool): whether the current node has been collected + + """ + __slots__ = ['parents', 'data', '__hash', 'collected'] + + type = None + """Type of the current node (used as a classifier for :func:`collect`)""" + + def __init__(self, data=None): + super().__init__() + self.parents = [] + self.data = data + self.__hash = None + self.collected = False + + def invalidate_hash(self): + """Invalidate the cached hash of the current node.""" + if not self.__hash: + return + + self.__hash = None + self.collected = False + for parent in self.parents: + parent.invalidate_hash() + + def update_hash(self, *, force=False): + """Recursively compute the hash of the current node. + + Args: + force (bool): invalidate the cache and force the computation for + this node and all children. + """ + if self.__hash and not force: + return self.__hash + + if force: + self.invalidate_hash() + + for child in self.values(): + child.update_hash(force=force) + + self.__hash = self.compute_hash() + return self.__hash + + @property + def hash(self): + """The hash of the current node, as calculated by + :func:`compute_hash`. + """ + return self.update_hash() + + @abc.abstractmethod + def compute_hash(self): + """Compute the hash of the current node. + + The hash should depend on the data of the node, as well as on hashes + of the children nodes. 
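To give a feel for this contract (an illustrative toy subclass, not part of the commit)::

    import hashlib

    from swh.model.merkle import MerkleNode

    class ToyNode(MerkleNode):
        type = 'toy'

        def compute_hash(self):
            # Hash of own data followed by children hashes, in name order.
            h = hashlib.sha1(self.data or b'')
            for name in sorted(self):
                h.update(self[name].hash)
            return h.digest()

    root = ToyNode(b'root data')
    root[b'child'] = ToyNode(b'leaf data')
    before = root.hash

    root[b'other'] = ToyNode(b'more data')  # invalidates cached hashes
    assert root.hash != before

    # Collected objects are grouped by node type and keyed by hash.
    assert set(root.collect()) == {'toy'}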
+ """ + raise NotImplementedError('Must implement compute_hash method') + + def __setitem__(self, name, new_child): + """Add a child, invalidating the current hash""" + self.invalidate_hash() + + super().__setitem__(name, new_child) + + new_child.parents.append(self) + + def __delitem__(self, name): + """Remove a child, invalidating the current hash""" + if name in self: + self.invalidate_hash() + self[name].parents.remove(self) + super().__delitem__(name) + else: + raise KeyError(name) + + def update(self, new_children): + """Add several named children from a dictionary""" + if not new_children: + return + + self.invalidate_hash() + + for name, new_child in new_children.items(): + new_child.parents.append(self) + if name in self: + self[name].parents.remove(self) + + super().update(new_children) + + def get_data(self, **kwargs): + """Retrieve and format the collected data for the current node, for use by + :func:`collect`. + + Can be overridden, for instance when you want the collected data to + contain information about the child nodes. + + Arguments: + kwargs: allow subclasses to alter behaviour depending on how + :func:`collect` is called. + + Returns: + data formatted for :func:`collect` + """ + return self.data + + def collect_node(self, **kwargs): + """Collect the data for the current node, for use by :func:`collect`. + + Arguments: + kwargs: passed as-is to :func:`get_data`. + + Returns: + A :class:`dict` compatible with :func:`collect`. + """ + if not self.collected: + self.collected = True + return {self.type: {self.hash: self.get_data(**kwargs)}} + else: + return {} + + def collect(self, **kwargs): + """Collect the data for all nodes in the subtree rooted at `self`. + + The data is deduplicated by type and by hash. + + Arguments: + kwargs: passed as-is to :func:`get_data`. + + Returns: + A :class:`dict` with the following structure:: + + { + 'typeA': { + node1.hash: node1.get_data(), + node2.hash: node2.get_data(), + }, + 'typeB': { + node3.hash: node3.get_data(), + ... + }, + ... + } + """ + ret = self.collect_node(**kwargs) + for child in self.values(): + deep_update(ret, child.collect(**kwargs)) + + return ret + + def reset_collect(self): + """Recursively unmark collected nodes in the subtree rooted at `self`. + + This lets the caller use :func:`collect` again. + """ + self.collected = False + + for child in self.values(): + child.reset_collect() + + +class MerkleLeaf(MerkleNode): + """A leaf to a Merkle tree. + + A Merkle leaf is simply a Merkle node with children disabled. + """ + __slots__ = [] + + def __setitem__(self, name, child): + raise ValueError('%s is a leaf' % self.__class__.__name__) + + def __getitem__(self, name): + raise ValueError('%s is a leaf' % self.__class__.__name__) + + def __delitem__(self, name): + raise ValueError('%s is a leaf' % self.__class__.__name__) + + def update(self, new_children): + """Children update operation. 
Disabled for leaves.""" + raise ValueError('%s is a leaf' % self.__class__.__name__) diff --git a/swh/model/tests/generate_testdata_from_disk.py b/swh/model/tests/generate_testdata_from_disk.py new file mode 100644 index 0000000..35d4f48 --- /dev/null +++ b/swh/model/tests/generate_testdata_from_disk.py @@ -0,0 +1,92 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from operator import itemgetter +import os +import sys + +from swh.model.from_disk import Directory, DentryPerms +from swh.model.hashutil import ALGORITHMS, hash_to_hex + + +def generate_from_directory(varname, directory, indent=0): + """Generate test data from a given directory""" + def get_data(member, path): + yield (path, member.get_data()) + if isinstance(member, Directory): + for name, child in member.items(): + yield from get_data(child, os.path.join(path, name)) + + data = dict(get_data(directory, b'')) + out = [] + + def format_hash(h, indent=0): + spindent = ' ' * indent + if len(h) > 20: + cutoff = len(h)//2 + parts = h[:cutoff], h[cutoff:] + else: + parts = [h] + + out.append('hash_to_bytes(\n') + for part in parts: + out.append(spindent + ' %s\n' % repr(hash_to_hex(part))) + out.append(spindent + ')') + + def format_dict_items(d, indent=0): + spindent = ' ' * indent + for key, value in sorted(d.items()): + if isinstance(key, bytes): + out.append(spindent + repr(key) + ': {\n') + format_dict_items(value, indent=indent + 4) + out.append(spindent + '}') + else: + out.append(spindent + repr(key) + ': ') + if key == 'entries': + if not value: + out.append('[]') + else: + out.append('[') + last_index = len(value) - 1 + for i, entry in enumerate( + sorted(value, key=itemgetter('name'))): + if i: + out.append(' ') + out.append('{\n') + format_dict_items(entry, indent=indent + 4) + if i != last_index: + out.append(spindent + '},') + out.append(spindent + '}]') + elif key in ALGORITHMS | {'id', 'target'}: + format_hash(value, indent=indent) + elif isinstance(value, DentryPerms): + out.append(str(value)) + else: + out.append(repr(value)) + out.append(',\n') + + spindent = ' ' * indent + out.append(spindent + '%s = {\n' % varname) + format_dict_items(data, indent=4 + indent) + out.append(spindent + '}') + + return ''.join(out) + + +if __name__ == '__main__': + if not sys.argv[1:]: + print("Usage: %s dir1 dir2" % sys.argv[0], file=sys.stderr) + exit(2) + + for dirname in sys.argv[1:]: + basename = os.path.basename(dirname) + varname = 'expected_%s' % basename + testdata = generate_from_directory( + varname, + Directory.from_disk(path=os.fsencode(dirname)), + indent=8 + ) + print(testdata) + print() diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py new file mode 100644 index 0000000..8e568ec --- /dev/null +++ b/swh/model/tests/test_from_disk.py @@ -0,0 +1,789 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import tarfile +import tempfile +import unittest + +from nose.plugins.attrib import attr + +from swh.model import from_disk +from swh.model.from_disk import Content, Directory, DentryPerms +from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex + + +class 
ModeToPerms(unittest.TestCase): + def setUp(self): + super().setUp() + + # Generate a full permissions map + self.perms_map = {} + + # Symlinks + for i in range(0o120000, 0o127777 + 1): + self.perms_map[i] = DentryPerms.symlink + + # Directories + for i in range(0o040000, 0o047777 + 1): + self.perms_map[i] = DentryPerms.directory + + # Other file types: socket, regular file, block device, character + # device, fifo all map to regular files + for ft in [0o140000, 0o100000, 0o060000, 0o020000, 0o010000]: + for i in range(ft, ft + 0o7777 + 1): + if i & 0o111: + # executable bits are set + self.perms_map[i] = DentryPerms.executable_content + else: + self.perms_map[i] = DentryPerms.content + + def test_exhaustive_mode_to_perms(self): + for fmode, perm in self.perms_map.items(): + self.assertEqual(perm, from_disk.mode_to_perms(fmode)) + + +class DataMixin: + maxDiff = None + + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory( + prefix='swh.model.from_disk' + ) + self.tmpdir_name = os.fsencode(self.tmpdir.name) + + self.contents = { + b'file': { + 'data': b'42\n', + 'sha1': hash_to_bytes( + '34973274ccef6ab4dfaaf86599792fa9c3fe4689' + ), + 'sha256': hash_to_bytes( + '084c799cd551dd1d8d5c5f9a5d593b2e' + '931f5e36122ee5c793c1d08a19839cc0' + ), + 'sha1_git': hash_to_bytes( + 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'), + 'blake2s256': hash_to_bytes( + 'd5fe1939576527e42cfd76a9455a2432' + 'fe7f56669564577dd93c4280e76d661d' + ), + 'length': 3, + 'mode': 0o100644 + }, + } + + self.symlinks = { + b'symlink': { + 'data': b'target', + 'blake2s256': hash_to_bytes( + '595d221b30fdd8e10e2fdf18376e688e' + '9f18d56fd9b6d1eb6a822f8c146c6da6' + ), + 'sha1': hash_to_bytes( + '0e8a3ad980ec179856012b7eecf4327e99cd44cd' + ), + 'sha1_git': hash_to_bytes( + '1de565933b05f74c75ff9a6520af5f9f8a5a2f1d' + ), + 'sha256': hash_to_bytes( + '34a04005bcaf206eec990bd9637d9fdb' + '6725e0a0c0d4aebf003f17f4c956eb5c' + ), + 'length': 6, + 'perms': DentryPerms.symlink, + } + } + + self.specials = { + b'fifo': os.mkfifo, + b'devnull': lambda path: os.mknod(path, device=os.makedev(1, 3)), + } + + self.empty_content = { + 'data': b'', + 'length': 0, + 'blake2s256': hash_to_bytes( + '69217a3079908094e11121d042354a7c' + '1f55b6482ca1a51e1b250dfd1ed0eef9' + ), + 'sha1': hash_to_bytes( + 'da39a3ee5e6b4b0d3255bfef95601890afd80709' + ), + 'sha1_git': hash_to_bytes( + 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391' + ), + 'sha256': hash_to_bytes( + 'e3b0c44298fc1c149afbf4c8996fb924' + '27ae41e4649b934ca495991b7852b855' + ), + 'perms': DentryPerms.content, + } + + self.empty_directory = { + 'id': hash_to_bytes( + '4b825dc642cb6eb9a060e54bf8d69288fbee4904' + ), + 'entries': [], + } + + # Generated with generate_testdata_from_disk + self.tarball_contents = { + b'': { + 'entries': [{ + 'name': b'bar', + 'perms': DentryPerms.directory, + 'target': hash_to_bytes( + '3c1f578394f4623f74a0ba7fe761729f59fc6ec4' + ), + 'type': 'dir', + }, { + 'name': b'empty-folder', + 'perms': DentryPerms.directory, + 'target': hash_to_bytes( + '4b825dc642cb6eb9a060e54bf8d69288fbee4904' + ), + 'type': 'dir', + }, { + 'name': b'foo', + 'perms': DentryPerms.directory, + 'target': hash_to_bytes( + '2b41c40f0d1fbffcba12497db71fba83fcca96e5' + ), + 'type': 'dir', + }, { + 'name': b'link-to-another-quote', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + '7d5c08111e21c8a9f71540939998551683375fad' + ), + 'type': 'file', + }, { + 'name': b'link-to-binary', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + 'e86b45e538d9b6888c969c89fbd22a85aa0e0366' 
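# A directory entry's 'target' is the sha1_git of the object it points
# at: for this symlink entry it is the id of the blob holding the link
# target, matching the b'link-to-binary' fixture further down.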
+ ), + 'type': 'file', + }, { + 'name': b'link-to-foo', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + '19102815663d23f8b75a47e7a01965dcdc96468c' + ), + 'type': 'file', + }, { + 'name': b'some-binary', + 'perms': DentryPerms.executable_content, + 'target': hash_to_bytes( + '68769579c3eaadbe555379b9c3538e6628bae1eb' + ), + 'type': 'file', + }], + 'id': hash_to_bytes( + 'e8b0f1466af8608c8a3fb9879db172b887e80759' + ), + }, + b'bar': { + 'entries': [{ + 'name': b'barfoo', + 'perms': DentryPerms.directory, + 'target': hash_to_bytes( + 'c3020f6bf135a38c6df3afeb5fb38232c5e07087' + ), + 'type': 'dir', + }], + 'id': hash_to_bytes( + '3c1f578394f4623f74a0ba7fe761729f59fc6ec4' + ), + }, + b'bar/barfoo': { + 'entries': [{ + 'name': b'another-quote.org', + 'perms': DentryPerms.content, + 'target': hash_to_bytes( + '133693b125bad2b4ac318535b84901ebb1f6b638' + ), + 'type': 'file', + }], + 'id': hash_to_bytes( + 'c3020f6bf135a38c6df3afeb5fb38232c5e07087' + ), + }, + b'bar/barfoo/another-quote.org': { + 'blake2s256': hash_to_bytes( + 'd26c1cad82d43df0bffa5e7be11a60e3' + '4adb85a218b433cbce5278b10b954fe8' + ), + 'length': 72, + 'perms': DentryPerms.content, + 'sha1': hash_to_bytes( + '90a6138ba59915261e179948386aa1cc2aa9220a' + ), + 'sha1_git': hash_to_bytes( + '133693b125bad2b4ac318535b84901ebb1f6b638' + ), + 'sha256': hash_to_bytes( + '3db5ae168055bcd93a4d08285dc99ffe' + 'e2883303b23fac5eab850273a8ea5546' + ), + }, + b'empty-folder': { + 'entries': [], + 'id': hash_to_bytes( + '4b825dc642cb6eb9a060e54bf8d69288fbee4904' + ), + }, + b'foo': { + 'entries': [{ + 'name': b'barfoo', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + '8185dfb2c0c2c597d16f75a8a0c37668567c3d7e' + ), + 'type': 'file', + }, { + 'name': b'quotes.md', + 'perms': DentryPerms.content, + 'target': hash_to_bytes( + '7c4c57ba9ff496ad179b8f65b1d286edbda34c9a' + ), + 'type': 'file', + }, { + 'name': b'rel-link-to-barfoo', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + 'acac326ddd63b0bc70840659d4ac43619484e69f' + ), + 'type': 'file', + }], + 'id': hash_to_bytes( + '2b41c40f0d1fbffcba12497db71fba83fcca96e5' + ), + }, + b'foo/barfoo': { + 'blake2s256': hash_to_bytes( + 'e1252f2caa4a72653c4efd9af871b62b' + 'f2abb7bb2f1b0e95969204bd8a70d4cd' + ), + 'data': b'bar/barfoo', + 'length': 10, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + '9057ee6d0162506e01c4d9d5459a7add1fedac37' + ), + 'sha1_git': hash_to_bytes( + '8185dfb2c0c2c597d16f75a8a0c37668567c3d7e' + ), + 'sha256': hash_to_bytes( + '29ad3f5725321b940332c78e403601af' + 'ff61daea85e9c80b4a7063b6887ead68' + ), + }, + b'foo/quotes.md': { + 'blake2s256': hash_to_bytes( + 'bf7ce4fe304378651ee6348d3e9336ed' + '5ad603d33e83c83ba4e14b46f9b8a80b' + ), + 'length': 66, + 'perms': DentryPerms.content, + 'sha1': hash_to_bytes( + '1bf0bb721ac92c18a19b13c0eb3d741cbfadebfc' + ), + 'sha1_git': hash_to_bytes( + '7c4c57ba9ff496ad179b8f65b1d286edbda34c9a' + ), + 'sha256': hash_to_bytes( + 'caca942aeda7b308859eb56f909ec96d' + '07a499491690c453f73b9800a93b1659' + ), + }, + b'foo/rel-link-to-barfoo': { + 'blake2s256': hash_to_bytes( + 'd9c327421588a1cf61f316615005a2e9' + 'c13ac3a4e96d43a24138d718fa0e30db' + ), + 'data': b'../bar/barfoo', + 'length': 13, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + 'dc51221d308f3aeb2754db48391b85687c2869f4' + ), + 'sha1_git': hash_to_bytes( + 'acac326ddd63b0bc70840659d4ac43619484e69f' + ), + 'sha256': hash_to_bytes( + '8007d20db2af40435f42ddef4b8ad76b' + '80adbec26b249fdf0473353f8d99df08' + ), + }, + 
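# Fixture convention: symlink entries keep their link target in 'data',
# regular files omit it here, and 'perms' distinguishes
# DentryPerms.content, .executable_content and .symlink, mirroring the
# git file modes exercised in ModeToPerms above.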
b'link-to-another-quote': { + 'blake2s256': hash_to_bytes( + '2d0e73cea01ba949c1022dc10c8a43e6' + '6180639662e5dc2737b843382f7b1910' + ), + 'data': b'bar/barfoo/another-quote.org', + 'length': 28, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + 'cbeed15e79599c90de7383f420fed7acb48ea171' + ), + 'sha1_git': hash_to_bytes( + '7d5c08111e21c8a9f71540939998551683375fad' + ), + 'sha256': hash_to_bytes( + 'e6e17d0793aa750a0440eb9ad5b80b25' + '8076637ef0fb68f3ac2e59e4b9ac3ba6' + ), + }, + b'link-to-binary': { + 'blake2s256': hash_to_bytes( + '9ce18b1adecb33f891ca36664da676e1' + '2c772cc193778aac9a137b8dc5834b9b' + ), + 'data': b'some-binary', + 'length': 11, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + 'd0248714948b3a48a25438232a6f99f0318f59f1' + ), + 'sha1_git': hash_to_bytes( + 'e86b45e538d9b6888c969c89fbd22a85aa0e0366' + ), + 'sha256': hash_to_bytes( + '14126e97d83f7d261c5a6889cee73619' + '770ff09e40c5498685aba745be882eff' + ), + }, + b'link-to-foo': { + 'blake2s256': hash_to_bytes( + '08d6cad88075de8f192db097573d0e82' + '9411cd91eb6ec65e8fc16c017edfdb74' + ), + 'data': b'foo', + 'length': 3, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + '0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33' + ), + 'sha1_git': hash_to_bytes( + '19102815663d23f8b75a47e7a01965dcdc96468c' + ), + 'sha256': hash_to_bytes( + '2c26b46b68ffc68ff99b453c1d304134' + '13422d706483bfa0f98a5e886266e7ae' + ), + }, + b'some-binary': { + 'blake2s256': hash_to_bytes( + '922e0f7015035212495b090c27577357' + 'a740ddd77b0b9e0cd23b5480c07a18c6' + ), + 'length': 5, + 'perms': DentryPerms.executable_content, + 'sha1': hash_to_bytes( + '0bbc12d7f4a2a15b143da84617d95cb223c9b23c' + ), + 'sha1_git': hash_to_bytes( + '68769579c3eaadbe555379b9c3538e6628bae1eb' + ), + 'sha256': hash_to_bytes( + 'bac650d34a7638bb0aeb5342646d24e3' + 'b9ad6b44c9b383621faa482b990a367d' + ), + }, + } + + def tearDown(self): + self.tmpdir.cleanup() + + def assertContentEqual(self, left, right, *, check_data=False, # noqa + check_path=False): + if not isinstance(left, Content): + raise ValueError('%s is not a Content' % left) + if isinstance(right, Content): + right = right.get_data() + + keys = DEFAULT_ALGORITHMS | { + 'length', + 'perms', + } + if check_data: + keys |= {'data'} + if check_path: + keys |= {'path'} + + failed = [] + for key in keys: + try: + lvalue = left.data[key] + if key == 'perms' and 'perms' not in right: + rvalue = from_disk.mode_to_perms(right['mode']) + else: + rvalue = right[key] + except KeyError: + failed.append(key) + continue + + if lvalue != rvalue: + failed.append(key) + + if failed: + raise self.failureException( + 'Content mismatched:\n' + + '\n'.join( + 'content[%s] = %r != %r' % ( + key, left.data.get(key), right.get(key)) + for key in failed + ) + ) + + def assertDirectoryEqual(self, left, right): # NoQA + if not isinstance(left, Directory): + raise ValueError('%s is not a Directory' % left) + if isinstance(right, Directory): + right = right.get_data() + + return self.assertCountEqual(left.entries, right['entries']) + + def make_contents(self, directory): + for filename, content in self.contents.items(): + path = os.path.join(directory, filename) + with open(path, 'wb') as f: + f.write(content['data']) + os.chmod(path, content['mode']) + + def make_symlinks(self, directory): + for filename, symlink in self.symlinks.items(): + path = os.path.join(directory, filename) + os.symlink(symlink['data'], path) + + def make_specials(self, directory): + for filename, fn in self.specials.items(): + path = 
os.path.join(directory, filename) + fn(path) + + def make_from_tarball(self, directory): + tarball = os.path.join(os.path.dirname(__file__), + '../../../..', + 'swh-storage-testdata', + 'dir-folders', + 'sample-folder.tgz') + + with tarfile.open(tarball, 'r:gz') as f: + f.extractall(os.fsdecode(directory)) + + +class TestContent(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + + def test_data_to_content(self): + for filename, content in self.contents.items(): + conv_content = Content.from_bytes(mode=content['mode'], + data=content['data']) + self.assertContentEqual(conv_content, content) + self.assertIn(hash_to_hex(conv_content.hash), repr(conv_content)) + + +class SymlinkToContent(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + self.make_symlinks(self.tmpdir_name) + + def test_symlink_to_content(self): + for filename, symlink in self.symlinks.items(): + path = os.path.join(self.tmpdir_name, filename) + perms = 0o120000 + conv_content = Content.from_symlink(path=path, mode=perms) + self.assertContentEqual(conv_content, symlink) + + +class FileToContent(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + self.make_contents(self.tmpdir_name) + self.make_symlinks(self.tmpdir_name) + self.make_specials(self.tmpdir_name) + + def test_file_to_content(self): + # Check whether loading the data works + for data in [True, False]: + for filename, symlink in self.symlinks.items(): + path = os.path.join(self.tmpdir_name, filename) + conv_content = Content.from_file(path=path, data=data) + self.assertContentEqual(conv_content, symlink, check_data=data) + + for filename, content in self.contents.items(): + path = os.path.join(self.tmpdir_name, filename) + conv_content = Content.from_file(path=path, data=data) + self.assertContentEqual(conv_content, content, check_data=data) + + for filename in self.specials: + path = os.path.join(self.tmpdir_name, filename) + conv_content = Content.from_file(path=path, data=data) + self.assertContentEqual(conv_content, self.empty_content) + + def test_file_to_content_with_path(self): + for filename, content in self.contents.items(): + content_w_path = content.copy() + path = os.path.join(self.tmpdir_name, filename) + content_w_path['path'] = path + conv_content = Content.from_file(path=path, save_path=True) + self.assertContentEqual(conv_content, content_w_path, + check_path=True) + + +class DirectoryToObjects(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + contents = os.path.join(self.tmpdir_name, b'contents') + os.mkdir(contents) + self.make_contents(contents) + symlinks = os.path.join(self.tmpdir_name, b'symlinks') + os.mkdir(symlinks) + self.make_symlinks(symlinks) + specials = os.path.join(self.tmpdir_name, b'specials') + os.mkdir(specials) + self.make_specials(specials) + empties = os.path.join(self.tmpdir_name, b'empty1', b'empty2') + os.makedirs(empties) + + def test_directory_to_objects(self): + directory = Directory.from_disk(path=self.tmpdir_name) + + for name, value in self.contents.items(): + self.assertContentEqual(directory[b'contents/' + name], value) + + for name, value in self.symlinks.items(): + self.assertContentEqual(directory[b'symlinks/' + name], value) + + for name in self.specials: + self.assertContentEqual( + directory[b'specials/' + name], + self.empty_content, + ) + + self.assertEqual( + directory[b'empty1/empty2'].get_data(), + self.empty_directory, + ) + + # Raise on non existent file + with self.assertRaisesRegex(KeyError, "b'nonexistent'"): + 
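# The KeyError reports the missing path component itself
# (b'nonexistent'), not the full requested path.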
directory[b'empty1/nonexistent'] + + # Raise on non existent directory + with self.assertRaisesRegex(KeyError, "b'nonexistentdir'"): + directory[b'nonexistentdir/file'] + + objs = directory.collect() + + self.assertCountEqual(['content', 'directory'], objs) + + self.assertEqual(len(objs['directory']), 6) + self.assertEqual(len(objs['content']), + len(self.contents) + + len(self.symlinks) + + 1) + + def test_directory_to_objects_ignore_empty(self): + directory = Directory.from_disk( + path=self.tmpdir_name, + dir_filter=from_disk.ignore_empty_directories + ) + + for name, value in self.contents.items(): + self.assertContentEqual(directory[b'contents/' + name], value) + + for name, value in self.symlinks.items(): + self.assertContentEqual(directory[b'symlinks/' + name], value) + + for name in self.specials: + self.assertContentEqual( + directory[b'specials/' + name], + self.empty_content, + ) + + # empty directories have been ignored recursively + with self.assertRaisesRegex(KeyError, "b'empty1'"): + directory[b'empty1'] + with self.assertRaisesRegex(KeyError, "b'empty1'"): + directory[b'empty1/empty2'] + + objs = directory.collect() + + self.assertCountEqual(['content', 'directory'], objs) + + self.assertEqual(len(objs['directory']), 4) + self.assertEqual(len(objs['content']), + len(self.contents) + + len(self.symlinks) + + 1) + + def test_directory_to_objects_ignore_name(self): + directory = Directory.from_disk( + path=self.tmpdir_name, + dir_filter=from_disk.ignore_named_directories([b'symlinks']) + ) + for name, value in self.contents.items(): + self.assertContentEqual(directory[b'contents/' + name], value) + + for name in self.specials: + self.assertContentEqual( + directory[b'specials/' + name], + self.empty_content, + ) + + self.assertEqual( + directory[b'empty1/empty2'].get_data(), + self.empty_directory, + ) + + with self.assertRaisesRegex(KeyError, "b'symlinks'"): + directory[b'symlinks'] + + objs = directory.collect() + + self.assertCountEqual(['content', 'directory'], objs) + + self.assertEqual(len(objs['directory']), 5) + self.assertEqual(len(objs['content']), + len(self.contents) + + 1) + + def test_directory_to_objects_ignore_name_case(self): + directory = Directory.from_disk( + path=self.tmpdir_name, + dir_filter=from_disk.ignore_named_directories([b'symLiNks'], + case_sensitive=False) + ) + for name, value in self.contents.items(): + self.assertContentEqual(directory[b'contents/' + name], value) + + for name in self.specials: + self.assertContentEqual( + directory[b'specials/' + name], + self.empty_content, + ) + + self.assertEqual( + directory[b'empty1/empty2'].get_data(), + self.empty_directory, + ) + + with self.assertRaisesRegex(KeyError, "b'symlinks'"): + directory[b'symlinks'] + + objs = directory.collect() + + self.assertCountEqual(['content', 'directory'], objs) + + self.assertEqual(len(objs['directory']), 5) + self.assertEqual(len(objs['content']), + len(self.contents) + + 1) + + +@attr('fs') +class TarballTest(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + self.make_from_tarball(self.tmpdir_name) + + def test_contents_match(self): + directory = Directory.from_disk( + path=os.path.join(self.tmpdir_name, b'sample-folder') + ) + + for name, data in self.tarball_contents.items(): + obj = directory[name] + if isinstance(obj, Content): + self.assertContentEqual(obj, data) + elif isinstance(obj, Directory): + self.assertDirectoryEqual(obj, data) + else: + raise self.failureException('Unknown type for %s' % obj) + + +class 
DirectoryManipulation(DataMixin, unittest.TestCase): + def test_directory_access_nested(self): + d = Directory() + d[b'a'] = Directory() + d[b'a/b'] = Directory() + + self.assertEqual(d[b'a/b'].get_data(), self.empty_directory) + + def test_directory_del_nested(self): + d = Directory() + d[b'a'] = Directory() + d[b'a/b'] = Directory() + + with self.assertRaisesRegex(KeyError, "b'c'"): + del d[b'a/b/c'] + + with self.assertRaisesRegex(KeyError, "b'level2'"): + del d[b'a/level2/c'] + + del d[b'a/b'] + + self.assertEqual(d[b'a'].get_data(), self.empty_directory) + + def test_directory_access_self(self): + d = Directory() + self.assertIs(d, d[b'']) + self.assertIs(d, d[b'/']) + self.assertIs(d, d[b'//']) + + def test_directory_access_wrong_type(self): + d = Directory() + with self.assertRaisesRegex(ValueError, 'bytes from Directory'): + d['foo'] + with self.assertRaisesRegex(ValueError, 'bytes from Directory'): + d[42] + + def test_directory_repr(self): + entries = [b'a', b'b', b'c'] + d = Directory() + for entry in entries: + d[entry] = Directory() + + r = repr(d) + self.assertIn(hash_to_hex(d.hash), r) + + for entry in entries: + self.assertIn(str(entry), r) + + def test_directory_set_wrong_type_name(self): + d = Directory() + with self.assertRaisesRegex(ValueError, 'bytes Directory entry'): + d['foo'] = Directory() + with self.assertRaisesRegex(ValueError, 'bytes Directory entry'): + d[42] = Directory() + + def test_directory_set_nul_in_name(self): + d = Directory() + + with self.assertRaisesRegex(ValueError, 'nul bytes'): + d[b'\x00\x01'] = Directory() + + def test_directory_set_empty_name(self): + d = Directory() + with self.assertRaisesRegex(ValueError, 'must have a name'): + d[b''] = Directory() + with self.assertRaisesRegex(ValueError, 'must have a name'): + d[b'/'] = Directory() + + def test_directory_set_wrong_type(self): + d = Directory() + with self.assertRaisesRegex(ValueError, 'Content or Directory'): + d[b'entry'] = object() + + def test_directory_del_wrong_type(self): + d = Directory() + with self.assertRaisesRegex(ValueError, 'bytes Directory entry'): + del d['foo'] + with self.assertRaisesRegex(ValueError, 'bytes Directory entry'): + del d[42] diff --git a/swh/model/tests/test_git.py b/swh/model/tests/test_git.py deleted file mode 100644 index 0bf81bc..0000000 --- a/swh/model/tests/test_git.py +++ /dev/null @@ -1,734 +0,0 @@ -# Copyright (C) 2015-2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import os -import shutil -import subprocess -import tempfile -import unittest - -from nose.plugins.attrib import attr -from nose.tools import istest - -from swh.model import git - - -class GitHashlib(unittest.TestCase): - def setUp(self): - self.tree_data = b''.join([b'40000 barfoo\0', - bytes.fromhex('c3020f6bf135a38c6df' - '3afeb5fb38232c5e07087'), - b'100644 blah\0', - bytes.fromhex('63756ef0df5e4f10b6efa' - '33cfe5c758749615f20'), - b'100644 hello\0', - bytes.fromhex('907b308167f0880fb2a' - '5c0e1614bb0c7620f9dc3')]) - - self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970 -author Antoine R. Dumont (@ardumont) 1444054085 +0200 -committer Antoine R. Dumont (@ardumont) 1444054085 +0200 - -initial -""".encode('utf-8') # NOQA - self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241 -type commit -tag 0.0.1 -tagger Antoine R. 
Dumont (@ardumont) 1444225145 +0200 - -blah -""".encode('utf-8') # NOQA - - self.checksums = { - 'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db' - '121dacdb1c'), - 'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399' - 'd629189653'), - 'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534' - 'e9e959f120'), - } - - @istest - def compute_directory_git_sha1(self): - # given - dirpath = 'some-dir-path' - hashes = { - dirpath: [{'perms': git.GitPerm.TREE, - 'type': git.GitType.TREE, - 'name': b'barfoo', - 'sha1_git': bytes.fromhex('c3020f6bf135a38c6df' - '3afeb5fb38232c5e07087')}, - {'perms': git.GitPerm.BLOB, - 'type': git.GitType.BLOB, - 'name': b'hello', - 'sha1_git': bytes.fromhex('907b308167f0880fb2a' - '5c0e1614bb0c7620f9dc3')}, - {'perms': git.GitPerm.BLOB, - 'type': git.GitType.BLOB, - 'name': b'blah', - 'sha1_git': bytes.fromhex('63756ef0df5e4f10b6efa' - '33cfe5c758749615f20')}] - } - - # when - checksum = git.compute_directory_git_sha1(dirpath, hashes) - - # then - self.assertEqual(checksum, self.checksums['tree_sha1_git']) - - @istest - def compute_revision_sha1_git(self): - # given - tree_hash = bytes.fromhex('1c61f7259dcb770f46b194d941df4f08ff0a3970') - revision = { - 'author': { - 'name': b'Antoine R. Dumont (@ardumont)', - 'email': b'antoine.romain.dumont@gmail.com', - }, - 'date': { - 'timestamp': 1444054085, - 'offset': 120, - }, - 'committer': { - 'name': b'Antoine R. Dumont (@ardumont)', - 'email': b'antoine.romain.dumont@gmail.com', - }, - 'committer_date': { - 'timestamp': 1444054085, - 'offset': 120, - }, - 'message': b'initial\n', - 'type': 'tar', - 'directory': tree_hash, - 'parents': [], - } - - # when - checksum = git.compute_revision_sha1_git(revision) - - # then - self.assertEqual(checksum, self.checksums['commit_sha1_git']) - - @istest - def compute_release_sha1_git(self): - # given - revision_hash = bytes.fromhex('24d012aaec0bc5a4d2f62c56399053' - 'd6cc72a241') - release = { - 'name': b'0.0.1', - 'author': { - 'name': b'Antoine R. Dumont (@ardumont)', - 'email': b'antoine.romain.dumont@gmail.com', - }, - 'date': { - 'timestamp': 1444225145, - 'offset': 120, - }, - 'message': b'blah\n', - 'target_type': 'revision', - 'target': revision_hash, - } - - # when - checksum = git.compute_release_sha1_git(release) - - # then - self.assertEqual(checksum, self.checksums['tag_sha1_git']) - - -@attr('fs') -class ComputeBlobMetadata(unittest.TestCase): - @istest - def compute_blob_metadata__special_file_returns_nothing(self): - # prepare - tmp_root_path = tempfile.mkdtemp().encode('utf-8') - name = b'fifo-file' - path = os.path.join(tmp_root_path, name) - - # given - os.mkfifo(path) - - # when - actual_metadata = git.compute_blob_metadata(path) - - # then - expected_metadata = { - 'sha1': b'\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t', - 'sha1_git': b'\xe6\x9d\xe2\x9b\xb2\xd1\xd6CK\x8b)\xaewZ\xd8\xc2' - b'\xe4\x8cS\x91', - 'sha256': b"\xe3\xb0\xc4B\x98\xfc\x1c\x14\x9a\xfb\xf4\xc8\x99o" - b"\xb9$'\xaeA\xe4d\x9b\x93L\xa4\x95\x99\x1bxR\xb8U", - 'blake2s256': b'i!z0y\x90\x80\x94\xe1\x11!\xd0B5J|\x1fU\xb6H,\xa1' - b'\xa5\x1e\x1b%\r\xfd\x1e\xd0\xee\xf9', - 'perms': git.GitPerm.BLOB, - 'path': path, - 'name': name, - 'type': git.GitType.BLOB, - 'length': 0 - } - - self.assertEquals(actual_metadata, expected_metadata) - - # cleanup - shutil.rmtree(tmp_root_path) - - -@attr('fs') -class GitHashWalkArborescenceTree: - """Root class to ease walk and git hash testing without side-effecty - problems. 
- - """ - def setUp(self): - super().setUp() - self.tmp_root_path = tempfile.mkdtemp().encode('utf-8') - self.maxDiff = None - - start_path = os.path.dirname(__file__).encode('utf-8') - sample_folder = os.path.join(start_path, - b'../../../..', - b'swh-storage-testdata', - b'dir-folders', - b'sample-folder.tgz') - - self.root_path = os.path.join(self.tmp_root_path, b'sample-folder') - - # uncompress the sample folder - subprocess.check_output( - ['tar', 'xvf', sample_folder, '-C', self.tmp_root_path]) - - def tearDown(self): - if os.path.exists(self.tmp_root_path): - shutil.rmtree(self.tmp_root_path) - - -class GitHashFromScratch(GitHashWalkArborescenceTree, unittest.TestCase): - """Test the main `walk_and_compute_sha1_from_directory` algorithm that - scans and compute the disk for checksums. - - """ - @istest - def walk_and_compute_sha1_from_directory(self): - # make a temporary arborescence tree to hash without ignoring anything - # same as previous behavior - walk0 = git.walk_and_compute_sha1_from_directory(self.tmp_root_path) - - keys0 = list(walk0.keys()) - path_excluded = os.path.join(self.tmp_root_path, - b'sample-folder', - b'foo') - self.assertTrue(path_excluded in keys0) # it is not excluded here - - # make the same temporary arborescence tree to hash with ignoring one - # folder foo - walk1 = git.walk_and_compute_sha1_from_directory( - self.tmp_root_path, - dir_ok_fn=lambda dirpath: b'sample-folder/foo' not in dirpath) - keys1 = list(walk1.keys()) - self.assertTrue(path_excluded not in keys1) - - # remove the keys that can't be the same (due to hash definition) - # Those are the top level folders - keys_diff = [self.tmp_root_path, - os.path.join(self.tmp_root_path, b'sample-folder'), - git.ROOT_TREE_KEY] - for k in keys_diff: - self.assertNotEquals(walk0[k], walk1[k]) - - # The remaining keys (bottom path) should have exactly the same hashes - # as before - keys = set(keys1) - set(keys_diff) - actual_walk1 = {} - for k in keys: - self.assertEquals(walk0[k], walk1[k]) - actual_walk1[k] = walk1[k] - - expected_checksums = { - os.path.join(self.tmp_root_path, b'sample-folder/empty-folder'): [], # noqa - os.path.join(self.tmp_root_path, b'sample-folder/bar/barfoo'): [{ # noqa - 'type': git.GitType.BLOB, # noqa - 'length': 72, - 'sha256': b'=\xb5\xae\x16\x80U\xbc\xd9:M\x08(]\xc9\x9f\xfe\xe2\x883\x03\xb2?\xac^\xab\x85\x02s\xa8\xeaUF', # noqa - 'name': b'another-quote.org', # noqa - 'path': os.path.join(self.tmp_root_path, b'sample-folder/bar/barfoo/another-quote.org'), # noqa - 'perms': git.GitPerm.BLOB, # noqa - 'sha1': b'\x90\xa6\x13\x8b\xa5\x99\x15&\x1e\x17\x99H8j\xa1\xcc*\xa9"\n', # noqa - 'blake2s256': b'\xd2l\x1c\xad\x82\xd4=\xf0\xbf\xfa^{\xe1\x1a`\xe3J\xdb\x85\xa2\x18\xb43\xcb\xceRx\xb1\x0b\x95O\xe8', # noqa - 'sha1_git': b'\x136\x93\xb1%\xba\xd2\xb4\xac1\x855\xb8I\x01\xeb\xb1\xf6\xb68'}], # noqa - os.path.join(self.tmp_root_path, b'sample-folder/bar'): [{ # noqa - 'type': git.GitType.TREE, # noqa - 'perms': git.GitPerm.TREE, # noqa - 'name': b'barfoo', # noqa - 'path': os.path.join(self.tmp_root_path, b'sample-folder/bar/barfoo'), # noqa - 'sha1_git': b'\xc3\x02\x0fk\xf15\xa3\x8cm\xf3\xaf\xeb_\xb3\x822\xc5\xe0p\x87'}]} # noqa - - self.assertEquals(actual_walk1, expected_checksums) - - @istest - def walk_and_compute_sha1_from_directory_without_root_tree(self): - # compute the full checksums - expected_hashes = git.walk_and_compute_sha1_from_directory( - self.tmp_root_path) - - # except for the key on that round - actual_hashes = git.walk_and_compute_sha1_from_directory( - 
self.tmp_root_path, - with_root_tree=False) - - # then, removing the root tree hash from the first round - del expected_hashes[git.ROOT_TREE_KEY] - - # should give us the same checksums as the second round - self.assertEquals(actual_hashes, expected_hashes) - - -class WithSampleFolderChecksums: - def setUp(self): - super().setUp() - - self.rootkey = b'/tmp/tmp7w3oi_j8' - - self.objects = { - b'/tmp/tmp7w3oi_j8': { - 'children': {b'/tmp/tmp7w3oi_j8/sample-folder'}, - 'checksums': { - 'type': git.GitType.TREE, - 'name': b'tmp7w3oi_j8', - 'sha1_git': b'\xa7A\xfcM\x96\x8c{\x8e<\x94\xff\x86\xe7\x04\x80\xc5\xc7\xe5r\xa9', # noqa - 'path': b'/tmp/tmp7w3oi_j8', - 'perms': git.GitPerm.TREE - }, - }, - b'/tmp/tmp7w3oi_j8/sample-folder': { - 'children': { - b'/tmp/tmp7w3oi_j8/sample-folder/empty-folder', - b'/tmp/tmp7w3oi_j8/sample-folder/link-to-binary', - b'/tmp/tmp7w3oi_j8/sample-folder/link-to-another-quote', - b'/tmp/tmp7w3oi_j8/sample-folder/link-to-foo', - b'/tmp/tmp7w3oi_j8/sample-folder/some-binary', - b'/tmp/tmp7w3oi_j8/sample-folder/bar', - b'/tmp/tmp7w3oi_j8/sample-folder/foo', - }, - 'checksums': { - 'type': git.GitType.TREE, - 'name': b'sample-folder', - 'sha1_git': b'\xe8\xb0\xf1Fj\xf8`\x8c\x8a?\xb9\x87\x9d\xb1r\xb8\x87\xe8\x07Y', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder', - 'perms': git.GitPerm.TREE} - }, - b'/tmp/tmp7w3oi_j8/sample-folder/empty-folder': { - 'children': {}, - 'checksums': { - 'type': git.GitType.TREE, - 'name': b'empty-folder', - 'sha1_git': b'K\x82]\xc6B\xcbn\xb9\xa0`\xe5K\xf8\xd6\x92\x88\xfb\xeeI\x04', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/empty-folder', - 'perms': git.GitPerm.TREE - } - }, - b'/tmp/tmp7w3oi_j8/sample-folder/link-to-binary': { - 'checksums': { - 'name': b'link-to-binary', - 'sha1': b'\xd0$\x87\x14\x94\x8b:H\xa2T8#*o\x99\xf01\x8fY\xf1', # noqa - 'data': b'some-binary', - 'sha1_git': b'\xe8kE\xe58\xd9\xb6\x88\x8c\x96\x9c\x89\xfb\xd2*\x85\xaa\x0e\x03f', # noqa - 'blake2s256': b'\x9c\xe1\x8b\x1a\xde\xcb3\xf8\x91\xca6fM\xa6v\xe1,w,\xc1\x93w\x8a\xac\x9a\x13{\x8d\xc5\x83K\x9b', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/link-to-binary', - 'sha256': b'\x14\x12n\x97\xd8?}&\x1cZh\x89\xce\xe76\x19w\x0f\xf0\x9e@\xc5I\x86\x85\xab\xa7E\xbe\x88.\xff', # noqa - 'perms': git.GitPerm.LINK, - 'type': git.GitType.BLOB, - 'length': 11 - } - }, - b'/tmp/tmp7w3oi_j8/sample-folder/link-to-another-quote': { - 'checksums': { - 'name': b'link-to-another-quote', - 'sha1': b'\xcb\xee\xd1^yY\x9c\x90\xdes\x83\xf4 \xfe\xd7\xac\xb4\x8e\xa1q', # noqa - 'data': b'bar/barfoo/another-quote.org', - 'sha1_git': b'}\\\x08\x11\x1e!\xc8\xa9\xf7\x15@\x93\x99\x98U\x16\x837_\xad', # noqa - 'blake2s256': b"-\x0es\xce\xa0\x1b\xa9I\xc1\x02-\xc1\x0c\x8aC\xe6a\x80c\x96b\xe5\xdc'7\xb8C8/{\x19\x10", # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/link-to-another-quote', # noqa - 'sha256': b'\xe6\xe1}\x07\x93\xaau\n\x04@\xeb\x9a\xd5\xb8\x0b%\x80vc~\xf0\xfbh\xf3\xac.Y\xe4\xb9\xac;\xa6', # noqa - 'perms': git.GitPerm.LINK, - 'type': git.GitType.BLOB, - 'length': 28 - } - }, - b'/tmp/tmp7w3oi_j8/sample-folder/link-to-foo': { - 'checksums': { - 'name': b'link-to-foo', - 'sha1': b'\x0b\xee\xc7\xb5\xea?\x0f\xdb\xc9]\r\xd4\x7f<[\xc2u\xda\x8a3', # noqa - 'data': b'foo', - 'sha1_git': b'\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c', # noqa - 'blake2s256': b'\x08\xd6\xca\xd8\x80u\xde\x8f\x19-\xb0\x97W=\x0e\x82\x94\x11\xcd\x91\xebn\xc6^\x8f\xc1l\x01~\xdf\xdbt', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/link-to-foo', - 'sha256': 
b',&\xb4kh\xff\xc6\x8f\xf9\x9bE<\x1d0A4\x13B-pd\x83\xbf\xa0\xf9\x8a^\x88bf\xe7\xae', # noqa - 'perms': git.GitPerm.LINK, - 'type': git.GitType.BLOB, - 'length': 3 - } - }, - b'/tmp/tmp7w3oi_j8/sample-folder/some-binary': { - 'checksums': { - 'name': b'some-binary', - 'sha1': b'\x0b\xbc\x12\xd7\xf4\xa2\xa1[\x14=\xa8F\x17\xd9\\\xb2#\xc9\xb2<', # noqa - 'sha1_git': b'hv\x95y\xc3\xea\xad\xbeUSy\xb9\xc3S\x8ef(\xba\xe1\xeb', # noqa - 'blake2s256': b"\x92.\x0fp\x15\x03R\x12I[\t\x0c'WsW\xa7@\xdd\xd7{\x0b\x9e\x0c\xd2;T\x80\xc0z\x18\xc6", # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/some-binary', - 'sha256': b'\xba\xc6P\xd3Jv8\xbb\n\xebSBdm$\xe3\xb9\xadkD\xc9\xb3\x83b\x1f\xaaH+\x99\n6}', # noqa - 'perms': git.GitPerm.EXEC, - 'type': git.GitType.BLOB, - 'length': 5} - }, - b'/tmp/tmp7w3oi_j8/sample-folder/bar': { - 'children': {b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo'}, - 'checksums': {'type': git.GitType.TREE, - 'name': b'bar', - 'sha1_git': b'<\x1fW\x83\x94\xf4b?t\xa0\xba\x7f\xe7ar\x9fY\xfcn\xc4', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/bar', - 'perms': git.GitPerm.TREE}, - }, - b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo': { - 'children': {b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo/another-quote.org'}, # noqa - 'checksums': {'type': git.GitType.TREE, - 'name': b'barfoo', - 'sha1_git': b'\xc3\x02\x0fk\xf15\xa3\x8cm\xf3\xaf\xeb_\xb3\x822\xc5\xe0p\x87', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo', # noqa - 'perms': git.GitPerm.TREE}, - }, - b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo/another-quote.org': { - 'checksums': {'name': b'another-quote.org', - 'sha1': b'\x90\xa6\x13\x8b\xa5\x99\x15&\x1e\x17\x99H8j\xa1\xcc*\xa9"\n', # noqa - 'sha1_git': b'\x136\x93\xb1%\xba\xd2\xb4\xac1\x855\xb8I\x01\xeb\xb1\xf6\xb68', # noqa - 'blake2s256': b'\xd2l\x1c\xad\x82\xd4=\xf0\xbf\xfa^{\xe1\x1a`\xe3J\xdb\x85\xa2\x18\xb43\xcb\xceRx\xb1\x0b\x95O\xe8', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo/another-quote.org', # noqa - 'sha256': b'=\xb5\xae\x16\x80U\xbc\xd9:M\x08(]\xc9\x9f\xfe\xe2\x883\x03\xb2?\xac^\xab\x85\x02s\xa8\xeaUF', # noqa - 'perms': git.GitPerm.BLOB, - 'type': git.GitType.BLOB, - 'length': 72} - }, - b'/tmp/tmp7w3oi_j8/sample-folder/foo': { - 'children': { - b'/tmp/tmp7w3oi_j8/sample-folder/foo/barfoo', - b'/tmp/tmp7w3oi_j8/sample-folder/foo/rel-link-to-barfoo', - b'/tmp/tmp7w3oi_j8/sample-folder/foo/quotes.md', - }, - 'checksums': {'type': git.GitType.TREE, - 'name': b'foo', - 'sha1_git': b'+A\xc4\x0f\r\x1f\xbf\xfc\xba\x12I}\xb7\x1f\xba\x83\xfc\xca\x96\xe5', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/foo', - 'perms': git.GitPerm.TREE} - }, - b'/tmp/tmp7w3oi_j8/sample-folder/foo/barfoo': { - 'checksums': {'name': b'barfoo', - 'sha1': b'\x90W\xeem\x01bPn\x01\xc4\xd9\xd5E\x9az\xdd\x1f\xed\xac7', # noqa - 'data': b'bar/barfoo', - 'sha1_git': b'\x81\x85\xdf\xb2\xc0\xc2\xc5\x97\xd1ou\xa8\xa0\xc3vhV|=~', # noqa - 'blake2s256': b'\xe1%/,\xaaJre\x936\xedZ\xd6\x03\xd3>\x83\xc8;\xa4\xe1KF\xf9\xb8\xa8\x0b', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/foo/quotes.md', # noqa - 'sha256': b'\xca\xca\x94*\xed\xa7\xb3\x08\x85\x9e\xb5o\x90\x9e\xc9m\x07\xa4\x99I\x16\x90\xc4S\xf7;\x98\x00\xa9;\x16Y', # noqa - 'perms': git.GitPerm.BLOB, - 'type': git.GitType.BLOB, - 'length': 66} - }, - } - - -class TestObjectsPerType(WithSampleFolderChecksums, unittest.TestCase): - @istest - def objects_per_type_blob(self): - # given - expected_blobs = [ - { - 'name': b'another-quote.org', - 'sha1': b'\x90\xa6\x13\x8b\xa5\x99\x15&\x1e\x17\x99H8j\xa1\xcc*\xa9"\n', # noqa - 
'sha1_git': b'\x136\x93\xb1%\xba\xd2\xb4\xac1\x855\xb8I\x01\xeb\xb1\xf6\xb68', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo/another-quote.org', # noqa - 'sha256': b'=\xb5\xae\x16\x80U\xbc\xd9:M\x08(]\xc9\x9f\xfe\xe2\x883\x03\xb2?\xac^\xab\x85\x02s\xa8\xeaUF', # noqa - 'perms': git.GitPerm.BLOB, - 'type': git.GitType.BLOB, - 'length': 72 - }, - { - 'name': b'link-to-binary', - 'sha1': b'\xd0$\x87\x14\x94\x8b:H\xa2T8#*o\x99\xf01\x8fY\xf1', - 'data': b'some-binary', - 'sha1_git': b'\xe8kE\xe58\xd9\xb6\x88\x8c\x96\x9c\x89\xfb\xd2*\x85\xaa\x0e\x03f', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/link-to-binary', - 'sha256': b'\x14\x12n\x97\xd8?}&\x1cZh\x89\xce\xe76\x19w\x0f\xf0\x9e@\xc5I\x86\x85\xab\xa7E\xbe\x88.\xff', # noqa - 'perms': git.GitPerm.LINK, - 'type': git.GitType.BLOB, - 'length': 11 - }, - { - 'name': b'link-to-another-quote', - 'sha1': b'\xcb\xee\xd1^yY\x9c\x90\xdes\x83\xf4 \xfe\xd7\xac\xb4\x8e\xa1q', # noqa - 'data': b'bar/barfoo/another-quote.org', - 'sha1_git': b'}\\\x08\x11\x1e!\xc8\xa9\xf7\x15@\x93\x99\x98U\x16\x837_\xad', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/link-to-another-quote', # noqa - 'sha256': b'\xe6\xe1}\x07\x93\xaau\n\x04@\xeb\x9a\xd5\xb8\x0b%\x80vc~\xf0\xfbh\xf3\xac.Y\xe4\xb9\xac;\xa6', # noqa - 'perms': git.GitPerm.LINK, - 'type': git.GitType.BLOB, - 'length': 28 - }, - { - 'name': b'link-to-foo', - 'sha1': b'\x0b\xee\xc7\xb5\xea?\x0f\xdb\xc9]\r\xd4\x7f<[\xc2u\xda\x8a3', # noqa - 'data': b'foo', - 'sha1_git': b'\x19\x10(\x15f=#\xf8\xb7ZG\xe7\xa0\x19e\xdc\xdc\x96F\x8c', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/link-to-foo', - 'sha256': b',&\xb4kh\xff\xc6\x8f\xf9\x9bE<\x1d0A4\x13B-pd\x83\xbf\xa0\xf9\x8a^\x88bf\xe7\xae', # noqa - 'perms': git.GitPerm.LINK, - 'type': git.GitType.BLOB, - 'length': 3 - }, - { - 'name': b'some-binary', - 'sha1': b'\x0b\xbc\x12\xd7\xf4\xa2\xa1[\x14=\xa8F\x17\xd9\\\xb2#\xc9\xb2<', # noqa - 'sha1_git': b'hv\x95y\xc3\xea\xad\xbeUSy\xb9\xc3S\x8ef(\xba\xe1\xeb', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/some-binary', - 'sha256': b'\xba\xc6P\xd3Jv8\xbb\n\xebSBdm$\xe3\xb9\xadkD\xc9\xb3\x83b\x1f\xaaH+\x99\n6}', # noqa - 'perms': git.GitPerm.EXEC, - 'type': git.GitType.BLOB, - 'length': 5 - }, - { - 'name': b'barfoo', - 'sha1': b'\x90W\xeem\x01bPn\x01\xc4\xd9\xd5E\x9az\xdd\x1f\xed\xac7', # noqa - 'data': b'bar/barfoo', - 'sha1_git': b'\x81\x85\xdf\xb2\xc0\xc2\xc5\x97\xd1ou\xa8\xa0\xc3vhV|=~', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/foo/barfoo', - 'sha256': b')\xad?W%2\x1b\x94\x032\xc7\x8e@6\x01\xaf\xffa\xda\xea\x85\xe9\xc8\x0bJpc\xb6\x88~\xadh', # noqa - 'perms': git.GitPerm.LINK, - 'type': git.GitType.BLOB, - 'length': 10 - }, - { - 'name': b'rel-link-to-barfoo', - 'sha1': b'\xdcQ"\x1d0\x8f:\xeb\'T\xdbH9\x1b\x85h|(i\xf4', - 'data': b'../bar/barfoo', - 'sha1_git': b'\xac\xac2m\xddc\xb0\xbcp\x84\x06Y\xd4\xacCa\x94\x84\xe6\x9f', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/foo/rel-link-to-barfoo', # noqa - 'sha256': b'\x80\x07\xd2\r\xb2\xaf@C_B\xdd\xefK\x8a\xd7k\x80\xad\xbe\xc2k$\x9f\xdf\x04s5?\x8d\x99\xdf\x08', # noqa - 'perms': git.GitPerm.LINK, - 'type': git.GitType.BLOB, - 'length': 13 - }, - { - 'name': b'quotes.md', - 'sha1': b'\x1b\xf0\xbbr\x1a\xc9,\x18\xa1\x9b\x13\xc0\xeb=t\x1c\xbf\xad\xeb\xfc', # noqa - 'sha1_git': b'|LW\xba\x9f\xf4\x96\xad\x17\x9b\x8fe\xb1\xd2\x86\xed\xbd\xa3L\x9a', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/foo/quotes.md', - 'sha256': b'\xca\xca\x94*\xed\xa7\xb3\x08\x85\x9e\xb5o\x90\x9e\xc9m\x07\xa4\x99I\x16\x90\xc4S\xf7;\x98\x00\xa9;\x16Y', # noqa - 
'perms': git.GitPerm.BLOB, - 'type': git.GitType.BLOB, - 'length': 66 - }, - ] - - expected_sha1_blobs = set( - ((c['sha1_git'], git.GitType.BLOB) for c in expected_blobs)) - - # when - actual_sha1_blobs = set( - ((c['sha1_git'], c['type']) - for c in git.objects_per_type(git.GitType.BLOB, self.objects))) - - # then - self.assertEqual(actual_sha1_blobs, expected_sha1_blobs) - - @istest - def objects_per_type_tree(self): - def _children_hashes(path, objects=self.objects): - return set((c['sha1_git'] - for c in git.children_hashes( - objects[path]['children'], objects))) - - expected_trees = [ - { - 'type': git.GitType.TREE, - 'name': b'tmp7w3oi_j8', - 'sha1_git': b'\xa7A\xfcM\x96\x8c{\x8e<\x94\xff\x86\xe7\x04\x80\xc5\xc7\xe5r\xa9', # noqa - 'path': b'/tmp/tmp7w3oi_j8', - 'perms': git.GitPerm.TREE, - # we only add children's sha1_git here, in reality, - # it's a full dict of hashes. - 'children': _children_hashes(b'/tmp/tmp7w3oi_j8') - }, - { - 'type': git.GitType.TREE, - 'name': b'sample-folder', - 'sha1_git': b'\xe8\xb0\xf1Fj\xf8`\x8c\x8a?\xb9\x87\x9d\xb1r\xb8\x87\xe8\x07Y', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder', - 'perms': git.GitPerm.TREE, - 'children': _children_hashes( - b'/tmp/tmp7w3oi_j8/sample-folder') - }, - { - 'type': git.GitType.TREE, - 'name': b'empty-folder', - 'sha1_git': b'K\x82]\xc6B\xcbn\xb9\xa0`\xe5K\xf8\xd6\x92\x88\xfb\xeeI\x04', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/empty-folder', - 'perms': git.GitPerm.TREE, - 'children': _children_hashes( - b'/tmp/tmp7w3oi_j8/sample-folder/empty-folder') - }, - { - 'type': git.GitType.TREE, - 'name': b'bar', - 'sha1_git': b'<\x1fW\x83\x94\xf4b?t\xa0\xba\x7f\xe7ar\x9fY\xfcn\xc4', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/bar', - 'perms': git.GitPerm.TREE, - 'children': _children_hashes( - b'/tmp/tmp7w3oi_j8/sample-folder/bar') - }, - { - 'type': git.GitType.TREE, - 'name': b'barfoo', - 'sha1_git': b'\xc3\x02\x0fk\xf15\xa3\x8cm\xf3\xaf\xeb_\xb3\x822\xc5\xe0p\x87', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo', - 'perms': git.GitPerm.TREE, - 'children': _children_hashes( - b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo'), - }, - { - 'type': git.GitType.TREE, - 'name': b'foo', - 'sha1_git': b'+A\xc4\x0f\r\x1f\xbf\xfc\xba\x12I}\xb7\x1f\xba\x83\xfc\xca\x96\xe5', # noqa - 'path': b'/tmp/tmp7w3oi_j8/sample-folder/foo', - 'perms': git.GitPerm.TREE, - 'children': _children_hashes( - b'/tmp/tmp7w3oi_j8/sample-folder/foo') - }, - ] - expected_sha1_trees = list( - ((c['sha1_git'], git.GitType.TREE, c['children']) - for c in expected_trees)) - - # when - actual_sha1_trees = list( - ((c['sha1_git'], c['type'], _children_hashes(c['path'])) - for c in git.objects_per_type(git.GitType.TREE, self.objects))) - - self.assertEquals(len(actual_sha1_trees), len(expected_sha1_trees)) - for e in actual_sha1_trees: - self.assertTrue(e in expected_sha1_trees) - - -class TestComputeHashesFromDirectory(WithSampleFolderChecksums, - GitHashWalkArborescenceTree, - unittest.TestCase): - - def __adapt_object_to_rootpath(self, rootpath): - def _replace_slash(s, - rootpath=self.rootkey, - newrootpath=rootpath): - return s.replace(rootpath, newrootpath) - - def _update_children(children): - return set((_replace_slash(c) for c in children)) - - # given - expected_objects = {} - for path, v in self.objects.items(): - p = _replace_slash(path) - v['checksums']['path'] = _replace_slash(v['checksums']['path']) - v['checksums']['name'] = os.path.basename(v['checksums']['path']) - if 'children' in v: - v['children'] = 
_update_children(v['children']) - expected_objects[p] = v - - return expected_objects - - @istest - def compute_hashes_from_directory_default(self): - # given - expected_objects = self.__adapt_object_to_rootpath(self.tmp_root_path) - - # when - actual_hashes = git.compute_hashes_from_directory(self.tmp_root_path) - - # then - self.assertEquals(actual_hashes, expected_objects) - - @istest - def compute_hashes_from_directory_no_empty_folder(self): - # given - def _replace_slash(s, - rootpath=self.rootkey, - newrootpath=self.tmp_root_path): - return s.replace(rootpath, newrootpath) - - expected_objects = self.__adapt_object_to_rootpath(self.tmp_root_path) - - # when - actual_hashes = git.compute_hashes_from_directory( - self.tmp_root_path, - remove_empty_folder=True) - - # then - - # One folder less, so plenty of hashes are different now - self.assertNotEquals(actual_hashes, expected_objects) - keys = set(actual_hashes.keys()) - - assert (b'/tmp/tmp7w3oi_j8/sample-folder/empty-folder' - in self.objects.keys()) - new_empty_folder_path = _replace_slash( - b'/tmp/tmp7w3oi_j8/sample-folder/empty-folder') - self.assertNotIn(new_empty_folder_path, keys) - - self.assertEqual(len(keys), len(expected_objects.keys()) - 1) - - @istest - def compute_hashes_from_directory_ignore_some_folder(self): - # given - def _replace_slash(s, - rootpath=self.rootkey, - newrootpath=self.tmp_root_path): - return s.replace(rootpath, newrootpath) - - ignore_path = b'/tmp/tmp7w3oi_j8/sample-folder' - - # when - actual_hashes = git.compute_hashes_from_directory( - self.tmp_root_path, - dir_ok_fn=lambda dirpath: b'sample-folder' not in dirpath) - - # then - - # One entry less, so plenty of hashes are different now - keys = set(actual_hashes.keys()) - - assert ignore_path in self.objects.keys() - - new_ignore_path = _replace_slash(ignore_path) - self.assertNotIn(new_ignore_path, keys) - - # top level directory contains the folder to ignore - self.assertEqual(len(keys), 1) diff --git a/swh/model/tests/test_git_slow.py b/swh/model/tests/test_git_slow.py deleted file mode 100644 index ac5f63e..0000000 --- a/swh/model/tests/test_git_slow.py +++ /dev/null @@ -1,404 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.model import hashutil - -from swh.model import git - - -_type_to_git_type = { - 'blob': git.GitType.BLOB, - 'tree': git.GitType.TREE, -} - - -_perms_to_git_perm = { - '100644': git.GitPerm.BLOB, - '120000': git.GitPerm.LINK, - '040000': git.GitPerm.TREE, - '100755': git.GitPerm.EXEC -} - - -def to_bytes(path): - """Convert the string to bytes. 
- - """ - return path.encode('utf-8', errors='surrogateescape') - - -def to_hash_data_entry(ls_tree_format_input_line): - def prepare_str(s): - return s.strip().replace('\t', ' ').replace(' ', ' ') - - prepared_str = prepare_str(ls_tree_format_input_line) - perms, type, sha1_git, name = prepared_str.split(' ') - return {'perms': _perms_to_git_perm[perms], - 'name': to_bytes(name), - 'type': _type_to_git_type[type], - 'sha1_git': bytes.fromhex(sha1_git)} - - -def to_hash_data(path, ls_tree_format_input): - entry_lines = ls_tree_format_input.strip().split('\n') - return {path: list(map(to_hash_data_entry, entry_lines))} - - -def compute_tree_hash(dirpath, ls_tree_format_input, hex_output): - hashes = to_hash_data(dirpath, ls_tree_format_input) - bin_hash = git.compute_directory_git_sha1(dirpath, hashes) - return hashutil.hash_to_hex(bin_hash) - - -@attr('slow') -class GitHashTreelib(unittest.TestCase): - def setUp(self): - self.to_checks = { - 'e8014cb75cfe9fdb4603ce869eeeb12c53e646d9': """ -040000 tree a1e4db2944541e47088e72830464c2ffd3935f47 testing -040000 tree f9375bba7c6d1aabec5ff90b0af53af526b7fc0d obsolete -100644 blob 1fafc4b0753b4eedf0bc00351286ff864745ab07 README -040000 tree 30d8382c42e9fd66f332d2bebfa44d044afe9d95 removed -040000 tree f3b14ca3821d7d2839713925642261e892270c88 stable - """, - '30d8382c42e9fd66f332d2bebfa44d044afe9d95': """ -100644 blob a173aecc2f18aedddf1c9882808654febffe0d20 net_dma -100644 blob 0020c49933c45ab0b61cd7e57fa9b4baa672d3c0 devfs -100644 blob c2310b6676f4c78be0a8f8b46ed45a126ca5e57a dv1394 -100644 blob 3243613bc2d2095c86fdd878236dfe08ed0cfe84 ip_queue -100644 blob 20c91adca6d412102dabf73d6b6f387a60d888ec o2cb -100644 blob ec333e67632266a935daa6e2124744c09caa8d77 raw1394 -100644 blob c39c25aee77b13e6d92e46686000ac2d8978da51 video1394 - """, - - 'f3b14ca3821d7d2839713925642261e892270c88': """ -100644 blob 16d030827368b2c49cbbe396588635dfa69d6c08 firewire-cdev -100644 blob 5eb1545e0b8d2aea38138d8ff43f4045a6b6f729 o2cb -100644 blob c3ae3e7d6a0ccdddedcc61db54910fef59dd54d3 syscalls -100644 blob 964c7a8afb268ae004364b0d71117efa51261dc3 sysfs-acpi-pmprofile -100644 blob 41e5a0cd1e3ed334234c4f3e9e3db1e2fa021dfc sysfs-bus-firewire -100644 blob 831f15d9672f29e90cca5650356d2f69599e14b8 sysfs-bus-usb -100644 blob 140d85b4ae92faff6d3646735b04974c530c604b sysfs-bus-w1 -100644 blob 3d5951c8bf5fe8b27f47b016289c910f90af97e6 sysfs-bus-xen-backend -100644 blob 70302f370e7ec1c1d46e4d278f41319e1ce536c1 sysfs-class-backlight -100644 blob 097f522c33bb7b5c3632a9ca200e099fea32b2cf sysfs-class-rfkill -100644 blob 26579ee868c9374ba92d3c1121c94310aacc38b4 sysfs-driver-w1_ds28e04 -100644 blob 9f790eebb5d2b0f4d35c317073c72791b41a20b3 sysfs-class-tpm -100644 blob 18d471d9faea9bdec594a5bada594b4062ab66fb sysfs-class-ubi -100644 blob 85d3dac2e204cfb649969ec6f7570522fb59ed4a sysfs-class-udc -100644 blob 43f78b88da28beaa556b3bba509f1ac97fa44c16 sysfs-devices -100644 blob 5b2d0f08867cd899df072f89a059995944fb8eec sysfs-devices-node -100644 blob 33c133e2a631a0d390353f76e4ad0697a568c60f sysfs-devices-system-cpu -100644 blob caa311d59ac1d24c92643f37b396407a1ab654f0 sysfs-devices-system-xen_memory -100644 blob 7049a2b5035950f3d08dc9e8595a7d40e73036e6 sysfs-driver-ib_srp -100644 blob 9a59d84497edb7c7600e546542b3f4dfbccbe1d2 sysfs-driver-qla2xxx -100644 blob e960cd027e1e9685a83f3275ca859da7a793e116 sysfs-driver-usb-usbtmc -100644 blob e928def14f28c7487e7a319f94a9c1527aaecd8d sysfs-driver-w1_ds28ea00 -100644 blob 5def20b9019e93299ed111d53c338e705b1e2639 sysfs-firmware-efi-vars -100644 blob 
32fe7f5c488069c64b8c37951b6dfcfa90f4eb57 sysfs-firmware-opal-dump -100644 blob e1f3058f5954d062796d12feb153d5d025c38495 sysfs-firmware-opal-elog -100644 blob 6272ae5fb36699b9f47276c85ec313185e43a9cf sysfs-module -100644 blob 636e938d5e33a4e9331a328a05a6b93a0b538e60 sysfs-bus-vmbus -100644 blob ec7af69fea0afd9fe57f6600adc6b9be8fceb90d sysfs-transport-srp -100644 blob 9723e8b7aeb3125b352b75bc54a0ad0ea7aa2474 thermal-notification -100644 blob 7cdfc28cc2c6d93b861d6ec3acb05bc5aca8bc70 vdso - """, # NOQA - - '367b37ab86e8066a46ed8ed81b37e78138aeb7d5': """ - 100644 blob 8b7c72f07c92fe87cc7170ecc4fd1edf80fe7791 .gitignore - 100644 blob 06871b0c08a6e9fb5d38f5b1e4d5dfb90135f2f2 Makefile - 100755 blob 8f2629b41c5f5167be37fd3e7dee74dc9b67d2a6 micctrl - 100755 blob 582aad4811ae802844ebeb37d51cc9a1ffec68a8 mpss - 100644 blob 3c5c379fc29d6797d0ce17a837cbda64278f68b3 mpssd.c - 100644 blob f5f18b15d9a057cc6e8d5d1b007424da4d765c0b mpssd.h - 100644 blob 8dd32693608357df350619a8da6668fb3241afd9 sysfs.c - """, - '1f4fa162adf287b4fa3fb762cf54dafc0e671f57': """ -100644 blob cd077ca0e1b86dfba53b3bd2d0fa62724eb24eb4 00-INDEX -040000 tree e8014cb75cfe9fdb4603ce869eeeb12c53e646d9 ABI -100644 blob 65022a87bf17902f9e04fe5ecff611a41ffaf4d8 BUG-HUNTING -100644 blob f447f0516f074c700b0c78ca87fcfcf4595ea49f Changes -100644 blob 1684d0b4efa65880a36d0fb00cc5bff747c3e83a CodeOfConflict -100644 blob c06f817b3091cdb6e4be6e91dbbb98210177b370 CodingStyle -100644 blob 55b70b903ead2e95ce1226ef0fec3612bea67189 DMA-API-HOWTO.txt -100644 blob edccacd4f048a13e8afdb63db7d98ad41667a503 DMA-API.txt -100644 blob b1a19835e9070dbec2f6dba3d735b8cda23abd6e DMA-ISA-LPC.txt -100644 blob 18dc52c4f2a0b13a42d9867c36c94f4774bf58e2 DMA-attributes.txt -040000 tree 57c2bd8f00655df1d9ecbeab3a6b265279ae433a DocBook -040000 tree 902e0d5f0930c22be9b4b6dfe984fe6048626784 EDID -100644 blob 21152d397b88ecbe45bca161444fcee38158e96b HOWTO -100644 blob 31d1d658827f082f66c88c3147e99be3321635cf IPMI.txt -100644 blob 01a675175a3674ef88a08ebb4f430dca3a4e4ec2 IRQ-affinity.txt -100644 blob 3a8e15cba816a4ea16fb0208518046214ebff1e6 IRQ-domain.txt -100644 blob 1011e717502162c63a04245169ac05d8f96a895a IRQ.txt -100644 blob 7b57fc087088f49756eeb8eaabf403bfbbd92b93 Intel-IOMMU.txt -100644 blob bc0548201755e1a8d29614bccbd78fcbbe5a34ae Makefile -100644 blob a211ee8d8b447354ac3758d2f6f50b901aa41ea0 ManagementStyle -040000 tree 5dc5d1e6756e3547edf8fd663f81ca353545df9d PCI -040000 tree 7bb4565fcf075c6906c4256b4aab7915c4779ee8 RCU -100644 blob 74be14679ed891820cd9c3a7393007f8dd21d07d SAK.txt -100644 blob 561826f82093574bc61d887cae0436935d317c5e SM501.txt -100644 blob a660d494c8edcf9fc9bbaec9887ac6203bfcd60e SecurityBugs -100644 blob 2b7e32dfe00d95fadabc535372bea6ba343fdc59 SubmitChecklist -100644 blob 31d372609ac00fb715a66174214d10f2ba673520 SubmittingDrivers -100644 blob fd89b04d34f038bafd1485a8f96869828470f619 SubmittingPatches -100644 blob 70acfbf399ebfb86f975ada4b8fbc2055b0ba673 VGA-softcursor.txt -040000 tree bc7ec048cf540e56c5ba839ec9d85bd6eff3f2eb accounting -040000 tree 3f095916076e489cc63a253019e1a73693f3d3b9 acpi -100644 blob cc2d4ac4f4042b7938e38f9f11970669292839a6 adding-syscalls.txt -040000 tree 9850a7627679a34f8228c0abe8d06bcb4421f784 aoe -100644 blob 77df55b0225ab331bb7268592fa5d18ed8f909c7 applying-patches.txt -040000 tree 35fa24f995536c9d2bcf20c5f842bcc45ce83c86 arm -040000 tree adf0f8637dc105841caeabe57ed9e631802d17fb arm64 -100644 blob 2f2c6cdd73c0c24ab29dcd3f68034f99f17c3125 assoc_array.txt -100644 blob b19fc34efdb17921af43bda0000b13dc82640451 atomic_ops.txt 
-040000 tree 33c1cd21f36a02c691570dc7dcddf41d8331705d auxdisplay -040000 tree d6260d3558e94171cfa60b420c8df17a86cc7809 backlight -100644 blob df84162132028d6771fc0da0649f54158bdac93c bad_memory.txt -100644 blob 8764e9f70821e4f894551f1fb1b98a881f3d3e9d basic_profiling.txt -100644 blob 32b6c3189d9826a53875ae6dc51ce62e9b86778b bcache.txt -100644 blob 6b1de70583715d7728a7a31b4612564b0178679b binfmt_misc.txt -040000 tree cd97febccb0fad00d0d61f0502f6e45c91ed06bf blackfin -040000 tree 8bbf8033be7139c9897791b4c6ec6611e83de346 block -040000 tree dba91c80d3182baeb0a0ab56d13e49fd785ebec9 blockdev -100644 blob d0d042c2fd5e9e319657117b3de567b2d42a995a braille-console.txt -100644 blob d8297e4ebd265eb5dd273bad20162e51d369b25a bt8xxgpio.txt -100644 blob 34916a46c0997dd58e1922a48e08038aab930e02 btmrvl.txt -040000 tree 39641366356afa81c2a52aceeb914f2566c1f4ca bus-devices -100644 blob 2bc55ff3b4d1e2db24906a41ba71e7da8b900688 bus-virt-phys-mapping.txt -100644 blob 3f9f808b51198b3f6278621b413c872f2b0a494f cachetlb.txt -040000 tree 8e44d0125c48edbffce58fa03aeaac213868d1ab cdrom -040000 tree 4d3a7398a2edaa5039706c89a4d7de65a3179282 cgroups -100644 blob 88951b179262a912fcddf16872f302cf117ca4ba circular-buffers.txt -100644 blob 5c4bc4d01d0c32939af28b3c0044f1700231d4a1 clk.txt -040000 tree 0f0536d144e4d4b9547db48a45a007dfe207e293 cma -100644 blob 7f773d51fdd91acf10e49875abbe66fff0fae767 coccinelle.txt -040000 tree a556d57f754fbaa46c7d0906ebec131e32eb6376 connector -040000 tree 2db84b37022f7520c0c6bbfeec02c546ba553b46 console -040000 tree 11e08c481fb1b35e5faecf7cb926f3d4efe78f87 cpu-freq -100644 blob f9ad5e048b111297549df37cc6a6fc8bff1fc75a cpu-hotplug.txt -100644 blob 287224e57cfc5d2e75540e7c99cdd9e3f763ff7e cpu-load.txt -040000 tree 49738b4d2357cb08e9f1368e984815daab99dacd cpuidle -100644 blob 12b1b25b4da9711c95ab013adf1bec4214964d2c cputopology.txt -100644 blob a08a7dd9d6255867e88b1ccc51ef820eb635286c crc32.txt -040000 tree 7737f93e00f6311425f8d52af5ab63dd8bb26d64 cris -040000 tree b2e8f35053e829bb602b71dc937a89c5f4b23c57 crypto -100644 blob e1c52e2dc361607417693946573d8959c7e01b81 dcdbas.txt -100644 blob 172ad4aec493cbe9a9db3b6193a43d8794b231e6 debugging-modules.txt -100644 blob 03703afc4d302e7eeb7fb4031d494ab750233194 debugging-via-ohci1394.txt -100644 blob d262e22bddec06945136bbec0e25826ef2df696e dell_rbu.txt -040000 tree bc28bfb6c3c0e63023b704090acb200fe2bdb1c1 development-process -040000 tree adccded12cbd61b0f37fd603d09b99df8881cc7e device-mapper -100644 blob 87b4c5e82d39023094f9b5f9b10cf919e3740f9d devices.txt -040000 tree 64cd52d94d3e083b1c18cc633552b2550cf23e74 devicetree -100644 blob 3f682889068bf932052737b57071ce715c851eda digsig.txt -100644 blob 480c8de3c2c44786174e112795f61b2381d3b09f dma-buf-sharing.txt -040000 tree a75e8c5eb06d2fc0b39427f20afd694f7e30e25a dmaengine -100644 blob 9de9813d0ec5df101a48428d40cfc9b9d2df6142 dontdiff -040000 tree 213f8c902440f1b0d512b6d0f20252c028828556 driver-model -040000 tree 0ebe2f7c24011ba6c1bae528431dc2c8f11889fc dvb -100644 blob 9417871b8758f26479e9c90e90a990988d657e8a dynamic-debug-howto.txt -040000 tree 020529dc9d406d453d30c463702d35e9ee2eef6d early-userspace -100644 blob 0cf27a3544a5744f39c232c75039a37ca079c2cd edac.txt -100644 blob 7747024d3bb70023fbff500cd3fc44546b31511b efi-stub.txt -100644 blob a55e4910924ea98b71969381b47ec16d922ecbdc eisa.txt -100644 blob 3fa450881ecb8e294a74d17766538804489fe9fd email-clients.txt -040000 tree 461c382186d40395ee88eba82b2ba8764285a35f extcon -040000 tree 475212bb9f2a96518b4da5f3fec8fe641e88c7e3 fault-injection -040000 tree 
4362119fa45f8ef6c411d2a269178f3bf1b7ed35 fb -040000 tree 8abbff52bbacd5c4251af71bc2e30fd497b5feb0 features -040000 tree 9e2856c144a66c8283dcd3f652edddac59e691bd filesystems -040000 tree aba7ab22ac20ede93689312a30310a5aa6793178 firmware_class -100644 blob df904aec99044f8056ac530b9e9dc6de8f26f73e flexible-arrays.txt -040000 tree d4351d91b41949608f281d285520cc06b2b9d4fa fmc -040000 tree 2368701db45cbe838bc4721bde6ebcbab27b7737 frv -100644 blob 77b36f59d16b452bbf12bba4e3db83ec3ea84a9f futex-requeue-pi.txt -100644 blob 7b727783db7ed4f87a7c68b44b52054c62f48e85 gcov.txt -100644 blob 7050ce8794b9a4b3dd93b76dd9e2a6d708b468ee gdb-kernel-debugging.txt -040000 tree bcbdeb421fc8f6bfafa6a770cdbd6815eace6985 gpio -040000 tree ceb5de1b9b291962ccbac05db7a66b6b84a2c802 hid -100644 blob 6bad6f1d1cac4c16e513c491a5a6fb6df0c94786 highuid.txt -100644 blob 6ac6cd51852af538efe38be0147fd585d14601a9 hsi.txt -100644 blob 026e237bbc875ac0401cffaf33376e784da9a0b2 hw_random.txt -040000 tree 0fd3a6b83e05058c3e8396a6f5e0d6d8e740492a hwmon -100644 blob 61c1ee98e59f2137b8b250d2b469d4d949cca9b3 hwspinlock.txt -040000 tree eac8d0f964d8511d9cf9d1dcced3f3b54ce65c54 i2c -040000 tree dbc729c5c0ad5e8c3b0921948a31695e2667dbdb ia64 -040000 tree 75c7964c0da70c8fb033064f7503e037a181cde1 ide -040000 tree 11cf0e775bfe35ea324fac18f8b6e7882edc1e35 infiniband -100644 blob 535ad5e82b98cb5ed2adad76afc03be347b3af36 init.txt -100644 blob 4e1839ccb555e32c7fc3915dd4a76a0f3664b26f initrd.txt -040000 tree 7d27d4c0f1e283e3435b24f7a3c9d1a4dc1a8bbc input -100644 blob 91d89c540709876eadba970228d317faa2dd2153 intel_txt.txt -100644 blob 5ca78426f54c58d10e3fd0030ad51f6ccb2b5b9b io-mapping.txt -100644 blob 9faae6f26d3227d1799eae90e51471f00b82398d io_ordering.txt -040000 tree 75305cae2df1b51232f7e663a9d44f8d0a615fbf ioctl -100644 blob 65f694f2d1c9461c39f2ee71de4f24c7ddc62b02 iostats.txt -100644 blob f6da05670e16d9dcfc3f8b7d50a1a4291ad8a974 irqflags-tracing.txt -100644 blob 400d1b5b523dd8b80d3b5dfbeaf7962611ffd06a isapnp.txt -040000 tree 6d8fbb1e1d7bf73bd985dbc098ba953ce06db085 isdn -040000 tree 3bcb74b2add6f724ab7f76133dc4471770e03c4d ja_JP -100644 blob 418020584ccc171b8ff079e496e73383f0f55c29 java.txt -100644 blob 0d32355a4c348ce18cf4540e61a129b4cf2ac3fb kasan.txt -040000 tree 3e92f27cedbc6a0b52e06e4ba11e57e76826f402 kbuild -040000 tree b508edd7ad1443bff47fc4ac1f843c84abbaaeb1 kdump -100644 blob 78f69cdc9b3fbcec6f32beb179eb4c8732883d5a kernel-doc-nano-HOWTO.txt -100644 blob eda1eb1451a0881097bfaa8ad76c18acd6945f36 kernel-docs.txt -100644 blob 22a4b687ea5b4b3cb9d576bfeffaed813256a795 kernel-parameters.txt -100644 blob f4cbfe0ba1085b4df3067dcc457219699c5c6150 kernel-per-CPU-kthreads.txt -100644 blob 80aae85d8da6c1b8476fd6824553ae7070e5c508 kmemcheck.txt -100644 blob 18e24abb3ecf61b1f6a214af921af8bd138b27e4 kmemleak.txt -040000 tree b51cd2dcf225f1004e4d23fd80db32f0de7f8ef3 ko_KR -100644 blob 1be59a3a521c87fd6107fcdf64f7c7ac525d1512 kobject.txt -100644 blob 1f9b3e2b98aec9a6687ae14b4f85d7c143729c07 kprobes.txt -100644 blob ddf85a5dde0c12a435b9cbcc30f44159de5acc0b kref.txt -100644 blob a87d840bacfe11df785995eaee5698f23d565f94 kselftest.txt -040000 tree 652f991d106263d2c68500cf5ad896612945c2b9 laptops -100644 blob 4f80edd14d0a688d2a4cf1cdc491102601a53b9a ldm.txt -040000 tree 4839303afa967a2104cdaf8aeff6030f27e2b932 leds -100644 blob 407576a233177c3c336827b952872c082207d9e4 local_ops.txt -040000 tree 307372f9d9d08902e22d22034081806aa2fdd6b3 locking -100644 blob 22dd6af2e4bd42152edbe872b224b85a769e7184 lockup-watchdogs.txt -100644 blob 
2eae75fecfb965f49065c680063a40c594736ee5 logo.gif -100644 blob 296f0f7f67eb2d73be7ec80106feaf77c5aac163 logo.txt -100644 blob ea45dd3901e3bfa2363bbe7a7009e0fc19809bfd lzo.txt -040000 tree c40b2eebc8f4266f6374c41dfa30d29d86bb57ea m68k -100644 blob 28befed9f6102a094702337a229b78c16a94bcde magic-number.txt -100644 blob 7ed371c852046b3dd5d993db1815d00a9d8f4bc0 mailbox.txt -100644 blob 1b794369e03a4ef14099f4ce702fc0d7c65140c6 md-cluster.txt -100644 blob 1a2ada46aaedae5162499886ec7c532d80c84b82 md.txt -100644 blob f552a75c0e70b22b3800a3fa93c0783075228250 media-framework.txt -100644 blob 2ba8461b0631de759fefd2a12918a6c4f4ee7562 memory-barriers.txt -040000 tree d2fdb444074b09b83d1f74b2a190325606e3f31c memory-devices -100644 blob ce2cfcf35c27a0d0972547e82f61fbc38c85b5ab memory-hotplug.txt -100644 blob 30ded732027e2814ccc8c4cf5690a84fbc8ebc30 men-chameleon-bus.txt -040000 tree f0b23005636d2d2e4a4b9f78567895a087610195 metag -040000 tree 29c6681a225b17dbb0cd20b9d73e6d30bb846927 mic -040000 tree 27c1a445222aeb50056defd34a41ea5ba41b7306 mips -040000 tree 11295031a1fb2167d7816e2b4c53272f92489873 misc-devices -040000 tree e45fccc68091d5b9c675558a8667af34923ec594 mmc -040000 tree 1a438a86d22deddb5bf600b21242d0d3c79f0b04 mn10300 -100644 blob a78bf1ffa68cb4c4defe32146fc75f8449a46245 module-signing.txt -100644 blob d01ac60521943756a99bfc07fe8fe05e6775626f mono.txt -040000 tree 3949e1a47604a29499fb37ee66a599004436a00b mtd -040000 tree d674dc07291045530f4b83ce02ec866765990853 namespaces -040000 tree dbc8596c5816529d45d5339601d1ec9ceab2193b netlabel -040000 tree 0303625762b34a4fc5ac065d9aa84c489e8141a3 networking -040000 tree 1f4b88a93381592d6b026ad6ed895cc42c551720 nfc -040000 tree 983c152dbf360507b31e2326bb2a35c66eeddf20 nios2 -100644 blob ae57b9ea0d4169258b48b0531976b1a4a30eabae nommu-mmap.txt -100644 blob 1d9bbabb6c79abb04259b78481f7304abacbaccc ntb.txt -100644 blob 520327790d5431daae3a537d0fd36ec897cde5a8 numastat.txt -040000 tree e11c61ab7124dd21cf150ab4c31bfd1e8fedab88 nvdimm -040000 tree 2d0554d83b8cf9d2d361cc30e9794819658e3f1a nvmem -100644 blob f3ac05cc23e4abb0ea13277fc8a45873351e7ce3 oops-tracing.txt -100644 blob 7ddfe216a0aa787a52421de6dc8ebc0f3b9002b2 padata.txt -040000 tree 6814a2e66f30688c33b20c88907eaf4e2e0f8059 parisc -100644 blob 120eb20dbb09199afc1628a2ca1187812789bde9 parport-lowlevel.txt -100644 blob c208e4366c033d5bc5d1c40b6d055b7c722656d4 parport.txt -040000 tree 8e50ccd74aeee952f963e0d70cea243bd078f22a pcmcia -100644 blob 7d3c82431909dd8120322e2360ce32cbd93f87e5 percpu-rw-semaphore.txt -100644 blob b388c5af9e726fe8fdd2eaec09eb1b9374f16b87 phy.txt -040000 tree ea4f357d526fbce14e0c2879c95a8bbafd7b3d5e phy -100644 blob 9a5bc8651c2923c619b168c1719f1e25e381e368 pi-futex.txt -100644 blob 4976389e432d4dd5207d65ad7c37d407c00d9d87 pinctrl.txt -040000 tree 90cc82c9b546a1c94b1545800b84303562744d1f platform -100644 blob 763e4659bf186fceff80ae17f50e7b495fe3e7b6 pnp.txt -040000 tree 0487c8fa4b60c90fd12de8c9ef7574d749f9ac4b power -040000 tree 1d2f3280d25fca0e5a0f703e82177298911df260 powerpc -040000 tree 591eb3d2ce87db9b11b8e84270dfa59ef49854ee pps -040000 tree 98f3e67e4e4688c5a4e439caed2c6db2ae811d1a prctl -100644 blob e89ce6624af2fab481a708ad1a0e4e20d1bc0c1c preempt-locking.txt -100644 blob 2216eb187c213b4c0c5140a760f9df3098150e41 printk-formats.txt -040000 tree da1837f687e5d470a7907a0ece81c877987fd282 pti -040000 tree 962176c51cfe9f3846ab59aafdcc0f07db4e765a ptp -100644 blob ca895fd211e4e9f5f6bd0fc6a13bf60d9a0c14b2 pwm.txt -100644 blob 5d8675615e59c40c6564710a0a9b73ae060e2a00 ramoops.txt -040000 tree 
d51ed0cdcddfd9bd8bccbe8169ee47b61fcdc756 rapidio -100644 blob 39873ef41bf9fc1a72b8a2e9ace8284babe74abe rbtree.txt -100644 blob ef0219fa4bb4cf5beb9078293a92b3ccbcbe0d48 remoteproc.txt -100644 blob 2ee6ef9a6554d600088ae572b3256ffe44e51d08 rfkill.txt -100644 blob 16eb314f56cc45ce923d9354960bdf67ea4e6b98 robust-futex-ABI.txt -100644 blob af6fce23e4847709d32ddee025cafb055326f171 robust-futexes.txt -100644 blob f7edc3aa1e92d4e2eac9ed143212f9757577f041 rpmsg.txt -100644 blob 8446f1ea1410b87b071047dc310a787a92606c31 rtc.txt -040000 tree c7b9d98141594d46c92b026a63f854017c8039e5 s390 -040000 tree 5d3736128a6ad1ba76f945c4389034f7aa0b5681 scheduler -040000 tree 1d347ab5c9dce9eb05bf5be505afb6529183f5af scsi -040000 tree e8e43eadba479833220bf3fa3d1fbaefe9a17991 security -100644 blob 9a7bc8b3f479b2b82dbfa1056df060366dbafdec serial-console.txt -040000 tree 39133be11e4495c042f2439e984984bec4e63cb6 serial -100644 blob 876c96ae38dba1402e79c11a10ff1c64eb5741fd sgi-ioc4.txt -040000 tree e6a02a1b02f80ba24307f22431ccceb6fb308838 sh -100644 blob 6b492e82b43d98b93020e033ea1b108adbbf6033 smsc_ece1099.txt -040000 tree 887a845d843820c990ab3cc6251d56a864b9fa34 sound -100644 blob eceab1308a8c2fbde6722232db18bbb57a6e7f2e sparse.txt -040000 tree 78f79272aa73a95571b1c2d4ea4702b1eaeecb46 spi -100644 blob db3be892afb2b64ee582a5e43ce87223a1251ad3 stable_api_nonsense.txt -100644 blob 3049a612291b1ad8651da72c6081539bb4e83a74 stable_kernel_rules.txt -100644 blob 477927becacba69ee4bdea2203dd796979d14449 static-keys.txt -100644 blob cd66ec836e4f45aae80754ece6c384cfd2f45b95 svga.txt -040000 tree a9a8db7e58ce0082f02604d6f86ab4dd5f32ff9f sysctl -100644 blob ce60ffa94d2d709681ed339fc4ef25369a2c377d sysfs-rules.txt -100644 blob 13f5619b2203e68af6d766f66a8137dd1133d4fa sysrq.txt -040000 tree 9f25dc697646d3ee9505b920a07e4caaf976345d target -040000 tree 9d4f3319f51b26a7697e109e9d1ba7f435603a5d thermal -100644 blob 2cbf71975381d0a850d1a254aa76af7957b35058 this_cpu_ops.txt -040000 tree 3e4b4130aa6d96892130c0e74d8efedd6874f4e7 timers -040000 tree d1b46a427ea95f8e3e49dac8b035c3970d794e15 tpm -040000 tree db021902c4a4d411ee1b168b4670e490fa7c1b36 trace -100644 blob a445da098bc6e5aa733cd55ca2ee8b4a5f04dc2c unaligned-memory-access.txt -100644 blob 4a33f81cadb10165fad3ca7014f83b54f492a4bb unicode.txt -100644 blob a8643513a5f6cb25851140c021aec4a671c8b62c unshare.txt -040000 tree bc63f554449a02f3f2d80817327846e127b2c0f1 usb -040000 tree 04a86dfd52c143ed1352758c8e93871cf3c67a2c vDSO -100644 blob 1dd3fddfd3a1e536de39b69c37168dfc35553a4a vfio.txt -100644 blob 014423e2824c23fa5b08552e292db52fa25013a7 vgaarbiter.txt -100644 blob e517011be4f964db7b452e1e50420eaed83f143d video-output.txt -040000 tree 0613d846d1dffae70dabcc998a5fdacd7f5b7a4e video4linux -040000 tree bfa10f433ac83ca402ed876f705cb0f4a9e31c75 virtual -040000 tree abe2d8a8bbd0f97a2c5485d6adb62c14113bc3d6 vm -100644 blob ca5b82797f6c5c79c949a38cd7d7c19270035993 vme_api.txt -100644 blob db0cb228d64aa4a80a4fe380be3e46439de810e6 volatile-considered-harmful.txt -040000 tree 06051b06aeeee33b30966fbf0b53b241c6261454 w1 -040000 tree e796cb3b81fab2327d367e17ba75bac24540c59e watchdog -040000 tree b48b24715e6929469eb3e7a96eecf7f00e14a607 wimax -100644 blob 5e0e05c5183e290e8d78c531a3f42bc3c85377f7 workqueue.txt -040000 tree 1390d65651d4d0aab960bf20b55d5562c727a81e x86 -100644 blob 81d111b4dc28e15d3ab7471f8be1b8f42fe63e4c xillybus.txt -040000 tree afee3267cb7f59a0e0236309e27e14985618d523 xtensa -100644 blob 2cf3e2608de324b5622673943807b8e8b353e2da xz.txt -040000 tree 
d9c00fe0c456581fc233ad805191be86b387b605 zh_CN -100644 blob 90a64d52bea2f33464f86e4dc93954b2bc105f50 zorro.txt - """, # NOQA - "e202fc2cf10dcc460aaf469db4cb5379bbe326d8": - """ -100644 blob 5b6e7c66c276e7610d4a73c70ec1a1f7c1003259 COPYING -100644 blob 13248728a1c884756a0e265faf5b679ec27f47bc Copyright -100644 blob d8b02abb7e1a3523a40f8b7cbfb7d05f6fca8557 Makefile.pre -100644 blob 886eacfa48acef07d6d0b5b3b197811ab7775340 README -100755 blob 2a5781c640c10f05d7f194e0f1d24aaa96833e46 configure -040000 tree 656a2f680866edaf80fdfbcc7db503fe06b6772d doc -100644 blob b4d29e3dd5710423b57f388dfec3acd3d04b76f7 es.cwl -100644 blob b883cd6b699486be32abaeeb15eacdfb4d816893 es.dat -100644 blob 4103348bbbbc69ea08f2c970c3e360794137ed8c es.multi -100644 blob c3afb3608574b7afa5364468b5267c0824c8f079 espa\udcf1ol.alias -100644 blob c3afb3608574b7afa5364468b5267c0824c8f079 esponol.alias -100644 blob 7926a11dac0dc13055ed8a4ada14b7985a3332f5 info -100644 blob c3afb3608574b7afa5364468b5267c0824c8f079 spanish.alias -""" - } # NOQA - - @istest - def compute_complex_directories_git_sha1(self): - for sha1 in self.to_checks.keys(): - sha1_input = self.to_checks[sha1] - self.assertEquals(sha1, compute_tree_hash('some-path', sha1_input, - sha1)) diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py index c9f47e1..1a4f24a 100644 --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -1,243 +1,245 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io +import os import tempfile import unittest from nose.tools import istest from unittest.mock import patch from swh.model import hashutil class Hashutil(unittest.TestCase): def setUp(self): self.data = b'1984\n' self.hex_checksums = { 'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731', 'sha1_git': '568aaf43d83b2c3df8067f3bedbb97d83260be6d', 'sha256': '26602113b4b9afd9d55466b08580d3c2' '4a9b50ee5b5866c0d91fab0e65907311', 'blake2s256': '63cfb259e1fdb485bc5c55749697a6b21ef31fb7445f6c78a' 'c9422f9f2dc8906', } self.checksums = { type: bytes.fromhex(cksum) for type, cksum in self.hex_checksums.items() } self.git_hex_checksums = { 'blob': self.hex_checksums['sha1_git'], 'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0', 'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f', 'tag': 'd6bf62466f287b4d986c545890716ce058bddf67', } self.git_checksums = { type: bytes.fromhex(cksum) for type, cksum in self.git_hex_checksums.items() } @istest def hash_data(self): checksums = hashutil.hash_data(self.data) self.assertEqual(checksums, self.checksums) @istest def hash_data_unknown_hash(self): with self.assertRaises(ValueError) as cm: hashutil.hash_data(self.data, ['unknown-hash']) self.assertIn('Unexpected hashing algorithm', cm.exception.args[0]) self.assertIn('unknown-hash', cm.exception.args[0]) @istest def hash_git_data(self): checksums = { git_type: hashutil.hash_git_data(self.data, git_type) for git_type in self.git_checksums } self.assertEqual(checksums, self.git_checksums) @istest def hash_git_data_unknown_git_type(self): with self.assertRaises(ValueError) as cm: hashutil.hash_git_data(self.data, 'unknown-git-type') self.assertIn('Unexpected git object type', cm.exception.args[0]) self.assertIn('unknown-git-type', cm.exception.args[0]) @istest def hash_file(self): fobj = io.BytesIO(self.data) checksums = hashutil.hash_file(fobj, 
length=len(self.data)) self.assertEqual(checksums, self.checksums) @istest def hash_file_missing_length(self): fobj = io.BytesIO(self.data) with self.assertRaises(ValueError) as cm: hashutil.hash_file(fobj, algorithms=['sha1_git']) self.assertIn('Missing length', cm.exception.args[0]) @istest def hash_path(self): with tempfile.NamedTemporaryFile(delete=False) as f: f.write(self.data) - f.close() - hashes = hashutil.hash_path(f.name) + + hashes = hashutil.hash_path(f.name) + os.remove(f.name) self.checksums['length'] = len(self.data) self.assertEquals(self.checksums, hashes) @istest def hash_to_hex(self): for type in self.checksums: hex = self.hex_checksums[type] hash = self.checksums[type] self.assertEquals(hashutil.hash_to_hex(hex), hex) self.assertEquals(hashutil.hash_to_hex(hash), hex) @istest def hash_to_bytes(self): for type in self.checksums: hex = self.hex_checksums[type] hash = self.checksums[type] self.assertEquals(hashutil.hash_to_bytes(hex), hash) self.assertEquals(hashutil.hash_to_bytes(hash), hash) @istest def hash_to_bytehex(self): for algo in self.checksums: self.assertEqual(self.hex_checksums[algo].encode('ascii'), hashutil.hash_to_bytehex(self.checksums[algo])) @istest def bytehex_to_hash(self): for algo in self.checksums: self.assertEqual(self.checksums[algo], hashutil.bytehex_to_hash( self.hex_checksums[algo].encode())) @istest def new_hash_unsupported_hashing_algorithm(self): try: hashutil._new_hash('blake2:10') except ValueError as e: self.assertEquals(str(e), 'Unexpected hashing algorithm blake2:10, ' 'expected one of blake2b512, blake2s256, ' 'sha1, sha1_git, sha256') @patch('swh.model.hashutil.hashlib') @istest def new_hash_blake2b(self, mock_hashlib): mock_hashlib.new.return_value = 'some-hashlib-object' h = hashutil._new_hash('blake2b512') self.assertEquals(h, 'some-hashlib-object') mock_hashlib.new.assert_called_with('blake2b512') @patch('swh.model.hashutil.hashlib') @istest def new_hash_blake2s(self, mock_hashlib): mock_hashlib.new.return_value = 'some-hashlib-object' h = hashutil._new_hash('blake2s256') self.assertEquals(h, 'some-hashlib-object') mock_hashlib.new.assert_called_with('blake2s256') class HashlibGit(unittest.TestCase): def setUp(self): self.blob_data = b'42\n' self.tree_data = b''.join([b'40000 barfoo\0', bytes.fromhex('c3020f6bf135a38c6df' '3afeb5fb38232c5e07087'), b'100644 blah\0', bytes.fromhex('63756ef0df5e4f10b6efa' '33cfe5c758749615f20'), b'100644 hello\0', bytes.fromhex('907b308167f0880fb2a' '5c0e1614bb0c7620f9dc3')]) self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970 author Antoine R. Dumont (@ardumont) 1444054085 +0200 committer Antoine R. Dumont (@ardumont) 1444054085 +0200 initial """.encode('utf-8') # NOQA self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241 type commit tag 0.0.1 tagger Antoine R. 
Dumont (@ardumont) 1444225145 +0200 blah """.encode('utf-8') # NOQA self.checksums = { 'blob_sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a8453e1' 'e07157b6cd'), 'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db' '121dacdb1c'), 'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399' 'd629189653'), 'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534' 'e9e959f120'), } @istest def unknown_header_type(self): with self.assertRaises(ValueError) as cm: hashutil.hash_git_data(b'any-data', 'some-unknown-type') self.assertIn('Unexpected git object type', cm.exception.args[0]) @istest def hashdata_content(self): # when actual_hash = hashutil.hash_git_data(self.blob_data, git_type='blob') # then self.assertEqual(actual_hash, self.checksums['blob_sha1_git']) @istest def hashdata_tree(self): # when actual_hash = hashutil.hash_git_data(self.tree_data, git_type='tree') # then self.assertEqual(actual_hash, self.checksums['tree_sha1_git']) @istest def hashdata_revision(self): # when actual_hash = hashutil.hash_git_data(self.commit_data, git_type='commit') # then self.assertEqual(actual_hash, self.checksums['commit_sha1_git']) @istest def hashdata_tag(self): # when actual_hash = hashutil.hash_git_data(self.tag_data, git_type='tag') # then self.assertEqual(actual_hash, self.checksums['tag_sha1_git']) diff --git a/swh/model/tests/test_merkle.py b/swh/model/tests/test_merkle.py new file mode 100644 index 0000000..9f43892 --- /dev/null +++ b/swh/model/tests/test_merkle.py @@ -0,0 +1,229 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest + +from swh.model import merkle + + +class TestedMerkleNode(merkle.MerkleNode): + type = 'tested_merkle_node_type' + + def __init__(self, data): + super().__init__(data) + self.compute_hash_called = 0 + + def compute_hash(self): + self.compute_hash_called += 1 + child_data = [ + child + b'=' + self[child].hash + for child in sorted(self) + ] + + return ( + b'hash(' + + b', '.join([self.data['value']] + child_data) + + b')' + ) + + +class TestedMerkleLeaf(merkle.MerkleLeaf): + type = 'tested_merkle_leaf_type' + + def __init__(self, data): + super().__init__(data) + self.compute_hash_called = 0 + + def compute_hash(self): + self.compute_hash_called += 1 + return b'hash(' + self.data['value'] + b')' + + +class TestMerkleLeaf(unittest.TestCase): + def setUp(self): + self.data = {'value': b'value'} + self.instance = TestedMerkleLeaf(self.data) + + def test_hash(self): + self.assertEqual(self.instance.compute_hash_called, 0) + instance_hash = self.instance.hash + self.assertEqual(self.instance.compute_hash_called, 1) + instance_hash2 = self.instance.hash + self.assertEqual(self.instance.compute_hash_called, 1) + self.assertEqual(instance_hash, instance_hash2) + + def test_data(self): + self.assertEqual(self.instance.get_data(), self.data) + + def test_collect(self): + collected = self.instance.collect() + self.assertEqual( + collected, { + self.instance.type: { + self.instance.hash: self.instance.get_data(), + }, + }, + ) + collected2 = self.instance.collect() + self.assertEqual(collected2, {}) + self.instance.reset_collect() + collected3 = self.instance.collect() + self.assertEqual(collected, collected3) + + def test_leaf(self): + with self.assertRaisesRegex(ValueError, 'is a leaf'): + self.instance[b'key1'] = 'Test' + + with 
self.assertRaisesRegex(ValueError, 'is a leaf'):
+            del self.instance[b'key1']
+
+        with self.assertRaisesRegex(ValueError, 'is a leaf'):
+            self.instance[b'key1']
+
+        with self.assertRaisesRegex(ValueError, 'is a leaf'):
+            self.instance.update(self.data)
+
+
+class TestMerkleNode(unittest.TestCase):
+    maxDiff = None
+
+    def setUp(self):
+        self.root = TestedMerkleNode({'value': b'root'})
+        self.nodes = {b'root': self.root}
+        for i in (b'a', b'b', b'c'):
+            value = b'root/' + i
+            node = TestedMerkleNode({
+                'value': value,
+            })
+            self.root[i] = node
+            self.nodes[value] = node
+            for j in (b'a', b'b', b'c'):
+                value2 = value + b'/' + j
+                node2 = TestedMerkleNode({
+                    'value': value2,
+                })
+                node[j] = node2
+                self.nodes[value2] = node2
+                for k in (b'a', b'b', b'c'):
+                    value3 = value2 + b'/' + k
+                    node3 = TestedMerkleNode({
+                        'value': value3,
+                    })
+                    node2[k] = node3
+                    self.nodes[value3] = node3
+
+    def test_hash(self):
+        for node in self.nodes.values():
+            self.assertEqual(node.compute_hash_called, 0)
+
+        # Computing the root hash computes the hash of every node
+        hash = self.root.hash
+        for node in self.nodes.values():
+            self.assertEqual(node.compute_hash_called, 1)
+            self.assertIn(node.data['value'], hash)
+
+        # Should use the cached value
+        hash2 = self.root.hash
+        self.assertEqual(hash, hash2)
+        for node in self.nodes.values():
+            self.assertEqual(node.compute_hash_called, 1)
+
+        # Should still use the cached value
+        hash3 = self.root.update_hash(force=False)
+        self.assertEqual(hash, hash3)
+        for node in self.nodes.values():
+            self.assertEqual(node.compute_hash_called, 1)
+
+        # Force update of the cached value for a deeply nested node
+        self.root[b'a'][b'b'].update_hash(force=True)
+        for key, node in self.nodes.items():
+            # update_hash rehashes all children
+            if key.startswith(b'root/a/b'):
+                self.assertEqual(node.compute_hash_called, 2)
+            else:
+                self.assertEqual(node.compute_hash_called, 1)
+
+        hash4 = self.root.hash
+        self.assertEqual(hash, hash4)
+        for key, node in self.nodes.items():
+            # update_hash also invalidates all parents
+            if key in (b'root', b'root/a') or key.startswith(b'root/a/b'):
+                self.assertEqual(node.compute_hash_called, 2)
+            else:
+                self.assertEqual(node.compute_hash_called, 1)
+
+    def test_collect(self):
+        collected = self.root.collect()
+        self.assertEqual(len(collected[self.root.type]), len(self.nodes))
+        for node in self.nodes.values():
+            self.assertTrue(node.collected)
+        collected2 = self.root.collect()
+        self.assertEqual(collected2, {})
+
+    def test_get(self):
+        for key in (b'a', b'b', b'c'):
+            self.assertEqual(self.root[key], self.nodes[b'root/' + key])
+
+        with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
+            self.root[b'nonexistent']
+
+    def test_del(self):
+        hash_root = self.root.hash
+        hash_a = self.nodes[b'root/a'].hash
+        del self.root[b'a'][b'c']
+        hash_root2 = self.root.hash
+        hash_a2 = self.nodes[b'root/a'].hash
+
+        self.assertNotEqual(hash_root, hash_root2)
+        self.assertNotEqual(hash_a, hash_a2)
+
+        self.assertEqual(self.nodes[b'root/a/c'].parents, [])
+
+        with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
+            del self.root[b'nonexistent']
+
+    def test_update(self):
+        hash_root = self.root.hash
+        hash_b = self.root[b'b'].hash
+        new_children = {
+            b'c': TestedMerkleNode({'value': b'root/b/new_c'}),
+            b'd': TestedMerkleNode({'value': b'root/b/d'}),
+        }
+
+        # collect all nodes
+        self.root.collect()
+
+        self.root[b'b'].update(new_children)
+
+        # Ensure everyone got reparented
+        self.assertEqual(new_children[b'c'].parents, [self.root[b'b']])
+        self.assertEqual(new_children[b'd'].parents, [self.root[b'b']])
+        self.assertEqual(self.nodes[b'root/b/c'].parents, [])
+
+        hash_root2 = self.root.hash
+        self.assertNotEqual(hash_root, hash_root2)
+        self.assertIn(b'root/b/new_c', hash_root2)
+        self.assertIn(b'root/b/d', hash_root2)
+
+        hash_b2 = self.root[b'b'].hash
+        self.assertNotEqual(hash_b, hash_b2)
+
+        for key, node in self.nodes.items():
+            if key in (b'root', b'root/b'):
+                self.assertEqual(node.compute_hash_called, 2)
+            else:
+                self.assertEqual(node.compute_hash_called, 1)
+
+        # Ensure we collected root, root/b, and both new children
+        collected_after_update = self.root.collect()
+        self.assertCountEqual(
+            collected_after_update[TestedMerkleNode.type],
+            [self.nodes[b'root'].hash, self.nodes[b'root/b'].hash,
+             new_children[b'c'].hash, new_children[b'd'].hash],
+        )
+
+        # Ensure a no-op update doesn't invalidate anything
+        self.root[b'a'][b'b'].update({})
+        self.assertEqual(self.root.collect(), {})
diff --git a/version.txt b/version.txt
index 155f917..99574ed 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-v0.0.17-0-g11de644
\ No newline at end of file
+v0.0.18-0-g34228c5
\ No newline at end of file
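
Note for reviewers: the test_merkle.py suite added above doubles as documentation of the new swh.model.merkle API. Subclasses declare a ``type`` and implement ``compute_hash()`` returning bytes; the base classes provide the cached ``hash`` property, invalidation when children change, and batched ``collect()``. The following is a minimal sketch of a concrete subclass pair relying only on behaviour the tests exercise; ``DemoLeaf``, ``DemoDir``, and the SHA1-based scheme are illustrative assumptions, not the hashing swh.model applies to real objects::

    import hashlib

    from swh.model import merkle


    class DemoLeaf(merkle.MerkleLeaf):
        type = 'demo_leaf'

        def compute_hash(self):
            # A leaf is hashed from its raw content (illustrative scheme).
            return hashlib.sha1(self.data['content']).digest()


    class DemoDir(merkle.MerkleNode):
        type = 'demo_dir'

        def compute_hash(self):
            # A directory is hashed from its sorted (name, child hash) pairs.
            h = hashlib.sha1()
            for name in sorted(self):
                h.update(name + b'\x00' + self[name].hash)
            return h.digest()


    root = DemoDir({'value': b'root'})
    root[b'README'] = DemoLeaf({'content': b'hello\n'})
    root[b'COPYING'] = DemoLeaf({'content': b'GPLv3\n'})

    first = root.hash                              # computed once, then cached
    assert root.update_hash(force=False) == first  # cache hit, no recomputation

    del root[b'COPYING']       # removing a child invalidates all ancestors
    assert root.hash != first  # so the root hash changes

    collected = root.collect()   # one {hash: data} mapping per node type
    assert root.collect() == {}  # nothing new to collect until reset_collect()

The point of the cached ``hash`` property, as the asserts make explicit, is that repeated reads cost nothing until a child is added, removed, or updated, so only the changed paths and their ancestors are ever rehashed.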