diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py new file mode 100644 --- /dev/null +++ b/swh/model/from_disk.py @@ -0,0 +1,341 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import enum +import os +import stat + +from . import hashutil +from .merkle import MerkleLeaf, MerkleNode +from .identifiers import ( + directory_identifier, + identifier_to_bytes as id_to_bytes, + identifier_to_str as id_to_str, +) + + +class DentryPerms(enum.IntEnum): + """Admissible permissions for directory entries.""" + content = 0o100644 + """Content""" + executable_content = 0o100755 + """Executable content (e.g. executable script)""" + symlink = 0o120000 + """Symbolic link""" + directory = 0o040000 + """Directory""" + revision = 0o160000 + """Revision (e.g. submodule)""" + + +def mode_to_perms(mode): + """Convert a file mode to a permission compatible with Software Heritage + directory entries + + Args: + mode (int): a file mode as returned by :func:`os.stat` in + :attr:`os.stat_result.st_mode` + + Returns: + DentryPerms: one of the following values: + :const:`DentryPerms.content`: plain file + :const:`DentryPerms.executable_content`: executable file + :const:`DentryPerms.symlink`: symbolic link + :const:`DentryPerms.directory`: directory + + """ + if stat.S_ISLNK(mode): + return DentryPerms.symlink + if stat.S_ISDIR(mode): + return DentryPerms.directory + else: + # file is executable in any way + if mode & (0o111): + return DentryPerms.executable_content + else: + return DentryPerms.content + + +class Content(MerkleLeaf): + """Representation of a Software Heritage content as a node in a Merkle tree. + + The current Merkle hash for the Content nodes is the `sha1_git`, which + makes it consistent with what :class:`Directory` uses for its own hash + computation. + + """ + __slots__ = [] + type = 'content' + + @classmethod + def from_bytes(cls, *, mode, data): + """Convert data (raw :class:`bytes`) to a Software Heritage content entry + + Args: + mode (int): a file mode (passed to :func:`mode_to_perms`) + data (bytes): raw contents of the file + """ + ret = hashutil.hash_data(data) + ret['length'] = len(data) + ret['perms'] = mode_to_perms(mode) + ret['data'] = data + + return cls(ret) + + @classmethod + def from_symlink(cls, *, path, mode): + """Convert a symbolic link to a Software Heritage content entry""" + return cls.from_bytes(mode=mode, data=os.readlink(path)) + + @classmethod + def from_file(cls, *, path, data=False): + """Compute the Software Heritage content entry corresponding to an on-disk + file. 
+
+        The returned dictionary contains keys useful for both:
+        - loading the content in the archive (hashes, `length`)
+        - using the content as a directory entry in a directory
+
+        Args:
+            path (bytes): path to the file for which we're computing the
+                content entry
+            data (bool): add the file data to the entry
+        """
+        file_stat = os.lstat(path)
+        mode = file_stat.st_mode
+
+        if stat.S_ISLNK(mode):
+            # Symbolic link: return a file whose contents are the link target
+            return cls.from_symlink(path=path, mode=mode)
+        elif not stat.S_ISREG(mode):
+            # not a regular file: return the empty file instead
+            return cls.from_bytes(mode=mode, data=b'')
+
+        length = file_stat.st_size
+
+        if not data:
+            ret = hashutil.hash_path(path)
+        else:
+            chunks = []
+
+            def append_chunk(x, chunks=chunks):
+                chunks.append(x)
+
+            with open(path, 'rb') as fobj:
+                ret = hashutil.hash_file(fobj, length=length,
+                                         chunk_cb=append_chunk)
+
+            ret['data'] = b''.join(chunks)
+
+        ret['perms'] = mode_to_perms(mode)
+        ret['length'] = length
+
+        obj = cls(ret)
+        return obj
+
+    def __repr__(self):
+        return 'Content(id=%s)' % id_to_str(self.hash)
+
+    def compute_hash(self):
+        return self.data['sha1_git']
+
+
+def accept_all_directories(dirname, entries):
+    """Default filter for :func:`Directory.from_disk` accepting all
+    directories
+
+    Args:
+        dirname (bytes): directory name
+        entries (list): directory entries
+    """
+    return True
+
+
+def ignore_empty_directories(dirname, entries):
+    """Filter for :func:`Directory.from_disk` ignoring empty directories
+
+    Args:
+        dirname (bytes): directory name
+        entries (list): directory entries
+    Returns:
+        True if the directory is not empty, False if the directory is empty
+    """
+    return bool(entries)
+
+
+def ignore_named_directories(names, *, case_sensitive=True):
+    """Filter for :func:`Directory.from_disk` to ignore directories named one
+    of names.
+
+    Args:
+        names (list of bytes): names to ignore
+        case_sensitive (bool): whether to do the filtering in a case sensitive
+            way
+    Returns:
+        a directory filter for :func:`Directory.from_disk`
+    """
+    if not case_sensitive:
+        names = [name.lower() for name in names]
+
+    def named_filter(dirname, entries,
+                     names=names, case_sensitive=case_sensitive):
+        if case_sensitive:
+            return dirname not in names
+        else:
+            return dirname.lower() not in names
+
+    return named_filter
+
+
+class Directory(MerkleNode):
+    """Representation of a Software Heritage directory as a node in a Merkle
+    Tree.
+
+    This class can be used to generate, from an on-disk directory, all the
+    objects that need to be sent to the Software Heritage archive.
+
+    The :func:`from_disk` constructor allows you to generate the data
+    structure from a directory on disk. The resulting :class:`Directory` can
+    then be manipulated as a dictionary, using the path as key.
+
+    The :func:`collect` method is used to retrieve all the objects that need
+    to be added to the Software Heritage archive since the last collection,
+    by class (contents and directories).
+
+    When using the dict-like methods to update the contents of the directory,
+    the affected levels of hierarchy are reset and can be collected again
+    using the same method. This enables the efficient collection of updated
+    nodes, for instance when the client is applying diffs.
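+
+    A minimal usage sketch (the on-disk path is a hypothetical example)::
+
+        dir_ = Directory.from_disk(path=b'/tmp/project')
+        objects = dir_.collect()      # {'content': {...}, 'directory': {...}}
+        entry = dir_[b'src/main.py']  # path-based, dict-like access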
+ """ + __slots__ = ['__entries'] + type = 'directory' + + @classmethod + def from_disk(cls, *, path, data=False, + dir_filter=accept_all_directories): + """Compute the Software Heritage objects for a given directory tree + + Args: + path (bytes): the directory to traverse + data (bool): whether to add the data to the content objects + dir_filter (function): a filter to ignore some directories by + name or contents. Takes two arguments: dirname and entries, and + returns True if the directory should be added, False if the + directory should be ignored. + """ + + top_path = path + dirs = {} + + for root, dentries, fentries in os.walk(top_path, topdown=False): + entries = {} + # Join fentries and dentries in the same processing, as symbolic + # links to directories appear in dentries... + for name in fentries + dentries: + path = os.path.join(root, name) + if not os.path.isdir(path) or os.path.islink(path): + content = Content.from_file(path=path, data=data) + entries[name] = content + else: + if dir_filter(name, dirs[path].entries): + entries[name] = dirs[path] + + dirs[root] = cls({'name': os.path.basename(root)}) + dirs[root].update(entries) + + return dirs[top_path] + + def __init__(self, data=None): + super().__init__(data=data) + self.__entries = None + + def invalidate_hash(self): + self.__entries = None + super().invalidate_hash() + + @staticmethod + def child_to_directory_entry(name, child): + if isinstance(child, Directory): + return { + 'type': 'dir', + 'perms': DentryPerms.directory, + 'target': child.hash, + 'name': name, + } + elif isinstance(child, Content): + return { + 'type': 'file', + 'perms': child.data['perms'], + 'target': child.hash, + 'name': name, + } + else: + raise ValueError('unknown child') + + def get_data(self, **kwargs): + return { + 'id': self.hash, + 'entries': self.entries, + } + + @property + def entries(self): + if self.__entries is None: + self.__entries = [ + self.child_to_directory_entry(name, child) + for name, child in self.items() + ] + + return self.__entries + + def compute_hash(self): + return id_to_bytes(directory_identifier({'entries': self.entries})) + + def __getitem__(self, key): + if not isinstance(key, bytes): + raise ValueError('Can only get a bytes from directory') + + # Convenience shortcut + if key == b'': + return self + + if b'/' not in key: + return super().__getitem__(key) + else: + key1, key2 = key.split(b'/', 1) + return super().__getitem__(key1)[key2] + + def __setitem__(self, key, value): + if not isinstance(key, bytes): + raise ValueError('Can only set a bytes directory entry') + if not isinstance(value, (Content, Directory)): + raise ValueError('Can only set a directory entry to a Content or ' + 'Directory') + + if key == b'': + raise ValueError('Directory entry must have a name') + if b'\x00' in key: + raise ValueError('Directory entry name must not contain nul bytes') + + if b'/' not in key: + return super().__setitem__(key, value) + else: + key1, key2 = key.rsplit(b'/', 1) + self[key1].add_child(key2, value) + + def __delitem__(self, key): + if not isinstance(key, bytes): + raise ValueError('Can only delete a bytes directory entry') + + if b'/' not in key: + super().__delitem__(key) + else: + key1, key2 = key.rsplit(b'/', 1) + del super().__getitem__(key1)[key2] + + def __repr__(self): + return 'Directory(%s, id=%s)' % ( + self.data['name'], + id_to_str(self.hash) if self.hash else '?', + ) diff --git a/swh/model/merkle.py b/swh/model/merkle.py new file mode 100644 --- /dev/null +++ b/swh/model/merkle.py @@ -0,0 +1,286 @@ 
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Merkle tree data structure"""
+
+import abc
+import collections.abc
+
+
+def deep_update(left, right):
+    """Recursively update the left mapping with deeply nested values from the
+    right mapping.
+
+    This function is useful to merge the results of several calls to
+    :func:`MerkleNode.collect`.
+
+    Arguments:
+        left: a mapping (modified by the update operation)
+        right: a mapping
+
+    Returns:
+        the left mapping, updated with nested values from the right mapping
+
+    Example:
+        >>> a = {
+        ...     'key1': {
+        ...         'key2': {
+        ...             'key3': 'value1/2/3',
+        ...         },
+        ...     },
+        ... }
+        >>> deep_update(a, {
+        ...     'key1': {
+        ...         'key2': {
+        ...             'key4': 'value1/2/4',
+        ...         },
+        ...     },
+        ... }) == {
+        ...     'key1': {
+        ...         'key2': {
+        ...             'key3': 'value1/2/3',
+        ...             'key4': 'value1/2/4',
+        ...         },
+        ...     },
+        ... }
+        True
+        >>> deep_update(a, {
+        ...     'key1': {
+        ...         'key2': {
+        ...             'key3': 'newvalue1/2/3',
+        ...         },
+        ...     },
+        ... }) == {
+        ...     'key1': {
+        ...         'key2': {
+        ...             'key3': 'newvalue1/2/3',
+        ...             'key4': 'value1/2/4',
+        ...         },
+        ...     },
+        ... }
+        True
+
+    """
+    for key, rvalue in right.items():
+        if isinstance(rvalue, collections.abc.Mapping):
+            new_lvalue = deep_update(left.get(key, {}), rvalue)
+            left[key] = new_lvalue
+        else:
+            left[key] = rvalue
+    return left
+
+
+class MerkleNode(dict, metaclass=abc.ABCMeta):
+    """Representation of a node in a Merkle Tree.
+
+    A (generalized) `Merkle Tree`_ is a tree in which every node is labeled
+    with a hash of its own data and the hash of its children.
+
+    .. _Merkle Tree: https://en.wikipedia.org/wiki/Merkle_tree
+
+    In pseudocode::
+
+        node.hash = hash(node.data
+                         + sum(child.hash for child in node.children))
+
+    This class efficiently implements the Merkle Tree data structure on top of
+    a Python :class:`dict`, minimizing hash computations and new data
+    collections when updating nodes.
+
+    Node data is stored in the :attr:`data` attribute, while (named) children
+    are stored as items of the underlying dictionary.
+
+    Addition, update and removal of objects are instrumented to automatically
+    invalidate the hashes of the current node as well as those of its
+    registered parents; they also reset the collection status of the objects
+    so that updated objects can be collected again.
+
+    The collection of updated data from the tree is implemented through the
+    :func:`collect` function and associated helpers.
+
+    Attributes:
+        data (dict): data associated to the current node
+        parents (list): known parents of the current node
+        collected (bool): whether the current node has been collected
+
+    """
+    __slots__ = ['parents', 'data', '__hash', 'collected']
+
+    type = None
+    """Type of the current node (used as a classifier for :func:`collect`)"""
+
+    def __init__(self, data=None):
+        super().__init__()
+        self.parents = []
+        self.data = data
+        self.__hash = None
+        self.collected = False
+
+    def invalidate_hash(self):
+        """Invalidate the cached hash of the current node."""
+        if not self.__hash:
+            return
+
+        self.__hash = None
+        self.collected = False
+        for parent in self.parents:
+            parent.invalidate_hash()
+
+    def update_hash(self, *, force=False):
+        """Recursively compute the hash of the current node.
+
+        Args:
+            force (bool): invalidate the cache and force the computation for
+                this node and all children.
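+
+        A sketch of the caching behaviour, for a hypothetical ``node``
+        instance of a concrete subclass::
+
+            first = node.update_hash()             # computes and caches
+            again = node.update_hash()             # served from the cache
+            fresh = node.update_hash(force=True)   # recomputes the subtree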
+ """ + if self.__hash and not force: + return self.__hash + + if force: + self.invalidate_hash() + + for child in self.values(): + child.update_hash(force=force) + + self.__hash = self.compute_hash() + return self.__hash + + @property + def hash(self): + """The hash of the current node, as calculated by + :func:`compute_hash`. + """ + return self.update_hash() + + @abc.abstractmethod + def compute_hash(self): + """Compute the hash of the current node. + + The hash should depend on the data of the node, as well as on hashes + of the children nodes. + """ + raise NotImplementedError('Must implement compute_hash method') + + def __setitem__(self, name, new_child): + """Add a child, invalidating the current hash""" + self.invalidate_hash() + + super().__setitem__(name, new_child) + + new_child.parents.append(self) + + def __delitem__(self, name): + """Remove a child, invalidating the current hash""" + if name in self: + self.invalidate_hash() + self[name].parents.remove(self) + super().__delitem__(name) + else: + raise KeyError(name) + + def update(self, new_children): + """Add several named children from a dictionary""" + if not new_children: + return + + self.invalidate_hash() + + for name, new_child in new_children.items(): + new_child.parents.append(self) + if name in self: + self[name].parents.remove(self) + + super().update(new_children) + + def get_data(self, **kwargs): + """Retrieve and format the collected data for the current node, for use by + :func:`collect`. + + Can be overridden, for instance when you want the collected data to + contain information about the child nodes. + + Arguments: + kwargs: allow subclasses to alter behaviour depending on how + :func:`collect` is called. + + Returns: + data formatted for :func:`collect` + """ + return self.data + + def collect_node(self, **kwargs): + """Collect the data for the current node, for use by :func:`collect`. + + Arguments: + kwargs: passed as-is to :func:`get_data`. + + Returns: + A :class:`dict` compatible with :func:`collect`. + """ + if not self.collected: + self.collected = True + return {self.type: {self.hash: self.get_data(**kwargs)}} + else: + return {} + + def collect(self, **kwargs): + """Collect the data for all nodes in the subtree rooted at `self`. + + The data is deduplicated by type and by hash. + + Arguments: + kwargs: passed as-is to :func:`get_data`. + + Returns: + A :class:`dict` with the following structure:: + + { + 'typeA': { + node1.hash: node1.get_data(), + node2.hash: node2.get_data(), + }, + 'typeB': { + node3.hash: node3.get_data(), + ... + }, + ... + } + """ + ret = self.collect_node(**kwargs) + for child in self.values(): + deep_update(ret, child.collect(**kwargs)) + + return ret + + def reset_collect(self): + """Recursively unmark collected nodes in the subtree rooted at `self`. + + This lets the caller use :func:`collect` again. + """ + self.collected = False + + for child in self.values(): + child.reset_collect() + + +class MerkleLeaf(MerkleNode): + """A leaf to a Merkle tree. + + A Merkle leaf is simply a Merkle node with children disabled. + """ + __slots__ = [] + + def __setitem__(self, name, child): + raise ValueError('%s is a leaf' % self.__class__.__name__) + + def __getitem__(self, name): + raise ValueError('%s is a leaf' % self.__class__.__name__) + + def __delitem__(self, name): + raise ValueError('%s is a leaf' % self.__class__.__name__) + + def update(self, new_children): + """Children update operation. 
Disabled for leaves.""" + raise ValueError('%s is a leaf' % self.__class__.__name__) diff --git a/swh/model/tests/generate_testdata_from_disk.py b/swh/model/tests/generate_testdata_from_disk.py new file mode 100644 --- /dev/null +++ b/swh/model/tests/generate_testdata_from_disk.py @@ -0,0 +1,92 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from operator import itemgetter +import os +import sys + +from swh.model.from_disk import Directory, DentryPerms +from swh.model.hashutil import ALGORITHMS, hash_to_hex + + +def generate_from_directory(varname, directory, indent=0): + """Generate test data from a given directory""" + def get_data(member, path): + yield (path, member.get_data()) + if isinstance(member, Directory): + for name, child in member.items(): + yield from get_data(child, os.path.join(path, name)) + + data = dict(get_data(directory, b'')) + out = [] + + def format_hash(h, indent=0): + spindent = ' ' * indent + if len(h) > 20: + cutoff = len(h)//2 + parts = h[:cutoff], h[cutoff:] + else: + parts = [h] + + out.append('hash_to_bytes(\n') + for part in parts: + out.append(spindent + ' %s\n' % repr(hash_to_hex(part))) + out.append(spindent + ')') + + def format_dict_items(d, indent=0): + spindent = ' ' * indent + for key, value in sorted(d.items()): + if isinstance(key, bytes): + out.append(spindent + repr(key) + ': {\n') + format_dict_items(value, indent=indent + 4) + out.append(spindent + '}') + else: + out.append(spindent + repr(key) + ': ') + if key == 'entries': + if not value: + out.append('[]') + else: + out.append('[') + last_index = len(value) - 1 + for i, entry in enumerate( + sorted(value, key=itemgetter('name'))): + if i: + out.append(' ') + out.append('{\n') + format_dict_items(entry, indent=indent + 4) + if i != last_index: + out.append(spindent + '},') + out.append(spindent + '}]') + elif key in ALGORITHMS | {'id', 'target'}: + format_hash(value, indent=indent) + elif isinstance(value, DentryPerms): + out.append(str(value)) + else: + out.append(repr(value)) + out.append(',\n') + + spindent = ' ' * indent + out.append(spindent + '%s = {\n' % varname) + format_dict_items(data, indent=4 + indent) + out.append(spindent + '}') + + return ''.join(out) + + +if __name__ == '__main__': + if not sys.argv[1:]: + print("Usage: %s dir1 dir2" % sys.argv[0], file=sys.stderr) + exit(2) + + for dirname in sys.argv[1:]: + basename = os.path.basename(dirname) + varname = 'expected_%s' % basename + testdata = generate_from_directory( + varname, + Directory.from_disk(path=os.fsencode(dirname)), + indent=8 + ) + print(testdata) + print() diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py new file mode 100644 --- /dev/null +++ b/swh/model/tests/test_from_disk.py @@ -0,0 +1,658 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import tarfile +import tempfile +import unittest + +from swh.model import from_disk +from swh.model.from_disk import Content, Directory, DentryPerms +from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes + + +class ModeToPerms(unittest.TestCase): + def setUp(self): + super().setUp() + + # Generate a full 
permissions map + self.perms_map = {} + + # Symlinks + for i in range(0o120000, 0o127777 + 1): + self.perms_map[i] = DentryPerms.symlink + + # Directories + for i in range(0o040000, 0o047777 + 1): + self.perms_map[i] = DentryPerms.directory + + # Other file types: socket, regular file, block device, character + # device, fifo all map to regular files + for ft in [0o140000, 0o100000, 0o060000, 0o020000, 0o010000]: + for i in range(ft, ft + 0o7777 + 1): + if i & 0o111: + # executable bits are set + self.perms_map[i] = DentryPerms.executable_content + else: + self.perms_map[i] = DentryPerms.content + + def test_exhaustive_mode_to_perms(self): + for fmode, perm in self.perms_map.items(): + self.assertEqual(perm, from_disk.mode_to_perms(fmode)) + + +class DataMixin: + maxDiff = None + + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory( + prefix=b'swh.model.from_disk' + ) + self.contents = { + b'file': { + 'data': b'42\n', + 'sha1': hash_to_bytes( + '34973274ccef6ab4dfaaf86599792fa9c3fe4689' + ), + 'sha256': hash_to_bytes( + '084c799cd551dd1d8d5c5f9a5d593b2e' + '931f5e36122ee5c793c1d08a19839cc0' + ), + 'sha1_git': hash_to_bytes( + 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'), + 'blake2s256': hash_to_bytes( + 'd5fe1939576527e42cfd76a9455a2432' + 'fe7f56669564577dd93c4280e76d661d' + ), + 'length': 3, + 'mode': 0o100644 + }, + } + + self.symlinks = { + b'symlink': { + 'data': b'target', + 'blake2s256': hash_to_bytes( + '595d221b30fdd8e10e2fdf18376e688e' + '9f18d56fd9b6d1eb6a822f8c146c6da6' + ), + 'sha1': hash_to_bytes( + '0e8a3ad980ec179856012b7eecf4327e99cd44cd' + ), + 'sha1_git': hash_to_bytes( + '1de565933b05f74c75ff9a6520af5f9f8a5a2f1d' + ), + 'sha256': hash_to_bytes( + '34a04005bcaf206eec990bd9637d9fdb' + '6725e0a0c0d4aebf003f17f4c956eb5c' + ), + 'length': 6, + 'perms': DentryPerms.symlink, + } + } + + self.specials = { + b'fifo': os.mkfifo, + b'devnull': lambda path: os.mknod(path, device=os.makedev(1, 3)), + } + + self.empty_content = { + 'data': b'', + 'length': 0, + 'blake2s256': hash_to_bytes( + '69217a3079908094e11121d042354a7c' + '1f55b6482ca1a51e1b250dfd1ed0eef9' + ), + 'sha1': hash_to_bytes( + 'da39a3ee5e6b4b0d3255bfef95601890afd80709' + ), + 'sha1_git': hash_to_bytes( + 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391' + ), + 'sha256': hash_to_bytes( + 'e3b0c44298fc1c149afbf4c8996fb924' + '27ae41e4649b934ca495991b7852b855' + ), + 'perms': DentryPerms.content, + } + + self.empty_directory = { + 'id': hash_to_bytes( + '4b825dc642cb6eb9a060e54bf8d69288fbee4904' + ), + 'entries': [], + } + + # Generated with generate_testdata_from_disk + self.tarball_contents = { + b'': { + 'entries': [{ + 'name': b'bar', + 'perms': DentryPerms.directory, + 'target': hash_to_bytes( + '3c1f578394f4623f74a0ba7fe761729f59fc6ec4' + ), + 'type': 'dir', + }, { + 'name': b'empty-folder', + 'perms': DentryPerms.directory, + 'target': hash_to_bytes( + '4b825dc642cb6eb9a060e54bf8d69288fbee4904' + ), + 'type': 'dir', + }, { + 'name': b'foo', + 'perms': DentryPerms.directory, + 'target': hash_to_bytes( + '2b41c40f0d1fbffcba12497db71fba83fcca96e5' + ), + 'type': 'dir', + }, { + 'name': b'link-to-another-quote', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + '7d5c08111e21c8a9f71540939998551683375fad' + ), + 'type': 'file', + }, { + 'name': b'link-to-binary', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + 'e86b45e538d9b6888c969c89fbd22a85aa0e0366' + ), + 'type': 'file', + }, { + 'name': b'link-to-foo', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + 
'19102815663d23f8b75a47e7a01965dcdc96468c' + ), + 'type': 'file', + }, { + 'name': b'some-binary', + 'perms': DentryPerms.executable_content, + 'target': hash_to_bytes( + '68769579c3eaadbe555379b9c3538e6628bae1eb' + ), + 'type': 'file', + }], + 'id': hash_to_bytes( + 'e8b0f1466af8608c8a3fb9879db172b887e80759' + ), + }, + b'bar': { + 'entries': [{ + 'name': b'barfoo', + 'perms': DentryPerms.directory, + 'target': hash_to_bytes( + 'c3020f6bf135a38c6df3afeb5fb38232c5e07087' + ), + 'type': 'dir', + }], + 'id': hash_to_bytes( + '3c1f578394f4623f74a0ba7fe761729f59fc6ec4' + ), + }, + b'bar/barfoo': { + 'entries': [{ + 'name': b'another-quote.org', + 'perms': DentryPerms.content, + 'target': hash_to_bytes( + '133693b125bad2b4ac318535b84901ebb1f6b638' + ), + 'type': 'file', + }], + 'id': hash_to_bytes( + 'c3020f6bf135a38c6df3afeb5fb38232c5e07087' + ), + }, + b'bar/barfoo/another-quote.org': { + 'blake2s256': hash_to_bytes( + 'd26c1cad82d43df0bffa5e7be11a60e3' + '4adb85a218b433cbce5278b10b954fe8' + ), + 'length': 72, + 'perms': DentryPerms.content, + 'sha1': hash_to_bytes( + '90a6138ba59915261e179948386aa1cc2aa9220a' + ), + 'sha1_git': hash_to_bytes( + '133693b125bad2b4ac318535b84901ebb1f6b638' + ), + 'sha256': hash_to_bytes( + '3db5ae168055bcd93a4d08285dc99ffe' + 'e2883303b23fac5eab850273a8ea5546' + ), + }, + b'empty-folder': { + 'entries': [], + 'id': hash_to_bytes( + '4b825dc642cb6eb9a060e54bf8d69288fbee4904' + ), + }, + b'foo': { + 'entries': [{ + 'name': b'barfoo', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + '8185dfb2c0c2c597d16f75a8a0c37668567c3d7e' + ), + 'type': 'file', + }, { + 'name': b'quotes.md', + 'perms': DentryPerms.content, + 'target': hash_to_bytes( + '7c4c57ba9ff496ad179b8f65b1d286edbda34c9a' + ), + 'type': 'file', + }, { + 'name': b'rel-link-to-barfoo', + 'perms': DentryPerms.symlink, + 'target': hash_to_bytes( + 'acac326ddd63b0bc70840659d4ac43619484e69f' + ), + 'type': 'file', + }], + 'id': hash_to_bytes( + '2b41c40f0d1fbffcba12497db71fba83fcca96e5' + ), + }, + b'foo/barfoo': { + 'blake2s256': hash_to_bytes( + 'e1252f2caa4a72653c4efd9af871b62b' + 'f2abb7bb2f1b0e95969204bd8a70d4cd' + ), + 'data': b'bar/barfoo', + 'length': 10, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + '9057ee6d0162506e01c4d9d5459a7add1fedac37' + ), + 'sha1_git': hash_to_bytes( + '8185dfb2c0c2c597d16f75a8a0c37668567c3d7e' + ), + 'sha256': hash_to_bytes( + '29ad3f5725321b940332c78e403601af' + 'ff61daea85e9c80b4a7063b6887ead68' + ), + }, + b'foo/quotes.md': { + 'blake2s256': hash_to_bytes( + 'bf7ce4fe304378651ee6348d3e9336ed' + '5ad603d33e83c83ba4e14b46f9b8a80b' + ), + 'length': 66, + 'perms': DentryPerms.content, + 'sha1': hash_to_bytes( + '1bf0bb721ac92c18a19b13c0eb3d741cbfadebfc' + ), + 'sha1_git': hash_to_bytes( + '7c4c57ba9ff496ad179b8f65b1d286edbda34c9a' + ), + 'sha256': hash_to_bytes( + 'caca942aeda7b308859eb56f909ec96d' + '07a499491690c453f73b9800a93b1659' + ), + }, + b'foo/rel-link-to-barfoo': { + 'blake2s256': hash_to_bytes( + 'd9c327421588a1cf61f316615005a2e9' + 'c13ac3a4e96d43a24138d718fa0e30db' + ), + 'data': b'../bar/barfoo', + 'length': 13, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + 'dc51221d308f3aeb2754db48391b85687c2869f4' + ), + 'sha1_git': hash_to_bytes( + 'acac326ddd63b0bc70840659d4ac43619484e69f' + ), + 'sha256': hash_to_bytes( + '8007d20db2af40435f42ddef4b8ad76b' + '80adbec26b249fdf0473353f8d99df08' + ), + }, + b'link-to-another-quote': { + 'blake2s256': hash_to_bytes( + '2d0e73cea01ba949c1022dc10c8a43e6' + '6180639662e5dc2737b843382f7b1910' + ), 
+ 'data': b'bar/barfoo/another-quote.org', + 'length': 28, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + 'cbeed15e79599c90de7383f420fed7acb48ea171' + ), + 'sha1_git': hash_to_bytes( + '7d5c08111e21c8a9f71540939998551683375fad' + ), + 'sha256': hash_to_bytes( + 'e6e17d0793aa750a0440eb9ad5b80b25' + '8076637ef0fb68f3ac2e59e4b9ac3ba6' + ), + }, + b'link-to-binary': { + 'blake2s256': hash_to_bytes( + '9ce18b1adecb33f891ca36664da676e1' + '2c772cc193778aac9a137b8dc5834b9b' + ), + 'data': b'some-binary', + 'length': 11, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + 'd0248714948b3a48a25438232a6f99f0318f59f1' + ), + 'sha1_git': hash_to_bytes( + 'e86b45e538d9b6888c969c89fbd22a85aa0e0366' + ), + 'sha256': hash_to_bytes( + '14126e97d83f7d261c5a6889cee73619' + '770ff09e40c5498685aba745be882eff' + ), + }, + b'link-to-foo': { + 'blake2s256': hash_to_bytes( + '08d6cad88075de8f192db097573d0e82' + '9411cd91eb6ec65e8fc16c017edfdb74' + ), + 'data': b'foo', + 'length': 3, + 'perms': DentryPerms.symlink, + 'sha1': hash_to_bytes( + '0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33' + ), + 'sha1_git': hash_to_bytes( + '19102815663d23f8b75a47e7a01965dcdc96468c' + ), + 'sha256': hash_to_bytes( + '2c26b46b68ffc68ff99b453c1d304134' + '13422d706483bfa0f98a5e886266e7ae' + ), + }, + b'some-binary': { + 'blake2s256': hash_to_bytes( + '922e0f7015035212495b090c27577357' + 'a740ddd77b0b9e0cd23b5480c07a18c6' + ), + 'length': 5, + 'perms': DentryPerms.executable_content, + 'sha1': hash_to_bytes( + '0bbc12d7f4a2a15b143da84617d95cb223c9b23c' + ), + 'sha1_git': hash_to_bytes( + '68769579c3eaadbe555379b9c3538e6628bae1eb' + ), + 'sha256': hash_to_bytes( + 'bac650d34a7638bb0aeb5342646d24e3' + 'b9ad6b44c9b383621faa482b990a367d' + ), + }, + } + + def tearDown(self): + self.tmpdir.cleanup() + + def assertContentEqual(self, left, right, check_data=False): # NoQA + if not isinstance(left, Content): + raise ValueError('%s is not a Content' % left) + if isinstance(right, Content): + right = right.get_data() + + keys = DEFAULT_ALGORITHMS | { + 'length', + 'perms', + } + if check_data: + keys |= {'data'} + + failed = [] + for key in keys: + try: + lvalue = left.data[key] + if key == 'perms' and 'perms' not in right: + rvalue = from_disk.mode_to_perms(right['mode']) + else: + rvalue = right[key] + except KeyError: + failed.append(key) + continue + + if lvalue != rvalue: + failed.append(key) + + if failed: + raise self.failureException( + 'Content mismatched:\n' + + '\n'.join( + 'content[%s] = %r != %r' % ( + key, left.data.get(key), right.get(key)) + for key in failed + ) + ) + + def assertDirectoryEqual(self, left, right): # NoQA + if not isinstance(left, Directory): + raise ValueError('%s is not a Directory' % left) + if isinstance(right, Directory): + right = right.get_data() + + return self.assertCountEqual(left.entries, right['entries']) + + def make_contents(self, directory): + for filename, content in self.contents.items(): + path = os.path.join(directory, filename) + with open(path, 'wb') as f: + f.write(content['data']) + os.chmod(path, content['mode']) + + def make_symlinks(self, directory): + for filename, symlink in self.symlinks.items(): + path = os.path.join(directory, filename) + os.symlink(symlink['data'], path) + + def make_specials(self, directory): + for filename, fn in self.specials.items(): + path = os.path.join(directory, filename) + fn(path) + + def make_from_tarball(self, directory): + tarball = os.path.join(os.fsencode(os.path.dirname(__file__)), + b'../../../..', + b'swh-storage-testdata', + 
b'dir-folders', + b'sample-folder.tgz') + + with tarfile.open(tarball, 'r:gz') as f: + f.extractall(os.fsdecode(directory)) + + +class TestContent(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + + def test_data_to_content(self): + for filename, content in self.contents.items(): + conv_content = Content.from_bytes(mode=content['mode'], + data=content['data']) + self.assertContentEqual(conv_content, content) + + +class SymlinkToContent(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + self.make_symlinks(self.tmpdir.name) + + def test_symlink_to_content(self): + for filename, symlink in self.symlinks.items(): + path = os.path.join(self.tmpdir.name, filename) + perms = 0o120000 + conv_content = Content.from_symlink(path=path, mode=perms) + self.assertContentEqual(conv_content, symlink) + + +class FileToContent(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + self.make_contents(self.tmpdir.name) + self.make_symlinks(self.tmpdir.name) + self.make_specials(self.tmpdir.name) + + def test_file_to_content(self): + for data in [False, True]: + for filename, symlink in self.symlinks.items(): + path = os.path.join(self.tmpdir.name, filename) + conv_content = Content.from_file(path=path, data=data) + self.assertContentEqual(conv_content, symlink, check_data=data) + + for filename, content in self.contents.items(): + path = os.path.join(self.tmpdir.name, filename) + conv_content = Content.from_file(path=path, data=data) + self.assertContentEqual(conv_content, content, check_data=data) + + for filename in self.specials: + path = os.path.join(self.tmpdir.name, filename) + conv_content = Content.from_file(path=path, data=data) + self.assertContentEqual(conv_content, self.empty_content, + check_data=data) + + +class DirectoryToObjects(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + contents = os.path.join(self.tmpdir.name, b'contents') + os.mkdir(contents) + self.make_contents(contents) + symlinks = os.path.join(self.tmpdir.name, b'symlinks') + os.mkdir(symlinks) + self.make_symlinks(symlinks) + specials = os.path.join(self.tmpdir.name, b'specials') + os.mkdir(specials) + self.make_specials(specials) + empties = os.path.join(self.tmpdir.name, b'empty1', b'empty2') + os.makedirs(empties) + + def test_directory_to_objects(self): + directory = Directory.from_disk(path=self.tmpdir.name) + + for name, value in self.contents.items(): + self.assertContentEqual(directory[b'contents/' + name], value) + + for name, value in self.symlinks.items(): + self.assertContentEqual(directory[b'symlinks/' + name], value) + + for name in self.specials: + self.assertContentEqual( + directory[b'specials/' + name], + self.empty_content, + ) + + self.assertEqual( + directory[b'empty1/empty2'].get_data(), + self.empty_directory, + ) + + # Raise on non existent file + with self.assertRaisesRegex(KeyError, "b'nonexistent'"): + directory[b'empty1/nonexistent'] + + # Raise on non existent directory + with self.assertRaisesRegex(KeyError, "b'nonexistentdir'"): + directory[b'nonexistentdir/file'] + + objs = directory.collect() + + self.assertCountEqual(['content', 'directory'], objs) + + self.assertEqual(len(objs['directory']), 6) + self.assertEqual(len(objs['content']), + len(self.contents) + + len(self.symlinks) + + 1) + + def test_directory_to_objects_ignore_empty(self): + directory = Directory.from_disk( + path=self.tmpdir.name, + dir_filter=from_disk.ignore_empty_directories + ) + + for name, value in self.contents.items(): + 
self.assertContentEqual(directory[b'contents/' + name], value) + + for name, value in self.symlinks.items(): + self.assertContentEqual(directory[b'symlinks/' + name], value) + + for name in self.specials: + self.assertContentEqual( + directory[b'specials/' + name], + self.empty_content, + ) + + # empty directories have been ignored recursively + with self.assertRaisesRegex(KeyError, "b'empty1'"): + directory[b'empty1'] + with self.assertRaisesRegex(KeyError, "b'empty1'"): + directory[b'empty1/empty2'] + + objs = directory.collect() + + self.assertCountEqual(['content', 'directory'], objs) + + self.assertEqual(len(objs['directory']), 4) + self.assertEqual(len(objs['content']), + len(self.contents) + + len(self.symlinks) + + 1) + + def test_directory_to_objects_ignore_name(self): + directory = Directory.from_disk( + path=self.tmpdir.name, + dir_filter=from_disk.ignore_named_directories([b'symlinks']) + ) + for name, value in self.contents.items(): + self.assertContentEqual(directory[b'contents/' + name], value) + + for name in self.specials: + self.assertContentEqual( + directory[b'specials/' + name], + self.empty_content, + ) + + self.assertEqual( + directory[b'empty1/empty2'].get_data(), + self.empty_directory, + ) + + with self.assertRaisesRegex(KeyError, "b'symlinks'"): + directory[b'symlinks'] + + objs = directory.collect() + + self.assertCountEqual(['content', 'directory'], objs) + + self.assertEqual(len(objs['directory']), 5) + self.assertEqual(len(objs['content']), + len(self.contents) + + 1) + + +class TarballTest(DataMixin, unittest.TestCase): + def setUp(self): + super().setUp() + self.make_from_tarball(self.tmpdir.name) + + def test_contents_match(self): + directory = Directory.from_disk( + path=os.path.join(self.tmpdir.name, b'sample-folder') + ) + + for name, data in self.tarball_contents.items(): + obj = directory[name] + if isinstance(obj, Content): + self.assertContentEqual(obj, data) + elif isinstance(obj, Directory): + self.assertDirectoryEqual(obj, data) + else: + raise self.failureException('Unknown type for %s' % obj) diff --git a/swh/model/tests/test_merkle.py b/swh/model/tests/test_merkle.py new file mode 100644 --- /dev/null +++ b/swh/model/tests/test_merkle.py @@ -0,0 +1,229 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest + +from swh.model import merkle + + +class TestedMerkleNode(merkle.MerkleNode): + type = 'tested_merkle_node_type' + + def __init__(self, data): + super().__init__(data) + self.compute_hash_called = 0 + + def compute_hash(self): + self.compute_hash_called += 1 + child_data = [ + child + b'=' + self[child].hash + for child in sorted(self) + ] + + return ( + b'hash(' + + b', '.join([self.data['value']] + child_data) + + b')' + ) + + +class TestedMerkleLeaf(merkle.MerkleLeaf): + type = 'tested_merkle_leaf_type' + + def __init__(self, data): + super().__init__(data) + self.compute_hash_called = 0 + + def compute_hash(self): + self.compute_hash_called += 1 + return b'hash(' + self.data['value'] + b')' + + +class TestMerkleLeaf(unittest.TestCase): + def setUp(self): + self.data = {'value': b'value'} + self.instance = TestedMerkleLeaf(self.data) + + def test_hash(self): + self.assertEqual(self.instance.compute_hash_called, 0) + instance_hash = self.instance.hash + self.assertEqual(self.instance.compute_hash_called, 1) + instance_hash2 = 
self.instance.hash
+        self.assertEqual(self.instance.compute_hash_called, 1)
+        self.assertEqual(instance_hash, instance_hash2)
+
+    def test_data(self):
+        self.assertEqual(self.instance.get_data(), self.data)
+
+    def test_collect(self):
+        collected = self.instance.collect()
+        self.assertEqual(
+            collected, {
+                self.instance.type: {
+                    self.instance.hash: self.instance.get_data(),
+                },
+            },
+        )
+        collected2 = self.instance.collect()
+        self.assertEqual(collected2, {})
+        self.instance.reset_collect()
+        collected3 = self.instance.collect()
+        self.assertEqual(collected, collected3)
+
+    def test_leaf(self):
+        with self.assertRaisesRegex(ValueError, 'is a leaf'):
+            self.instance[b'key1'] = 'Test'
+
+        with self.assertRaisesRegex(ValueError, 'is a leaf'):
+            del self.instance[b'key1']
+
+        with self.assertRaisesRegex(ValueError, 'is a leaf'):
+            self.instance[b'key1']
+
+        with self.assertRaisesRegex(ValueError, 'is a leaf'):
+            self.instance.update(self.data)
+
+
+class TestMerkleNode(unittest.TestCase):
+    maxDiff = None
+
+    def setUp(self):
+        self.root = TestedMerkleNode({'value': b'root'})
+        self.nodes = {b'root': self.root}
+        for i in (b'a', b'b', b'c'):
+            value = b'root/' + i
+            node = TestedMerkleNode({
+                'value': value,
+            })
+            self.root[i] = node
+            self.nodes[value] = node
+            for j in (b'a', b'b', b'c'):
+                value2 = value + b'/' + j
+                node2 = TestedMerkleNode({
+                    'value': value2,
+                })
+                node[j] = node2
+                self.nodes[value2] = node2
+                for k in (b'a', b'b', b'c'):
+                    value3 = value2 + b'/' + k
+                    node3 = TestedMerkleNode({
+                        'value': value3,
+                    })
+                    node2[k] = node3
+                    self.nodes[value3] = node3
+
+    def test_hash(self):
+        for node in self.nodes.values():
+            self.assertEqual(node.compute_hash_called, 0)
+
+        # Root hash will compute hash for all the nodes
+        hash = self.root.hash
+        for node in self.nodes.values():
+            self.assertEqual(node.compute_hash_called, 1)
+            self.assertIn(node.data['value'], hash)
+
+        # Should use the cached value
+        hash2 = self.root.hash
+        self.assertEqual(hash, hash2)
+        for node in self.nodes.values():
+            self.assertEqual(node.compute_hash_called, 1)
+
+        # Should still use the cached value
+        hash3 = self.root.update_hash(force=False)
+        self.assertEqual(hash, hash3)
+        for node in self.nodes.values():
+            self.assertEqual(node.compute_hash_called, 1)
+
+        # Force update of the cached value for a deeply nested node
+        self.root[b'a'][b'b'].update_hash(force=True)
+        for key, node in self.nodes.items():
+            # update_hash rehashes all children
+            if key.startswith(b'root/a/b'):
+                self.assertEqual(node.compute_hash_called, 2)
+            else:
+                self.assertEqual(node.compute_hash_called, 1)
+
+        hash4 = self.root.hash
+        self.assertEqual(hash, hash4)
+        for key, node in self.nodes.items():
+            # update_hash also invalidates all parents
+            if key in (b'root', b'root/a') or key.startswith(b'root/a/b'):
+                self.assertEqual(node.compute_hash_called, 2)
+            else:
+                self.assertEqual(node.compute_hash_called, 1)
+
+    def test_collect(self):
+        collected = self.root.collect()
+        self.assertEqual(len(collected[self.root.type]), len(self.nodes))
+        for node in self.nodes.values():
+            self.assertTrue(node.collected)
+        collected2 = self.root.collect()
+        self.assertEqual(collected2, {})
+
+    def test_get(self):
+        for key in (b'a', b'b', b'c'):
+            self.assertEqual(self.root[key], self.nodes[b'root/' + key])
+
+        with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
+            self.root[b'nonexistent']
+
+    def test_del(self):
+        hash_root = self.root.hash
+        hash_a = self.nodes[b'root/a'].hash
+        del self.root[b'a'][b'c']
+        hash_root2 = self.root.hash
+        hash_a2 = self.nodes[b'root/a'].hash
+
+        self.assertNotEqual(hash_root, hash_root2)
+        self.assertNotEqual(hash_a, hash_a2)
+
+        self.assertEqual(self.nodes[b'root/a/c'].parents, [])
+
+        with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
+            del self.root[b'nonexistent']
+
+    def test_update(self):
+        hash_root = self.root.hash
+        hash_b = self.root[b'b'].hash
+        new_children = {
+            b'c': TestedMerkleNode({'value': b'root/b/new_c'}),
+            b'd': TestedMerkleNode({'value': b'root/b/d'}),
+        }
+
+        # collect all nodes
+        self.root.collect()
+
+        self.root[b'b'].update(new_children)
+
+        # Ensure everyone got reparented
+        self.assertEqual(new_children[b'c'].parents, [self.root[b'b']])
+        self.assertEqual(new_children[b'd'].parents, [self.root[b'b']])
+        self.assertEqual(self.nodes[b'root/b/c'].parents, [])
+
+        hash_root2 = self.root.hash
+        self.assertNotEqual(hash_root, hash_root2)
+        self.assertIn(b'root/b/new_c', hash_root2)
+        self.assertIn(b'root/b/d', hash_root2)
+
+        hash_b2 = self.root[b'b'].hash
+        self.assertNotEqual(hash_b, hash_b2)
+
+        for key, node in self.nodes.items():
+            if key in (b'root', b'root/b'):
+                self.assertEqual(node.compute_hash_called, 2)
+            else:
+                self.assertEqual(node.compute_hash_called, 1)
+
+        # Ensure we collected root, root/b, and both new children
+        collected_after_update = self.root.collect()
+        self.assertCountEqual(
+            collected_after_update[TestedMerkleNode.type],
+            [self.nodes[b'root'].hash, self.nodes[b'root/b'].hash,
+             new_children[b'c'].hash, new_children[b'd'].hash],
+        )
+
+        # test that a noop update doesn't invalidate anything
+        self.root[b'a'][b'b'].update({})
+        self.assertEqual(self.root.collect(), {})
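
A minimal end-to-end sketch of how the new API composes, assuming a local
source tree at a hypothetical path (the path and the `.git`/`.svn` exclusions
are illustrative only, not part of the patch):

    from swh.model.from_disk import Directory, ignore_named_directories

    # Walk the tree bottom-up, keeping file data in the content entries,
    # and skip VCS metadata directories by name.
    directory = Directory.from_disk(
        path=b'/srv/src/my-project',  # hypothetical path
        data=True,
        dir_filter=ignore_named_directories([b'.git', b'.svn']),
    )

    # Deduplicated objects to load, keyed by type, then by Merkle hash.
    objects = directory.collect()
    print(len(objects['directory']), 'directories')
    print(len(objects['content']), 'contents')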