Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9343678
D248.id825.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
35 KB
Subscribers
None
D248.id825.diff
View Options
diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
new file mode 100644
--- /dev/null
+++ b/swh/model/from_disk.py
@@ -0,0 +1,311 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import enum
+import os
+import stat
+
+from . import hashutil
+from .merkle import MerkleLeaf, MerkleNode
+from .identifiers import (
+ directory_identifier,
+ identifier_to_bytes as id_to_bytes,
+ identifier_to_str as id_to_str,
+)
+
+
class DentryPerms(enum.IntEnum):
    """Admissible permissions for directory entries."""
    content = 0o100644
    """Content"""
    executable_content = 0o100755
    """Executable content (e.g. executable script)"""
    symlink = 0o120000
    """Symbolic link"""
    directory = 0o040000
    """Directory"""
    revision = 0o160000
    """Revision (e.g. submodule)"""


def mode_to_perms(mode):
    """Convert a file mode to a permission compatible with Software Heritage
    directory entries

    Args:
        mode (int): a file mode as returned by :func:`os.stat` in
            :attr:`os.stat_result.st_mode`

    Returns:
        DentryPerms: one of the following values:
            :const:`DentryPerms.content`: plain file
            :const:`DentryPerms.executable_content`: executable file
            :const:`DentryPerms.symlink`: symbolic link
            :const:`DentryPerms.directory`: directory

    """
    if stat.S_ISLNK(mode):
        return DentryPerms.symlink
    if stat.S_ISDIR(mode):
        return DentryPerms.directory
    # Every other file type is treated as plain content; any executable
    # bit makes it executable content.
    if mode & 0o111:
        return DentryPerms.executable_content
    return DentryPerms.content
+
+
class Content(MerkleLeaf):
    """Representation of a Software Heritage content as a node in a Merkle tree.

    The current data structure uses the `sha1_git` hash as a key.

    """
    __slots__ = []
    type = 'content'

    @classmethod
    def from_bytes(cls, *, mode, data):
        """Convert data (raw :class:`bytes`) to a Software Heritage content entry

        Args:
            mode (int): a file mode (passed to :func:`mode_to_perms`)
            data (bytes): raw contents of the file
        """
        entry = hashutil.hash_data(data)
        entry.update({
            'length': len(data),
            'perms': mode_to_perms(mode),
            'data': data,
        })
        return cls(entry)

    @classmethod
    def from_symlink(cls, *, path, mode):
        """Convert a symbolic link to a Software Heritage content entry.

        The entry's data is the (raw bytes) target of the link.
        """
        return cls.from_bytes(mode=mode, data=os.readlink(path))

    @classmethod
    def from_file(cls, *, path, data=False):
        """Compute the Software Heritage content entry corresponding to an on-disk
        file.

        The returned dictionary contains keys useful for both:
        - loading the content in the archive (hashes, `length`)
        - using the content as a directory entry in a directory

        Args:
            path (bytes): path to the file for which we're computing the
                content entry
            data (bool): add the file data to the entry
        """
        file_stat = os.lstat(path)
        mode = file_stat.st_mode

        if stat.S_ISLNK(mode):
            # Symbolic link: return a file whose contents are the link target
            return cls.from_symlink(path=path, mode=mode)

        if not stat.S_ISREG(mode):
            # Not a regular file (fifo, device, ...): return the empty file
            return cls.from_bytes(mode=mode, data=b'')

        length = file_stat.st_size

        if data:
            # Collect the chunks as they get hashed, so the file is only
            # read once.
            chunks = []
            with open(path, 'rb') as fobj:
                entry = hashutil.hash_file(fobj, length=length,
                                           chunk_cb=chunks.append)
            entry['data'] = b''.join(chunks)
        else:
            entry = hashutil.hash_path(path)

        entry['perms'] = mode_to_perms(mode)
        entry['length'] = length

        return cls(entry)

    def __repr__(self):
        return 'Content(id=%s)' % id_to_str(self.hash)

    def compute_hash(self):
        # Contents are keyed by their sha1_git hash.
        return self.data['sha1_git']
+
+
def accept_all_directories(dirname, entries):
    """Default filter for :func:`Directory.from_disk`: accept every directory.

    Args:
        dirname (bytes): directory name (ignored)
        entries (list): directory entries (ignored)
    Returns:
        True, unconditionally
    """
    return True
+
+
def ignore_empty_directories(dirname, entries):
    """Filter for :func:`Directory.from_disk` ignoring empty directories.

    Args:
        dirname (bytes): directory name (ignored)
        entries (list): directory entries
    Returns:
        True if the directory is not empty, false if the directory is empty
    """
    # A directory is kept iff it has at least one entry.
    return len(entries) > 0
+
+
def ignore_named_directories(names, *, case_sensitive=True):
    """Filter for :func:`Directory.from_disk` to ignore directories named one
    of names.

    Args:
        names (list of bytes): names to ignore
        case_sensitive (bool): whether to do the filtering in a case sensitive
            way
    Returns:
        a directory filter for :func:`Directory.from_disk`
    """
    if not case_sensitive:
        names = [name.lower() for name in names]

    # frozenset gives O(1) membership tests instead of scanning the list
    # on every directory.
    ignored = frozenset(names)

    def named_filter(dirname, entries):
        checked = dirname if case_sensitive else dirname.lower()
        return checked not in ignored

    return named_filter
+
+
class Directory(MerkleNode):
    """Representation of a Software Heritage directory as a node in a Merkle
    tree.

    Children (directory entries) are mapped from their (bytes) names to
    :class:`Content` or :class:`Directory` objects. Subscript access accepts
    b'/'-separated paths, which are resolved recursively.
    """
    __slots__ = ['__entries']
    type = 'directory'

    @classmethod
    def from_disk(cls, *, path, data=False,
                  dir_filter=accept_all_directories):
        """Compute the Software Heritage objects for a given directory tree

        Args:
            path (bytes): the directory to traverse
            data (bool): whether to add the data to the content objects
            dir_filter (function): a filter to ignore some directories by
                name or contents. Takes two arguments: dirname and entries, and
                returns True if the directory should be added, False if the
                directory should be ignored.
        """

        top_path = path
        dirs = {}

        # Walk bottom-up (topdown=False) so the children of a directory are
        # already converted (and present in `dirs`) when the directory
        # itself is processed.
        for root, dentries, fentries in os.walk(top_path, topdown=False):
            entries = {}
            # Join fentries and dentries in the same processing, as symbolic
            # links to directories appear in dentries...
            for name in fentries + dentries:
                path = os.path.join(root, name)
                if not os.path.isdir(path) or os.path.islink(path):
                    content = Content.from_file(path=path, data=data)
                    entries[name] = content
                else:
                    if dir_filter(name, dirs[path].entries):
                        entries[name] = dirs[path]

            dirs[root] = cls({'name': os.path.basename(root)})
            dirs[root].update(entries)

        return dirs[top_path]

    def __init__(self, data=None):
        super().__init__(data=data)
        self.__entries = None

    def invalidate_hash(self):
        # The formatted entry list depends on the children's hashes;
        # recompute it lazily next time it's needed.
        self.__entries = None
        super().invalidate_hash()

    @staticmethod
    def child_to_directory_entry(name, child):
        """Format a child node as a Software Heritage directory entry dict.

        Raises:
            ValueError: if `child` is neither a Directory nor a Content.
        """
        if isinstance(child, Directory):
            return {
                'type': 'dir',
                'perms': DentryPerms.directory,
                'target': child.hash,
                'name': name,
            }
        elif isinstance(child, Content):
            return {
                'type': 'file',
                'perms': child.data['perms'],
                'target': child.hash,
                'name': name,
            }
        else:
            raise ValueError('unknown child')

    def get_data(self, **kwargs):
        return {
            'id': self.hash,
            'entries': self.entries,
        }

    @property
    def entries(self):
        # Cached list of formatted directory entries; reset by
        # invalidate_hash.
        if self.__entries is None:
            self.__entries = [
                self.child_to_directory_entry(name, child)
                for name, child in self.items()
            ]

        return self.__entries

    def compute_hash(self):
        return id_to_bytes(directory_identifier({'entries': self.entries}))

    def __getitem__(self, key):
        if not isinstance(key, bytes):
            raise ValueError('Can only get a bytes from directory')
        if b'/' not in key:
            return super().__getitem__(key)
        else:
            # Resolve the first path component, then recurse on the rest.
            key1, key2 = key.split(b'/', 1)
            return super().__getitem__(key1)[key2]

    def __setitem__(self, key, value):
        if not isinstance(key, bytes):
            raise ValueError('Can only set a bytes directory entry')
        if not isinstance(value, (Content, Directory)):
            raise ValueError('Can only set a directory entry to a Content or '
                             'Directory')

        if b'/' not in key:
            return super().__setitem__(key, value)
        else:
            key1, key2 = key.rsplit(b'/', 1)
            # Resolve the parent directory through self[key1] (which handles
            # nested paths), then set the final component. The previous code
            # called the nonexistent `add_child` method, which raised
            # AttributeError on any slash-containing key.
            self[key1][key2] = value

    def __delitem__(self, key):
        if not isinstance(key, bytes):
            raise ValueError('Can only delete a bytes directory entry')

        if b'/' not in key:
            super().__delitem__(key)
        else:
            key1, key2 = key.rsplit(b'/', 1)
            # Resolve the (possibly nested) parent through self[key1]: the
            # previous super().__getitem__(key1) only looked at direct
            # children and raised KeyError for keys containing a slash.
            del self[key1][key2]

    def __repr__(self):
        return 'Directory(%s, id=%s)' % (
            self.data['name'],
            id_to_str(self.hash) if self.hash else '?',
        )
diff --git a/swh/model/merkle.py b/swh/model/merkle.py
new file mode 100644
--- /dev/null
+++ b/swh/model/merkle.py
@@ -0,0 +1,272 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Merkle tree data structure"""
+
+import abc
+import collections
+
+
def deep_update(left, right):
    """Recursively update the left mapping with deeply nested values from the right
    mapping.

    This function is useful to merge the results of several calls to
    :func:`MerkleNode.collect`.

    Arguments:
        left: a mapping (modified by the update operation)
        right: a mapping

    Returns:
        the left mapping, updated with nested values from the right mapping

    Example:
        >>> a = {
        ...     'key1': {
        ...         'key2': {
        ...             'key3': 'value1/2/3',
        ...         },
        ...     },
        ... }
        >>> deep_update(a, {
        ...     'key1': {
        ...         'key2': {
        ...             'key4': 'value1/2/4',
        ...         },
        ...     },
        ... })
        {'key1': {'key2': {'key3': 'value1/2/3', 'key4': 'value1/2/4'}}}
        >>> deep_update(a, {
        ...     'key1': {
        ...         'key2': {
        ...             'key3': 'newvalue1/2/3',
        ...         },
        ...     },
        ... })
        {'key1': {'key2': {'key3': 'newvalue1/2/3', 'key4': 'value1/2/4'}}}

    """
    for key, rvalue in right.items():
        # Use collections.abc.Mapping: the collections.Mapping alias was
        # deprecated since Python 3.3 and removed in Python 3.10.
        if isinstance(rvalue, collections.abc.Mapping):
            new_lvalue = deep_update(left.get(key, {}), rvalue)
            left[key] = new_lvalue
        else:
            left[key] = rvalue
    return left
+
+
class MerkleNode(dict, metaclass=abc.ABCMeta):
    """Representation of a node in a Merkle Tree.

    A (generalized) `Merkle Tree`_ is a tree in which every node is labeled
    with a hash of its own data and the hash of its children.

    .. _Merkle Tree: https://en.wikipedia.org/wiki/Merkle_tree

    In pseudocode::

        node.hash = hash(node.data
                         + sum(child.hash for child in node.children))

    This class efficiently implements the Merkle Tree data structure on top of
    a Python :class:`dict`, minimizing hash computations and new data
    collections when updating nodes.

    Node data is stored in the :attr:`data` attribute, while (named) children
    are stored as items of the underlying dictionary.

    Addition, update and removal of objects are instrumented to automatically
    invalidate the hashes of the current node as well as its registered
    parents; It also resets the collection status of the objects so the updated
    objects can be collected.

    The collection of updated data from the tree is implemented through the
    :func:`collect` function and associated helpers.

    Attributes:
        data (dict): data associated to the current node
        parents (list): known parents of the current node
        collected (bool): whether the current node has been collected

    """
    __slots__ = ['parents', 'data', '__hash', 'collected']

    type = None
    """Type of the current node (used as a classifier for :func:`collect`)"""

    def __init__(self, data=None):
        super().__init__()
        self.parents = []
        self.data = data
        # Cached hash; None means "needs recomputation". Name-mangled to
        # _MerkleNode__hash so subclasses cannot clobber it accidentally.
        self.__hash = None
        self.collected = False

    def invalidate_hash(self):
        """Invalidate the cached hash of the current node."""
        # If the hash is already invalid, the parents have already been
        # invalidated as well: stop walking up the tree.
        if not self.__hash:
            return

        self.__hash = None
        # An invalidated node has (potentially) new data and must be
        # collectable again.
        self.collected = False
        for parent in self.parents:
            parent.invalidate_hash()

    def update_hash(self, *, force=False):
        """Recursively compute the hash of the current node.

        Args:
            force (bool): invalidate the cache and force the computation for
                this node and all children.
        """
        if self.__hash and not force:
            return self.__hash

        if force:
            # Also invalidates all parents, so they get recomputed on their
            # next hash access.
            self.invalidate_hash()

        # Children hashes must be up to date before computing our own.
        for child in self.values():
            child.update_hash(force=force)

        self.__hash = self.compute_hash()
        return self.__hash

    @property
    def hash(self):
        """The hash of the current node, as calculated by
        :func:`compute_hash`.
        """
        return self.update_hash()

    @abc.abstractmethod
    def compute_hash(self):
        """Compute the hash of the current node.

        The hash should depend on the data of the node, as well as on hashes
        of the children nodes.
        """
        raise NotImplementedError('Must implement compute_hash method')

    def __setitem__(self, name, new_child):
        """Add a child, invalidating the current hash"""
        self.invalidate_hash()

        # NOTE(review): when `name` already maps to a child, that old child
        # keeps `self` in its parents list (unlike :func:`update`, which
        # removes it) — confirm this asymmetry is intended.
        super().__setitem__(name, new_child)

        new_child.parents.append(self)

    def __delitem__(self, name):
        """Remove a child, invalidating the current hash"""
        if name in self:
            self.invalidate_hash()
            # Detach ourselves from the removed child before dropping it.
            self[name].parents.remove(self)
            super().__delitem__(name)
        else:
            raise KeyError(name)

    def update(self, new_children):
        """Add several named children from a dictionary"""
        # Empty update: nothing changes, keep the cached hash valid.
        if not new_children:
            return

        self.invalidate_hash()

        # Reparent before the dict update: register self on every incoming
        # child, and detach self from any child being replaced.
        for name, new_child in new_children.items():
            new_child.parents.append(self)
            if name in self:
                self[name].parents.remove(self)

        super().update(new_children)

    def get_data(self, **kwargs):
        """Retrieve and format the collected data for the current node, for use by
        :func:`collect`.

        Can be overridden, for instance when you want the collected data to
        contain information about the child nodes.

        Arguments:
            kwargs: allow subclasses to alter behaviour depending on how
                :func:`collect` is called.

        Returns:
            data formatted for :func:`collect`
        """
        return self.data

    def collect_node(self, **kwargs):
        """Collect the data for the current node, for use by :func:`collect`.

        Arguments:
            kwargs: passed as-is to :func:`get_data`.

        Returns:
            A :class:`dict` compatible with :func:`collect`.
        """
        # Each node is collected at most once until reset_collect (or an
        # invalidation) clears the flag again.
        if not self.collected:
            self.collected = True
            return {self.type: {self.hash: self.get_data(**kwargs)}}
        else:
            return {}

    def collect(self, **kwargs):
        """Collect the data for all nodes in the subtree rooted at `self`.

        The data is deduplicated by type and by hash.

        Arguments:
            kwargs: passed as-is to :func:`get_data`.

        Returns:
            A :class:`dict` with the following structure::

                {
                  'typeA': {
                    node1.hash: node1.get_data(),
                    node2.hash: node2.get_data(),
                  },
                  'typeB': {
                    node3.hash: node3.get_data(),
                    ...
                  },
                  ...
                }
        """
        ret = self.collect_node(**kwargs)
        # Merge the children's nested {type: {hash: data}} dicts into ours.
        for child in self.values():
            deep_update(ret, child.collect(**kwargs))

        return ret

    def reset_collect(self):
        """Recursively unmark collected nodes in the subtree rooted at `self`.

        This lets the caller use :func:`collect` again.
        """
        self.collected = False

        for child in self.values():
            child.reset_collect()
+
+
class MerkleLeaf(MerkleNode):
    """A leaf to a Merkle tree.

    A Merkle leaf is simply a Merkle node with children disabled.
    """
    __slots__ = []

    def __reject(self):
        # Shared failure path for every child-manipulation operation.
        raise ValueError('%s is a leaf' % self.__class__.__name__)

    def __setitem__(self, name, child):
        self.__reject()

    def __getitem__(self, name):
        self.__reject()

    def __delitem__(self, name):
        self.__reject()

    def update(self, new_children):
        """Children update operation. Disabled for leaves."""
        self.__reject()
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
new file mode 100644
--- /dev/null
+++ b/swh/model/tests/test_from_disk.py
@@ -0,0 +1,268 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import tempfile
+import unittest
+
+from swh.model import from_disk
+from swh.model.from_disk import Content, Directory
+from swh.model.hashutil import hash_to_bytes
+
+
class ModeToPerms(unittest.TestCase):
    """Exhaustively check mode_to_perms over every possible st_mode value."""

    def setUp(self):
        super().setUp()

        perms = from_disk.DentryPerms

        # Build the full expected mapping from raw st_mode values to perms
        self.perms_map = {}

        # All symlink modes
        for mode in range(0o120000, 0o130000):
            self.perms_map[mode] = perms.symlink

        # All directory modes
        for mode in range(0o040000, 0o050000):
            self.perms_map[mode] = perms.directory

        # Socket, regular file, block device, character device and fifo
        # modes all map to (possibly executable) regular content
        for file_type in (0o140000, 0o100000, 0o060000, 0o020000, 0o010000):
            for mode in range(file_type, file_type + 0o10000):
                if mode & 0o111:
                    # at least one executable bit is set
                    self.perms_map[mode] = perms.executable_content
                else:
                    self.perms_map[mode] = perms.content

    def test_exhaustive_mode_to_perms(self):
        for fmode, perm in self.perms_map.items():
            self.assertEqual(perm, from_disk.mode_to_perms(fmode))
+
+
class DataMixin:
    """Shared fixtures: a temporary directory and expected conversion results
    for regular files, symbolic links and special files."""

    maxDiff = None

    def setUp(self):
        self.tmpdir = tempfile.TemporaryDirectory(
            prefix=b'swh.model.from_disk'
        )
        # Regular files, keyed by name, with their expected hashes and mode
        self.contents = {
            b'file': {
                'data': b'42\n',
                'sha1': hash_to_bytes(
                    '34973274ccef6ab4dfaaf86599792fa9c3fe4689'
                ),
                'sha256': hash_to_bytes(
                    '084c799cd551dd1d8d5c5f9a5d593b2e'
                    '931f5e36122ee5c793c1d08a19839cc0'
                ),
                'sha1_git': hash_to_bytes(
                    'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'),
                'blake2s256': hash_to_bytes(
                    'd5fe1939576527e42cfd76a9455a2432'
                    'fe7f56669564577dd93c4280e76d661d'
                ),
                'length': 3,
                'mode': 0o100644
            },
        }

        # Symbolic links, keyed by name; 'data' is the link target
        self.symlinks = {
            b'symlink': {
                'data': b'target',
                'blake2s256': hash_to_bytes(
                    '595d221b30fdd8e10e2fdf18376e688e'
                    '9f18d56fd9b6d1eb6a822f8c146c6da6'
                ),
                'sha1': hash_to_bytes(
                    '0e8a3ad980ec179856012b7eecf4327e99cd44cd'
                ),
                'sha1_git': hash_to_bytes(
                    '1de565933b05f74c75ff9a6520af5f9f8a5a2f1d'
                ),
                'sha256': hash_to_bytes(
                    '34a04005bcaf206eec990bd9637d9fdb'
                    '6725e0a0c0d4aebf003f17f4c956eb5c'
                ),
                'length': 6,
            }
        }

        # Special files, mapped to the function creating them.
        # NOTE(review): os.mknod for a device node usually requires
        # privileges — confirm the test environment allows it.
        self.specials = {
            b'fifo': os.mkfifo,
            b'devnull': lambda path: os.mknod(path, device=os.makedev(1, 3)),
        }

        # Expected conversion result for the empty content (special files
        # are converted to it). The duplicate 'length' key present in the
        # original dict literal has been removed.
        self.empty_content = {
            'data': b'',
            'length': 0,
            'blake2s256': hash_to_bytes(
                '69217a3079908094e11121d042354a7c'
                '1f55b6482ca1a51e1b250dfd1ed0eef9'
            ),
            'sha1': hash_to_bytes(
                'da39a3ee5e6b4b0d3255bfef95601890afd80709'
            ),
            'sha1_git': hash_to_bytes(
                'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
            ),
            'sha256': hash_to_bytes(
                'e3b0c44298fc1c149afbf4c8996fb924'
                '27ae41e4649b934ca495991b7852b855'
            ),
        }

    def tearDown(self):
        self.tmpdir.cleanup()

    def make_contents(self, directory):
        """Materialize self.contents as real files under `directory`."""
        for filename, content in self.contents.items():
            path = os.path.join(directory, filename)
            with open(path, 'wb') as f:
                f.write(content['data'])
            os.chmod(path, content['mode'])

    def make_symlinks(self, directory):
        """Materialize self.symlinks as symbolic links under `directory`."""
        for filename, symlink in self.symlinks.items():
            path = os.path.join(directory, filename)
            os.symlink(symlink['data'], path)

    def make_specials(self, directory):
        """Materialize self.specials (fifo, device node) under `directory`."""
        for filename, fn in self.specials.items():
            path = os.path.join(directory, filename)
            fn(path)
+
+
class TestContent(DataMixin, unittest.TestCase):
    def setUp(self):
        super().setUp()

    def test_data_to_content(self):
        for filename, expected in self.contents.items():
            mode = expected.pop('mode')
            converted = Content.from_bytes(mode=mode, data=expected['data'])
            entry = converted.data.copy()
            # The perms entry is derived from the mode; check it separately
            self.assertEqual(entry.pop('perms'),
                             from_disk.mode_to_perms(mode))
            self.assertEqual(entry, expected)
+
+
class SymlinkToContent(DataMixin, unittest.TestCase):
    def setUp(self):
        super().setUp()
        self.make_symlinks(self.tmpdir.name)

    def test_symlink_to_content(self):
        mode = 0o120000
        for filename, expected in self.symlinks.items():
            link_path = os.path.join(self.tmpdir.name, filename)
            entry = Content.from_symlink(path=link_path, mode=mode).data.copy()
            self.assertEqual(entry.pop('perms'), mode)
            self.assertEqual(entry, expected)
+
+
class FileToContent(DataMixin, unittest.TestCase):
    def setUp(self):
        super().setUp()
        self.make_contents(self.tmpdir.name)
        self.make_symlinks(self.tmpdir.name)
        self.make_specials(self.tmpdir.name)

    def test_file_to_content(self):
        # Exercise both the "hashes only" and the "hashes + data" paths
        for with_data in (False, True):
            self._check_symlinks(with_data)
            self._check_contents(with_data)
            self._check_specials(with_data)

    def _check_symlinks(self, with_data):
        # Symbolic links always carry their target as data
        for filename, expected in self.symlinks.items():
            path = os.path.join(self.tmpdir.name, filename)
            entry = Content.from_file(path=path, data=with_data).data.copy()
            self.assertEqual(entry.pop('perms'), 0o120000)
            if not with_data:
                entry['data'] = expected['data']
            self.assertEqual(entry, expected)

    def _check_contents(self, with_data):
        # Regular files; the executable bit switches the expected perms
        for filename, expected in self.contents.items():
            expected = expected.copy()
            path = os.path.join(self.tmpdir.name, filename)
            perms = 0o100755 if expected.pop('mode') & 0o111 else 0o100644
            entry = Content.from_file(path=path, data=with_data).data.copy()
            self.assertEqual(entry.pop('perms'), perms)
            if not with_data:
                entry['data'] = expected['data']
            self.assertEqual(entry, expected)

    def _check_specials(self, with_data):
        # Special files (fifo, device) convert to the empty content
        for filename in self.specials:
            path = os.path.join(self.tmpdir.name, filename)
            entry = Content.from_file(path=path, data=with_data).data.copy()
            self.assertEqual(entry.pop('perms'), 0o100644)
            if not with_data:
                entry['data'] = b''
            self.assertEqual(entry, self.empty_content)
+
+
class DirectoryToObjects(DataMixin, unittest.TestCase):
    def setUp(self):
        super().setUp()

        def subdir(name):
            # Create (and return) a fresh subdirectory of the temp dir
            path = os.path.join(self.tmpdir.name, name)
            os.mkdir(path)
            return path

        self.make_contents(subdir(b'contents'))
        self.make_symlinks(subdir(b'symlinks'))
        self.make_specials(subdir(b'specials'))
        os.makedirs(os.path.join(self.tmpdir.name, b'empty1', b'empty2'))

    def test_directory_to_objects(self):
        objs = Directory.from_disk(path=self.tmpdir.name).collect()

        self.assertIn('content', objs)
        self.assertIn('directory', objs)

        # top + contents + symlinks + specials + empty1 + empty2
        self.assertEqual(len(objs['directory']), 6)
        # one content per file and symlink, plus the single deduplicated
        # empty content standing in for both special files
        self.assertEqual(len(objs['content']),
                         len(self.contents) + len(self.symlinks) + 1)

    def test_directory_to_objects_ignore_empty(self):
        objs = Directory.from_disk(
            path=self.tmpdir.name,
            dir_filter=from_disk.ignore_empty_directories
        ).collect()

        self.assertIn('content', objs)
        self.assertIn('directory', objs)

        # empty1/empty2 are dropped by the filter
        self.assertEqual(len(objs['directory']), 4)
        self.assertEqual(len(objs['content']),
                         len(self.contents) + len(self.symlinks) + 1)

    def test_directory_to_objects_ignore_name(self):
        objs = Directory.from_disk(
            path=self.tmpdir.name,
            dir_filter=from_disk.ignore_named_directories([b'symlinks'])
        ).collect()

        self.assertIn('content', objs)
        self.assertIn('directory', objs)

        # the symlinks directory is dropped by name
        self.assertEqual(len(objs['directory']), 5)
        self.assertEqual(len(objs['content']),
                         len(self.contents) + 1)
diff --git a/swh/model/tests/test_merkle.py b/swh/model/tests/test_merkle.py
new file mode 100644
--- /dev/null
+++ b/swh/model/tests/test_merkle.py
@@ -0,0 +1,229 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+
+from swh.model import merkle
+
+
class TestedMerkleNode(merkle.MerkleNode):
    """Concrete MerkleNode counting compute_hash invocations."""

    type = 'tested_merkle_node_type'

    def __init__(self, data):
        super().__init__(data)
        self.compute_hash_called = 0

    def compute_hash(self):
        self.compute_hash_called += 1
        # Children contribute as "name=hash", in sorted name order
        child_parts = [
            name + b'=' + self[name].hash
            for name in sorted(self)
        ]
        return (b'hash('
                + b', '.join([self.data['value']] + child_parts)
                + b')')
+
+
class TestedMerkleLeaf(merkle.MerkleLeaf):
    """Concrete MerkleLeaf counting compute_hash invocations."""

    type = 'tested_merkle_leaf_type'

    def __init__(self, data):
        super().__init__(data)
        self.compute_hash_called = 0

    def compute_hash(self):
        self.compute_hash_called += 1
        return b'hash(' + self.data['value'] + b')'
+
+
class TestMerkleLeaf(unittest.TestCase):
    def setUp(self):
        self.data = {'value': b'value'}
        self.instance = TestedMerkleLeaf(self.data)

    def test_hash(self):
        # The hash is computed lazily, exactly once, then cached
        self.assertEqual(self.instance.compute_hash_called, 0)
        first_hash = self.instance.hash
        self.assertEqual(self.instance.compute_hash_called, 1)
        self.assertEqual(self.instance.hash, first_hash)
        self.assertEqual(self.instance.compute_hash_called, 1)

    def test_data(self):
        self.assertEqual(self.instance.get_data(), self.data)

    def test_collect(self):
        expected = {
            self.instance.type: {
                self.instance.hash: self.instance.get_data(),
            },
        }
        self.assertEqual(self.instance.collect(), expected)
        # A second collect is a no-op until the flag is reset
        self.assertEqual(self.instance.collect(), {})
        self.instance.reset_collect()
        self.assertEqual(self.instance.collect(), expected)

    def test_leaf(self):
        # Every child-manipulation operation must be rejected on a leaf
        with self.assertRaisesRegex(ValueError, 'is a leaf'):
            self.instance[b'key1'] = 'Test'

        with self.assertRaisesRegex(ValueError, 'is a leaf'):
            del self.instance[b'key1']

        with self.assertRaisesRegex(ValueError, 'is a leaf'):
            self.instance[b'key1']

        with self.assertRaisesRegex(ValueError, 'is a leaf'):
            self.instance.update(self.data)
+
+
class TestMerkleNode(unittest.TestCase):
    maxDiff = None

    def setUp(self):
        # Build a full ternary tree three levels deep: root, root/x,
        # root/x/y and root/x/y/z for x, y, z in {a, b, c}.
        self.root = TestedMerkleNode({'value': b'root'})
        self.nodes = {b'root': self.root}
        for i in (b'a', b'b', b'c'):
            value = b'root/' + i
            node = TestedMerkleNode({
                'value': value,
            })
            self.root[i] = node
            self.nodes[value] = node
            for j in (b'a', b'b', b'c'):
                value2 = value + b'/' + j
                node2 = TestedMerkleNode({
                    'value': value2,
                })
                node[j] = node2
                self.nodes[value2] = node2
                for k in (b'a', b'b', b'c'):
                    # Fixed: the original used `j` instead of `k` here,
                    # overwriting the same child three times and never
                    # building the third tree level.
                    value3 = value2 + b'/' + k
                    node3 = TestedMerkleNode({
                        'value': value3,
                    })
                    node2[k] = node3
                    self.nodes[value3] = node3

    def test_hash(self):
        for node in self.nodes.values():
            self.assertEqual(node.compute_hash_called, 0)

        # Root hash will compute hash for all the nodes
        # (renamed from `hash`, which shadowed the builtin)
        root_hash = self.root.hash
        for node in self.nodes.values():
            self.assertEqual(node.compute_hash_called, 1)
            self.assertIn(node.data['value'], root_hash)

        # Should use the cached value
        root_hash2 = self.root.hash
        self.assertEqual(root_hash, root_hash2)
        for node in self.nodes.values():
            self.assertEqual(node.compute_hash_called, 1)

        # Should still use the cached value
        root_hash3 = self.root.update_hash(force=False)
        self.assertEqual(root_hash, root_hash3)
        for node in self.nodes.values():
            self.assertEqual(node.compute_hash_called, 1)

        # Force update of the cached value for a deeply nested node
        self.root[b'a'][b'b'].update_hash(force=True)
        for key, node in self.nodes.items():
            # update_hash rehashes all children
            if key.startswith(b'root/a/b'):
                self.assertEqual(node.compute_hash_called, 2)
            else:
                self.assertEqual(node.compute_hash_called, 1)

        root_hash4 = self.root.hash
        self.assertEqual(root_hash, root_hash4)
        for key, node in self.nodes.items():
            # update_hash also invalidates all parents
            if key in (b'root', b'root/a') or key.startswith(b'root/a/b'):
                self.assertEqual(node.compute_hash_called, 2)
            else:
                self.assertEqual(node.compute_hash_called, 1)

    def test_collect(self):
        collected = self.root.collect()
        self.assertEqual(len(collected[self.root.type]), len(self.nodes))
        for node in self.nodes.values():
            self.assertTrue(node.collected)
        collected2 = self.root.collect()
        self.assertEqual(collected2, {})

    def test_get(self):
        for key in (b'a', b'b', b'c'):
            self.assertEqual(self.root[key], self.nodes[b'root/' + key])

        with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
            self.root[b'nonexistent']

    def test_del(self):
        hash_root = self.root.hash
        hash_a = self.nodes[b'root/a'].hash
        del self.root[b'a'][b'c']
        hash_root2 = self.root.hash
        hash_a2 = self.nodes[b'root/a'].hash

        # Removing a child invalidates the node and all its parents
        self.assertNotEqual(hash_root, hash_root2)
        self.assertNotEqual(hash_a, hash_a2)

        self.assertEqual(self.nodes[b'root/a/c'].parents, [])

        with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
            del self.root[b'nonexistent']

    def test_update(self):
        hash_root = self.root.hash
        hash_b = self.root[b'b'].hash
        new_children = {
            b'c': TestedMerkleNode({'value': b'root/b/new_c'}),
            b'd': TestedMerkleNode({'value': b'root/b/d'}),
        }

        # collect all nodes
        self.root.collect()

        self.root[b'b'].update(new_children)

        # Ensure everyone got reparented
        self.assertEqual(new_children[b'c'].parents, [self.root[b'b']])
        self.assertEqual(new_children[b'd'].parents, [self.root[b'b']])
        self.assertEqual(self.nodes[b'root/b/c'].parents, [])

        hash_root2 = self.root.hash
        self.assertNotEqual(hash_root, hash_root2)
        self.assertIn(b'root/b/new_c', hash_root2)
        self.assertIn(b'root/b/d', hash_root2)

        hash_b2 = self.root[b'b'].hash
        self.assertNotEqual(hash_b, hash_b2)

        for key, node in self.nodes.items():
            if key in (b'root', b'root/b'):
                self.assertEqual(node.compute_hash_called, 2)
            else:
                self.assertEqual(node.compute_hash_called, 1)

        # Ensure we collected root, root/b, and both new children
        collected_after_update = self.root.collect()
        self.assertCountEqual(
            collected_after_update[TestedMerkleNode.type],
            [self.nodes[b'root'].hash, self.nodes[b'root/b'].hash,
             new_children[b'c'].hash, new_children[b'd'].hash],
        )

        # test that noop updates doesn't invalidate anything
        self.root[b'a'][b'b'].update({})
        self.assertEqual(self.root.collect(), {})
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 1:44 PM (4 d, 20 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222294
Attached To
D248: from_disk: convert on-disk data to Software Heritage archive objects
Event Timeline
Log In to Comment