Changeset View
Changeset View
Standalone View
Standalone View
swh/model/merkle.py
# Copyright (C) 2017-2020 The Software Heritage developers | # Copyright (C) 2017-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Merkle tree data structure""" | """Merkle tree data structure""" | ||||
import abc | from __future__ import annotations | ||||
from collections.abc import Mapping | |||||
from typing import Dict, Iterator, List, Set | |||||
def deep_update(left, right): | |||||
"""Recursively update the left mapping with deeply nested values from the right | |||||
mapping. | |||||
This function is useful to merge the results of several calls to | |||||
:func:`MerkleNode.collect`. | |||||
Arguments: | |||||
left: a mapping (modified by the update operation) | |||||
right: a mapping | |||||
Returns: | import abc | ||||
the left mapping, updated with nested values from the right mapping | from typing import Any, Dict, Iterator, List, Set | ||||
Example: | |||||
>>> a = { | |||||
... 'key1': { | |||||
... 'key2': { | |||||
... 'key3': 'value1/2/3', | |||||
... }, | |||||
... }, | |||||
... } | |||||
>>> deep_update(a, { | |||||
... 'key1': { | |||||
... 'key2': { | |||||
... 'key4': 'value1/2/4', | |||||
... }, | |||||
... }, | |||||
... }) == { | |||||
... 'key1': { | |||||
... 'key2': { | |||||
... 'key3': 'value1/2/3', | |||||
... 'key4': 'value1/2/4', | |||||
... }, | |||||
... }, | |||||
... } | |||||
True | |||||
>>> deep_update(a, { | |||||
... 'key1': { | |||||
... 'key2': { | |||||
... 'key3': 'newvalue1/2/3', | |||||
... }, | |||||
... }, | |||||
... }) == { | |||||
... 'key1': { | |||||
... 'key2': { | |||||
... 'key3': 'newvalue1/2/3', | |||||
... 'key4': 'value1/2/4', | |||||
... }, | |||||
... }, | |||||
... } | |||||
True | |||||
""" | |||||
for key, rvalue in right.items(): | |||||
if isinstance(rvalue, Mapping): | |||||
new_lvalue = deep_update(left.get(key, {}), rvalue) | |||||
left[key] = new_lvalue | |||||
else: | |||||
left[key] = rvalue | |||||
return left | |||||
class MerkleNode(dict, metaclass=abc.ABCMeta): | class MerkleNode(dict, metaclass=abc.ABCMeta): | ||||
"""Representation of a node in a Merkle Tree. | """Representation of a node in a Merkle Tree. | ||||
A (generalized) `Merkle Tree`_ is a tree in which every node is labeled | A (generalized) `Merkle Tree`_ is a tree in which every node is labeled | ||||
with a hash of its own data and the hash of its children. | with a hash of its own data and the hash of its children. | ||||
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | def invalidate_hash(self): | ||||
if not self.__hash: | if not self.__hash: | ||||
return | return | ||||
self.__hash = None | self.__hash = None | ||||
self.collected = False | self.collected = False | ||||
for parent in self.parents: | for parent in self.parents: | ||||
parent.invalidate_hash() | parent.invalidate_hash() | ||||
def update_hash(self, *, force=False): | def update_hash(self, *, force=False) -> Any: | ||||
"""Recursively compute the hash of the current node. | """Recursively compute the hash of the current node. | ||||
Args: | Args: | ||||
force (bool): invalidate the cache and force the computation for | force (bool): invalidate the cache and force the computation for | ||||
this node and all children. | this node and all children. | ||||
""" | """ | ||||
if self.__hash and not force: | if self.__hash and not force: | ||||
return self.__hash | return self.__hash | ||||
if force: | if force: | ||||
self.invalidate_hash() | self.invalidate_hash() | ||||
for child in self.values(): | for child in self.values(): | ||||
child.update_hash(force=force) | child.update_hash(force=force) | ||||
self.__hash = self.compute_hash() | self.__hash = self.compute_hash() | ||||
return self.__hash | return self.__hash | ||||
@property | @property | ||||
def hash(self): | def hash(self) -> Any: | ||||
"""The hash of the current node, as calculated by | """The hash of the current node, as calculated by | ||||
:func:`compute_hash`. | :func:`compute_hash`. | ||||
""" | """ | ||||
return self.update_hash() | return self.update_hash() | ||||
def __hash__(self): | |||||
olasd: That way we avoid giving all subclasses a new method.
For the object to be properly hashable… | |||||
Done Inline Actions
Ah right, I forgot about that builtin.
There is already an __eq__ method implemented as follow: def __eq__(self, other): return ( isinstance(other, MerkleNode) and super().__eq__(other) and self.data == other.data ) But comparing hash values is faster that comparing nodes data so better update it. anlambert: > That way we avoid giving all subclasses a new method.
Ah right, I forgot about that builtin. | |||||
Not Done Inline ActionsIt's also prone to collisions, because it's a sha1_git. vlorentz: It's also prone to collisions, because it's a `sha1_git`. | |||||
Done Inline ActionsAh good point, better keeping the current implementation then. anlambert: Ah good point, better keeping the current implementation then. | |||||
return hash(self.hash) | |||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def compute_hash(self): | def compute_hash(self) -> Any: | ||||
"""Compute the hash of the current node. | """Compute the hash of the current node. | ||||
The hash should depend on the data of the node, as well as on hashes | The hash should depend on the data of the node, as well as on hashes | ||||
of the children nodes. | of the children nodes. | ||||
""" | """ | ||||
raise NotImplementedError("Must implement compute_hash method") | raise NotImplementedError("Must implement compute_hash method") | ||||
def __setitem__(self, name, new_child): | def __setitem__(self, name, new_child): | ||||
Show All 38 Lines | def get_data(self, **kwargs): | ||||
kwargs: allow subclasses to alter behaviour depending on how | kwargs: allow subclasses to alter behaviour depending on how | ||||
:func:`collect` is called. | :func:`collect` is called. | ||||
Returns: | Returns: | ||||
data formatted for :func:`collect` | data formatted for :func:`collect` | ||||
""" | """ | ||||
return self.data | return self.data | ||||
def collect_node(self, **kwargs): | def collect_node(self) -> Set[MerkleNode]: | ||||
"""Collect the data for the current node, for use by :func:`collect`. | """Collect the current node if it has not been yet, for use by :func:`collect`.""" | ||||
Arguments: | |||||
kwargs: passed as-is to :func:`get_data`. | |||||
Returns: | |||||
A :class:`dict` compatible with :func:`collect`. | |||||
""" | |||||
if not self.collected: | if not self.collected: | ||||
self.collected = True | self.collected = True | ||||
return {self.object_type: {self.hash: self.get_data(**kwargs)}} | return {self} | ||||
else: | else: | ||||
return {} | return set() | ||||
def collect(self, **kwargs): | |||||
"""Collect the data for all nodes in the subtree rooted at `self`. | |||||
The data is deduplicated by type and by hash. | def collect(self) -> Set[MerkleNode]: | ||||
"""Collect the added and modified nodes in the subtree rooted at `self` | |||||
Arguments: | since the last collect operation. | ||||
kwargs: passed as-is to :func:`get_data`. | |||||
Returns: | Returns: | ||||
A :class:`dict` with the following structure:: | A :class:`set` of collected nodes | ||||
{ | |||||
'typeA': { | |||||
node1.hash: node1.get_data(), | |||||
node2.hash: node2.get_data(), | |||||
}, | |||||
'typeB': { | |||||
node3.hash: node3.get_data(), | |||||
... | |||||
}, | |||||
... | |||||
} | |||||
""" | """ | ||||
ret = self.collect_node(**kwargs) | ret = self.collect_node() | ||||
for child in self.values(): | for child in self.values(): | ||||
deep_update(ret, child.collect(**kwargs)) | ret.update(child.collect()) | ||||
return ret | return ret | ||||
def reset_collect(self): | def reset_collect(self): | ||||
"""Recursively unmark collected nodes in the subtree rooted at `self`. | """Recursively unmark collected nodes in the subtree rooted at `self`. | ||||
This lets the caller use :func:`collect` again. | This lets the caller use :func:`collect` again. | ||||
""" | """ | ||||
self.collected = False | self.collected = False | ||||
for child in self.values(): | for child in self.values(): | ||||
child.reset_collect() | child.reset_collect() | ||||
def iter_tree(self, dedup=True) -> Iterator["MerkleNode"]: | def iter_tree(self, dedup=True) -> Iterator[MerkleNode]: | ||||
"""Yields all children nodes, recursively. Common nodes are deduplicated | """Yields all children nodes, recursively. Common nodes are deduplicated | ||||
by default (deduplication can be turned off setting the given argument | by default (deduplication can be turned off setting the given argument | ||||
'dedup' to False). | 'dedup' to False). | ||||
""" | """ | ||||
yield from self._iter_tree(set(), dedup) | yield from self._iter_tree(set(), dedup) | ||||
def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator["MerkleNode"]: | def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator[MerkleNode]: | ||||
if self.hash not in seen: | if self.hash not in seen: | ||||
if dedup: | if dedup: | ||||
seen.add(self.hash) | seen.add(self.hash) | ||||
yield self | yield self | ||||
for child in self.values(): | for child in self.values(): | ||||
yield from child._iter_tree(seen=seen, dedup=dedup) | yield from child._iter_tree(seen=seen, dedup=dedup) | ||||
Show All 20 Lines |
That way we avoid giving all subclasses a new method.
For the object to be properly hashable (in terms of Python) we need to implement __eq__ too.