diff --git a/swh/storage/algos/diff.py b/swh/storage/algos/diff.py
--- a/swh/storage/algos/diff.py
+++ b/swh/storage/algos/diff.py
@@ -14,6 +14,7 @@
 
 import collections
 
+from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import directory_identifier
 
 from .dir_iterators import (
@@ -21,7 +22,7 @@
 )
 
 # get the hash identifier for an empty directory
-_empty_dir_hash = directory_identifier({'entries': []})
+_empty_dir_hash = hash_to_bytes(directory_identifier({'entries': []}))
 
 
 def _get_rev(storage, rev_id):
diff --git a/swh/storage/algos/dir_iterators.py b/swh/storage/algos/dir_iterators.py
--- a/swh/storage/algos/dir_iterators.py
+++ b/swh/storage/algos/dir_iterators.py
@@ -13,10 +13,11 @@
 
 from enum import Enum
 
+from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import directory_identifier
 
 # get the hash identifier for an empty directory
-_empty_dir_hash = directory_identifier({'entries': []})
+_empty_dir_hash = hash_to_bytes(directory_identifier({'entries': []}))
 
 
 def _get_dir(storage, dir_id):
@@ -69,19 +70,14 @@
         Args:
             dir_id (bytes): identifier of a root directory
         """
-        if dir_id:
-            if dir_id == _empty_dir_hash:
-                self.frames.append([])
-            else:
-                # get directory entries
-                dir_data = _get_dir(self.storage, dir_id)
-                # sort them in lexicographical order
-                dir_data = sorted(dir_data, key=lambda e: e['name'])
-                # reverse the ordering in order to unstack the "smallest"
-                # entry each time the iterator advances
-                dir_data.reverse()
-                # push the directory frame to the main stack
-                self.frames.append(dir_data)
+        # get directory entries
+        dir_data = _get_dir(self.storage, dir_id)
+        # sort them in lexicographical order and reverse the ordering
+        # in order to unstack the "smallest" entry each time the
+        # iterator advances
+        dir_data = sorted(dir_data, key=lambda e: e['name'], reverse=True)
+        # push the directory frame to the main stack
+        self.frames.append(dir_data)
 
     def top(self):
         """
@@ -157,7 +153,8 @@
             self.has_started = True
             return current
 
-        if descend and self.current_is_dir():
+        if descend and self.current_is_dir() \
+                and current['target'] != _empty_dir_hash:
             self._push_dir_frame(current['target'])
         else:
             self.drop()
@@ -201,6 +198,44 @@
             self.frames.pop()
             self.drop()
 
+    def __next__(self):
+        """
+        Advance the iteration and return the next directory entry,
+        augmented with its absolute path in a 'path' field.
+        """
+        entry = self.step()
+        if not entry:
+            raise StopIteration
+        entry['path'] = self.current_path()
+        return entry
+
+    def __iter__(self):
+        """
+        Return a fresh iterator over the same root directory, leaving
+        the traversal state of the current instance untouched.
+        """
+        return DirectoryIterator(self.storage, self.root_dir_id,
+                                 self.base_path)
+
+
+def dir_iterator(storage, dir_id):
+    """
+    Return an iterator for recursively visiting a directory and
+    its sub-directories. The associated paths are visited in
+    lexicographic depth-first search order.
+
+    Args:
+        storage (swh.storage.Storage): an instance of a swh storage
+        dir_id (bytes): a directory identifier
+
+    Returns:
+        swh.storage.algos.dir_iterators.DirectoryIterator: an iterator
+            returning a dict at each iteration step describing a directory
+            entry. A 'path' field is added in that dict to store the
+            absolute path of the entry.
+    """
+    return DirectoryIterator(storage, dir_id)
+
 
 class Remaining(Enum):
     """
diff --git a/swh/storage/tests/algos/test_diff.py b/swh/storage/tests/algos/test_diff.py
--- a/swh/storage/tests/algos/test_diff.py
+++ b/swh/storage/tests/algos/test_diff.py
@@ -7,84 +7,15 @@
 
 import unittest
 
-from nose.tools import istest, nottest
 from unittest.mock import patch
 
+from nose.tools import istest, nottest
+
+from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import directory_identifier
 from swh.storage.algos import diff
 
-
-class DirectoryModel(object):
-    """
-    Quick and dirty directory model to ease the writing
-    of revision trees differential tests.
-    """
-    def __init__(self, name=''):
-        self.data = {}
-        self.data['name'] = name
-        self.data['perms'] = 16384
-        self.data['type'] = 'dir'
-        self.data['entries'] = []
-        self.data['entry_idx'] = {}
-
-    def __getitem__(self, item):
-        if item == 'target':
-            return directory_identifier(self)
-        else:
-            return self.data[item]
-
-    def add_file(self, path, sha1=None):
-        path_parts = path.split(b'/')
-        if len(path_parts) == 1:
-            self['entry_idx'][path] = len(self['entries'])
-            self['entries'].append({
-                'target': sha1,
-                'name': path,
-                'perms': 33188,
-                'type': 'file'
-            })
-        else:
-            if not path_parts[0] in self['entry_idx']:
-                self['entry_idx'][path_parts[0]] = len(self['entries'])
-                self['entries'].append(DirectoryModel(path_parts[0]))
-            if path_parts[1]:
-                dir_idx = self['entry_idx'][path_parts[0]]
-                self['entries'][dir_idx].add_file(b'/'.join(path_parts[1:]), sha1)
-
-    def get_hash_data(self, entry_hash):
-        if self['target'] == entry_hash:
-            ret = []
-            for e in self['entries']:
-                ret.append({
-                    'target': e['target'],
-                    'name': e['name'],
-                    'perms': e['perms'],
-                    'type': e['type']
-                })
-            return ret
-        else:
-            for e in self['entries']:
-                if e['type'] == 'file' and e['target'] == entry_hash:
-                    return e
-                elif e['type'] == 'dir':
-                    data = e.get_hash_data(entry_hash)
-                    if data:
-                        return data
-            return None
-
-    def get_path_data(self, path):
-        path_parts = path.split(b'/')
-        entry_idx = self['entry_idx'][path_parts[0]]
-        entry = self['entries'][entry_idx]
-        if len(path_parts) == 1:
-            return {
-                'target': entry['target'],
-                'name': entry['name'],
-                'perms': entry['perms'],
-                'type': entry['type']
-            }
-        else:
-            return entry.get_path_data(b'/'.join(path_parts[1:]))
+from .test_dir_iterator import DirectoryModel
 
 
 @patch('swh.storage.algos.diff._get_rev')
@@ -95,20 +26,27 @@
     def diff_revisions(self, rev_from, rev_to, from_dir_model, to_dir_model,
                        expected_changes, mock_get_dir, mock_get_rev):
 
+        rev_from_bytes = hash_to_bytes(rev_from)
+        rev_to_bytes = hash_to_bytes(rev_to)
+
         def _get_rev(*args, **kwargs):
-            if args[1] == rev_from:
+            if args[1] == rev_from_bytes:
                 return {'directory': from_dir_model['target']}
             else:
                 return {'directory': to_dir_model['target']}
 
         def _get_dir(*args, **kwargs):
-            return from_dir_model.get_hash_data(args[1]) or \
-                to_dir_model.get_hash_data(args[1])
+            from_dir = from_dir_model.get_hash_data(args[1])
+            to_dir = to_dir_model.get_hash_data(args[1])
+            # an empty directory yields [] which is falsy, hence the
+            # explicit None check instead of a plain `or`
+            return from_dir if from_dir is not None else to_dir
 
         mock_get_rev.side_effect = _get_rev
         mock_get_dir.side_effect = _get_dir
 
-        changes = diff.diff_revisions(None, rev_from, rev_to, track_renaming=True)
+        changes = diff.diff_revisions(None, rev_from_bytes, rev_to_bytes,
+                                      track_renaming=True)
 
         self.assertEqual(changes, expected_changes)
 
diff --git a/swh/storage/tests/algos/test_dir_iterator.py b/swh/storage/tests/algos/test_dir_iterator.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/algos/test_dir_iterator.py
@@ -0,0 +1,155 @@
+
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+
+from unittest.mock import patch
+
+from nose.tools import istest, nottest
+
+from swh.model.from_disk import DentryPerms
+from swh.model.hashutil import hash_to_bytes, MultiHash
+from swh.model.identifiers import directory_identifier
+from swh.storage.algos.dir_iterators import dir_iterator
+
+# flake8: noqa
+
+class DirectoryModel(object):
+    """
+    Quick and dirty directory model to ease the writing
+    of directory iterators and revision trees differential tests.
+    """
+    def __init__(self, name=''):
+        self.data = {}
+        self.data['name'] = name
+        self.data['perms'] = DentryPerms.directory
+        self.data['type'] = 'dir'
+        self.data['entries'] = []
+        self.data['entry_idx'] = {}
+
+    def __getitem__(self, item):
+        if item == 'target':
+            return hash_to_bytes(directory_identifier(self))
+        else:
+            return self.data[item]
+
+    def add_file(self, path, sha1=None):
+        path_parts = path.split(b'/')
+        sha1 = hash_to_bytes(sha1) if sha1 \
+            else MultiHash.from_data(path).digest()['sha1']
+        if len(path_parts) == 1:
+            self['entry_idx'][path] = len(self['entries'])
+            self['entries'].append({
+                'target': sha1,
+                'name': path,
+                'perms': DentryPerms.content,
+                'type': 'file'
+            })
+        else:
+            if not path_parts[0] in self['entry_idx']:
+                self['entry_idx'][path_parts[0]] = len(self['entries'])
+                self['entries'].append(DirectoryModel(path_parts[0]))
+            if path_parts[1]:
+                dir_idx = self['entry_idx'][path_parts[0]]
+                self['entries'][dir_idx].add_file(b'/'.join(path_parts[1:]), sha1)
+
+    def get_hash_data(self, entry_hash):
+        if self['target'] == entry_hash:
+            ret = []
+            for e in self['entries']:
+                ret.append({
+                    'target': e['target'],
+                    'name': e['name'],
+                    'perms': e['perms'],
+                    'type': e['type']
+                })
+            return ret
+        else:
+            for e in self['entries']:
+                if e['type'] == 'file' and e['target'] == entry_hash:
+                    return e
+                elif e['type'] == 'dir':
+                    data = e.get_hash_data(entry_hash)
+                    # an empty directory yields [], which must not be
+                    # mistaken for "not found"
+                    if data is not None:
+                        return data
+            return None
+
+    def get_path_data(self, path):
+        path_parts = path.split(b'/')
+        entry_idx = self['entry_idx'][path_parts[0]]
+        entry = self['entries'][entry_idx]
+        if len(path_parts) == 1:
+            return {
+                'target': entry['target'],
+                'name': entry['name'],
+                'perms': entry['perms'],
+                'type': entry['type']
+            }
+        else:
+            return entry.get_path_data(b'/'.join(path_parts[1:]))
+
+
+@patch('swh.storage.algos.dir_iterators._get_dir')
+class TestDirectoryIterator(unittest.TestCase):
+
+    @nottest
+    def check_iterated_paths(self, dir_model, expected_paths_order,
+                             mock_get_dir):
+
+        def _get_dir(*args, **kwargs):
+            return dir_model.get_hash_data(args[1])
+
+        mock_get_dir.side_effect = _get_dir  # noqa
+        paths_order = [e['path'] for e in dir_iterator(None, dir_model['target'])]
+        self.assertEqual(paths_order, expected_paths_order)
+
+    @istest
+    def test_dir_iterator_empty_dir(self, mock_get_dir):
+        dir_model = DirectoryModel()
+        expected_paths_order = []
+        self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir)
+
+    @istest
+    def test_dir_iterator_no_empty_dirs(self, mock_get_dir):
+        dir_model = DirectoryModel()
+        dir_model.add_file(b'xyz/gtr/uhb')
+        dir_model.add_file(b'bca/ef')
+        dir_model.add_file(b'abc/ab')
+        dir_model.add_file(b'abc/bc')
+        dir_model.add_file(b'xyz/ouy/poi')
+
+        expected_paths_order = [b'abc',
+                                b'abc/ab',
+                                b'abc/bc',
+                                b'bca',
+                                b'bca/ef',
+                                b'xyz',
+                                b'xyz/gtr',
+                                b'xyz/gtr/uhb',
+                                b'xyz/ouy',
+                                b'xyz/ouy/poi']
+
+        self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir)
+
+    @istest
+    def test_dir_iterator_with_empty_dirs(self, mock_get_dir):
+        dir_model = DirectoryModel()
+        dir_model.add_file(b'xyz/gtr/')
+        dir_model.add_file(b'bca/ef')
+        dir_model.add_file(b'abc/')
+        dir_model.add_file(b'xyz/ouy/poi')
+
+        expected_paths_order = [b'abc',
+                                b'bca',
+                                b'bca/ef',
+                                b'xyz',
+                                b'xyz/gtr',
+                                b'xyz/ouy',
+                                b'xyz/ouy/poi']
+
+        self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir)