diff --git a/swh/storage/algos/__init__.py b/swh/storage/algos/__init__.py new file mode 100644 diff --git a/swh/storage/algos/diff.py b/swh/storage/algos/diff.py new file mode 100644 --- /dev/null +++ b/swh/storage/algos/diff.py @@ -0,0 +1,403 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# Utility module to efficiently compute the list of changed files +# between two directory trees. +# The implementation is inspired from the work of Alberto Cortés +# for the go-git project. For more details, you can refer to: +# - this blog post: https://blog.sourced.tech/post/difftree/ +# - the reference implementation in go: +# https://github.com/src-d/go-git/tree/master/utils/merkletrie + + +import collections + +from swh.model.identifiers import directory_identifier + +from .dir_iterators import ( + DirectoryIterator, DoubleDirectoryIterator, Remaining +) + +# get the hash identifier for an empty directory +_empty_dir_hash = directory_identifier({'entries': []}) + + +def _get_rev(storage, rev_id): + """ + Returns revision data from swh storage. + """ + return list(storage.revision_get([rev_id]))[0] + + +class _RevisionChangesList(object): + """ + Helper class to track the changes between two + revision directories. + """ + + def __init__(self, storage, track_renaming): + """ + Args: + storage: instance of swh storage + track_renaming (bool): whether to track or not files renaming + """ + self.storage = storage + self.track_renaming = track_renaming + self.result = [] + # dicts used to track file renaming based on hash value + # we use a list instead of a single entry to handle the corner + # case when a repository contains multiple instance of + # the same file in different directories and a commit + # renames all of them + self.inserted_hash_idx = collections.defaultdict(list) + self.deleted_hash_idx = collections.defaultdict(list) + + def add_insert(self, it_to): + """ + Add a file insertion in the to directory. + + Args: + it_to (swh.storage.dir_iterators.DirectoryIterator): + iterator on the to directory + """ + to_hash = it_to.current_hash() + # if the current file hash has been previously marked as deleted, + # the file has been renamed + if self.track_renaming and self.deleted_hash_idx[to_hash]: + # pop the delete change index in the same order it was inserted + change = self.result[self.deleted_hash_idx[to_hash].pop(0)] + # change the delete change as a rename one + change['type'] = 'rename' + change['to'] = it_to.current() + change['to_path'] = it_to.current_path() + else: + # add the insert change in the list + self.result.append({'type': 'insert', + 'from': None, + 'from_path': None, + 'to': it_to.current(), + 'to_path': it_to.current_path()}) + # if rename tracking is activated, add the change index in + # the inserted_hash_idx dict + if self.track_renaming: + self.inserted_hash_idx[to_hash].append(len(self.result) - 1) + + def add_delete(self, it_from): + """ + Add a file deletion in the from directory. + + Args: + it_from (swh.storage.dir_iterators.DirectoryIterator): + iterator on the from directory + """ + from_hash = it_from.current_hash() + # if the current file has been previously marked as inserted, + # the file has been renamed + if self.track_renaming and self.inserted_hash_idx[from_hash]: + # pop the insert chnage index in the same order it was inserted + change = self.result[self.inserted_hash_idx[from_hash].pop(0)] + # change the insert change as a rename one + change['type'] = 'rename' + change['from'] = it_from.current() + change['from_path'] = it_from.current_path() + else: + # add the delete change in the list + self.result.append({'type': 'delete', + 'from': it_from.current(), + 'from_path': it_from.current_path(), + 'to': None, + 'to_path': None}) + # if rename tracking is activated, add the change index in + # the deleted_hash_idx dict + if self.track_renaming: + self.deleted_hash_idx[from_hash].append(len(self.result) - 1) + + def add_modify(self, it_from, it_to): + """ + Add a file modification in the to directory. + + Args: + it_from (swh.storage.dir_iterators.DirectoryIterator): + iterator on the from directory + it_to (swh.storage.dir_iterators.DirectoryIterator): + iterator on the to directory + """ + self.result.append({'type': 'modify', + 'from': it_from.current(), + 'from_path': it_from.current_path(), + 'to': it_to.current(), + 'to_path': it_to.current_path()}) + + def add_recursive(self, it, insert): + """ + Recursively add changes from a directory. + + Args: + it (swh.storage.dir_iterators.DirectoryIterator): + iterator on a directory + insert (bool): the type of changes to add (insertion + or deletion) + """ + # current iterated element is a regular file, + # simply add adequate change in the list + if not it.current_is_dir(): + if insert: + self.add_insert(it) + else: + self.add_delete(it) + return + # current iterated element is a directory, + dir_id = it.current_hash() + # handle empty dir insertion/deletion as the swh model allow + # to have such object compared to git + if dir_id == _empty_dir_hash: + if insert: + self.add_insert(it) + else: + self.add_delete(it) + # iterate on files reachable from it and add + # adequate changes in the list + else: + sub_it = DirectoryIterator(self.storage, dir_id, + it.current_path() + b'/') + sub_it_current = sub_it.step() + while sub_it_current: + if not sub_it.current_is_dir(): + if insert: + self.add_insert(sub_it) + else: + self.add_delete(sub_it) + sub_it_current = sub_it.step() + + def add_recursive_insert(self, it_to): + """ + Recursively add files insertion from a to directory. + + Args: + it_to (swh.storage.dir_iterators.DirectoryIterator): + iterator on a to directory + """ + self.add_recursive(it_to, True) + + def add_recursive_delete(self, it_from): + """ + Recursively add files deletion from a from directory. + + Args: + it_from (swh.storage.dir_iterators.DirectoryIterator): + iterator on a from directory + """ + self.add_recursive(it_from, False) + + +def _diff_elts_same_name(changes, it): + """" + Utility function to compare two directory entries with the + same name and add adequate changes if any. + + Args: + changes (_RevisionChangesList): the list of changes between + two revisions + it (swh.storage.dir_iterators.DoubleDirectoryIterator): + the iterator traversing two revision directories at the same time + """ + # compare the two current directory elements of the iterator + status = it.compare() + # elements have same hash and same permissions: + # no changes to add and call next on the two iterators + if status['same_hash'] and status['same_perms']: + it.next_both() + # elements are regular files and have been modified: + # insert the modification change in the list and + # call next on the two iterators + elif status['both_are_files']: + changes.add_modify(it.it_from, it.it_to) + it.next_both() + # one element is a regular file, the other a directory: + # recursively add delete/insert changes and call next + # on the two iterators + elif status['file_and_dir']: + changes.add_recursive_delete(it.it_from) + changes.add_recursive_insert(it.it_to) + it.next_both() + # both elements are directories: + elif status['both_are_dirs']: + # from directory is empty: + # recursively add insert changes in the to directory + # and call next on the two iterators + if status['from_is_empty_dir']: + changes.add_recursive_insert(it.it_to) + it.next_both() + # to directory is empty: + # recursively add delete changes in the from directory + # and call next on the two iterators + elif status['to_is_empty_dir']: + changes.add_recursive_delete(it.it_from) + it.next_both() + # both directories are not empty: + # call step on the two iterators to descend further in + # the directory trees. + elif not status['from_is_empty_dir'] and not status['to_is_empty_dir']: + it.step_both() + + +def _compare_paths(path1, path2): + """ + Function to compare paths in lexicographic depth-first order. + For instance, it returns: + - "a" < "b" + - "b/c/d" < "b" + - "c/foo.txt" < "c.txt" + """ + path1_parts = path1.split(b'/') + path2_parts = path2.split(b'/') + i = 0 + while True: + if len(path1_parts) == len(path2_parts) and i == len(path1_parts): + return 0 + elif len(path2_parts) == i: + return 1 + elif len(path1_parts) == i: + return -1 + else: + if path2_parts[i] > path1_parts[i]: + return -1 + elif path2_parts[i] < path1_parts[i]: + return 1 + i = i + 1 + + +def _diff_elts(changes, it): + """ + Utility function to compare two directory entries. + + Args: + changes (_RevisionChangesList): the list of changes between + two revisions + it (swh.storage.algos.dir_iterators.DoubleDirectoryIterator): + the iterator traversing two revision directories at the same time + """ + # compare current to and from path in depth-first lexicographic order + c = _compare_paths(it.it_from.current_path(), it.it_to.current_path()) + # current from path is lower than the current to path: + # the from path has been deleted + if c < 0: + changes.add_recursive_delete(it.it_from) + it.next_from() + # current from path is greather than the current to path: + # the to path has been inserted + elif c > 0: + changes.add_recursive_insert(it.it_to) + it.next_to() + # paths are the same and need more processing + else: + _diff_elts_same_name(changes, it) + + +def diff_directories(storage, from_dir, to_dir, track_renaming=False): + """ + Function that computes the differential between two directories, + i.e. the list of file changes (insertion / deletion / modification / + renaming) between them. + + Args: + storage (swh.storage.storage.Storage): instance of a swh + storage (either local or remote, for optimal performance + the use of a local storage is recommended) + from_dir (bytes): the swh identifier of the directory to compare from + to_dir (bytes): the swh identifier of the directory to compare to + track_renaming (bool): wheter or not to track files renaming + + Returns: + list: A list of dict representing the changes between the two + revisions. Each dict contains the following entries: + + - *type*: a string describing the type of change + (insert/delete/modify/rename) + + - *from*: a dict containing the directory entry metadata in the + from revision (None in case of an insertion) + + - *from_path*: bytes string corresponding to the absolute path + of the from revision entry (None in case of an insertion) + + - *to*: a dict containing the directory entry metadata in the + to revision (None in case of a deletion) + + - *to_path*: bytes string corresponding to the absolute path + of the to revision entry (None in case of a deletion) + + The returned list is sorted in lexicographic depth-first order + according to the value of the *to_path* field. + + """ + changes = _RevisionChangesList(storage, track_renaming) + it = DoubleDirectoryIterator(storage, from_dir, to_dir) + while True: + r = it.remaining() + if r == Remaining.NoMoreFiles: + break + elif r == Remaining.OnlyFromFilesRemain: + changes.add_recursive_delete(it.it_from) + it.next_from() + elif r == Remaining.OnlyToFilesRemain: + changes.add_recursive_insert(it.it_to) + it.next_to() + else: + _diff_elts(changes, it) + return changes.result + + +def diff_revisions(storage, from_rev, to_rev, track_renaming=False): + """ + Function that computes the differential between two revisions, + i.e. the list of file changes between the two associated directories. + + Args: + storage (swh.storage.storage.Storage): instance of a swh + storage (either local or remote, for optimal performance + the use of a local storage is recommended) + from_rev (bytes): the identifier of the revision to compare from + to_rev (bytes): the identifier of the revision to compare to + track_renaming (bool): wheter or not to track files renaming + + Returns: + list: A list of dict describing the introduced file changes + (see :func:`swh.storage.algos.diff.diff_directories`). + + """ + from_dir = None + if from_rev: + from_dir = _get_rev(storage, from_rev)['directory'] + to_dir = _get_rev(storage, to_rev)['directory'] + return diff_directories(storage, from_dir, to_dir, track_renaming) + + +def diff_revision(storage, rev, track_renaming=False): + """ + Function that computes the differential between a revision + and its first parent. + If the revision has no parents, the directory to compare from + is considered as empty. + In other words, it computes the changes introduced in a + specific revision. + + Args: + storage (swh.storage.storage.Storage): instance of a swh + storage (either local or remote, for optimal performance + the use of a local storage is recommended) + rev (bytes): the identifier of the revision from which to + compute the introduced changes. + track_renaming (bool): wheter or not to track files renaming + + Returns: + list: A list of dict describing the introduced file changes + (see :func:`swh.storage.algos.diff.diff_directories`). + """ + rev_data = _get_rev(storage, rev) + parent_rev = None + if rev_data['parents']: + parent_rev = rev_data['parents'][0] + return diff_revisions(storage, parent_rev, rev, track_renaming) diff --git a/swh/storage/algos/dir_iterators.py b/swh/storage/algos/dir_iterators.py new file mode 100644 --- /dev/null +++ b/swh/storage/algos/dir_iterators.py @@ -0,0 +1,347 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# Utility module to iterate on directory trees. +# The implementation is inspired from the work of Alberto Cortés +# for the go-git project. For more details, you can refer to: +# - this blog post: https://blog.sourced.tech/post/difftree/ +# - the reference implementation in go: +# https://github.com/src-d/go-git/tree/master/utils/merkletrie + + +from enum import Enum + +from swh.model.identifiers import directory_identifier + +# get the hash identifier for an empty directory +_empty_dir_hash = directory_identifier({'entries': []}) + + +def _get_dir(storage, dir_id): + """ + Returns directory data from swh storage. + """ + return storage.directory_ls(dir_id) if dir_id else [] + + +class DirectoryIterator(object): + """ + Helper class used to iterate on a directory tree in a depth-first search + way with some additionnal features: + - sibling nodes are iterated in lexicographic order by name + - it is possible to skip the visit of sub-directories nodes + for efficency reasons when comparing two trees (no need to + go deeper if two directories have the same hash) + """ + + def __init__(self, storage, dir_id, base_path=b''): + """ + Args: + storage (swh.storage.storage.Storage): instance of swh storage + (either local or remote) + dir_id (bytes): identifier of a root directory + base_path (bytes): optional base path used when traversing + a sub-directory + """ + self.storage = storage + self.root_dir_id = dir_id + self.base_path = base_path + self.restart() + + def restart(self): + """ + Restart the iteration at the beginning. + """ + # stack of frames representing currently visited directories: + # the root directory is at the bottom while the current one + # is at the top + self.frames = [] + self._push_dir_frame(self.root_dir_id) + self.has_started = False + + def _push_dir_frame(self, dir_id): + """ + Visit a sub-directory by pushing a new frame to the stack. + Each frame is itself a stack of directory entries. + + Args: + dir_id (bytes): identifier of a root directory + """ + if dir_id: + if dir_id == _empty_dir_hash: + self.frames.append([]) + else: + # get directory entries + dir_data = _get_dir(self.storage, dir_id) + # sort them in lexicographical order + dir_data = sorted(dir_data, key=lambda e: e['name']) + # reverse the ordering in order to unstack the "smallest" + # entry each time the iterator advances + dir_data.reverse() + # push the directory frame to the main stack + self.frames.append(dir_data) + + def top(self): + """ + Returns: + list: The top frame of the main directories stack + """ + if not self.frames: + return None + return self.frames[-1] + + def current(self): + """ + Returns: + dict: The current visited directory entry, i.e. the + top element from the top frame + """ + top_frame = self.top() + if not top_frame: + return None + return top_frame[-1] + + def current_hash(self): + """ + Returns: + bytes: The hash value of the currently visited directory + entry + """ + return self.current()['target'] + + def current_perms(self): + """ + Returns: + int: The permissions value of the currently visited directory + entry + """ + return self.current()['perms'] + + def current_path(self): + """ + Returns: + str: The absolute path from the root directory of + the currently visited directory entry + """ + top_frame = self.top() + if not top_frame: + return None + path = [] + for frame in self.frames: + path.append(frame[-1]['name']) + return self.base_path + b'/'.join(path) + + def current_is_dir(self): + """ + Returns: + bool: If the currently visited directory entry is + a directory + """ + return self.current()['type'] == 'dir' + + def _advance(self, descend): + """ + Method used to advance in the tree iteration. + + Args: + descend (bool): whether or not to push a new frame + if the currently visited element is a sub-directory + + Returns: + dict: The description of the newly visited directory entry + """ + current = self.current() + if not self.has_started or not current: + self.has_started = True + return current + + if descend and self.current_is_dir(): + self._push_dir_frame(current['target']) + else: + self.drop() + + return self.current() + + def next(self): + """ + Advance the tree iteration by dropping the current visited + directory entry from the top frame. If the top frame ends up empty, + the operation is recursively applied to remove all empty frames + as the tree is climbed up towards its root. + + Returns: + dict: The description of the newly visited directory entry + """ + return self._advance(False) + + def step(self): + """ + Advance the tree iteration like the next operation with the + difference that if the current visited element is a sub-directory + a new frame representing its content is pushed to the main stack. + + Returns: + dict: The description of the newly visited directory entry + """ + return self._advance(True) + + def drop(self): + """ + Drop the current visited element from the top frame. + If the frame ends up empty, the operation is recursively + applied. + """ + frame = self.top() + if not frame: + return + frame.pop() + if not frame: + self.frames.pop() + self.drop() + + +class Remaining(Enum): + """ + Enum to represent the current state when iterating + on both directory trees at the same time. + """ + NoMoreFiles = 0 + OnlyToFilesRemain = 1 + OnlyFromFilesRemain = 2 + BothHaveFiles = 3 + + +class DoubleDirectoryIterator(object): + """ + Helper class to traverse two directory trees at the same + time and compare their contents to detect changes between them. + """ + + def __init__(self, storage, dir_from, dir_to): + """ + Args: + storage: instance of swh storage + dir_from (bytes): hash identifier of the from directory + dir_to (bytes): hash identifier of the to directory + """ + self.storage = storage + self.dir_from = dir_from + self.dir_to = dir_to + self.restart() + + def restart(self): + """ + Restart the double iteration at the beginning. + """ + # initialize custom dfs iterators for the two directories + self.it_from = DirectoryIterator(self.storage, self.dir_from) + self.it_to = DirectoryIterator(self.storage, self.dir_to) + # grab the first element of each iterator + self.it_from.next() + self.it_to.next() + + def next_from(self): + """ + Apply the next operation on the from iterator. + """ + self.it_from.next() + + def next_to(self): + """ + Apply the next operation on the to iterator. + """ + self.it_to.next() + + def next_both(self): + """ + Apply the next operation on both iterators. + """ + self.next_from() + self.next_to() + + def step_from(self): + """ + Apply the step operation on the from iterator. + """ + self.it_from.step() + + def step_to(self): + """ + Apply the step operation on the from iterator. + """ + self.it_to.step() + + def step_both(self): + """ + Apply the step operation on the both iterators. + """ + self.step_from() + self.step_to() + + def remaining(self): + """ + Returns: + Remaining: the current state of the double iteration + """ + from_current = self.it_from.current() + to_current = self.it_to.current() + # no more files to iterate in both iterators + if not from_current and not to_current: + return Remaining.NoMoreFiles + # still some files to iterate in the to iterator + elif not from_current and to_current: + return Remaining.OnlyToFilesRemain + # still some files to iterate in the from iterator + elif from_current and not to_current: + return Remaining.OnlyFromFilesRemain + # still files to iterate in the both iterators + else: + return Remaining.BothHaveFiles + + def compare(self): + """ + Compare the current iterated directory entries in both iterators + and return the comparison status. + + Returns: + dict: The status of the comparison with the following bool values: + * *same_hash*: indicates if the two entries have the same hash + * *same_perms*: indicates if the two entries have the same + permissions + * *both_are_dirs*: indicates if the two entries are directories + * *both_are_files*: indicates if the two entries are regular + files + * *file_and_dir*: indicates if one of the entry is a directory + and the other a regular file + * *from_is_empty_dir*: indicates if the from entry is the + empty directory + * *from_is_empty_dir*: indicates if the to entry is the + empty directory + """ + from_current_hash = self.it_from.current_hash() + to_current_hash = self.it_to.current_hash() + from_current_perms = self.it_from.current_perms() + to_current_perms = self.it_to.current_perms() + from_is_dir = self.it_from.current_is_dir() + to_is_dir = self.it_to.current_is_dir() + status = {} + # compare hash + status['same_hash'] = from_current_hash == to_current_hash + # compare permissions + status['same_perms'] = from_current_perms == to_current_perms + # check if both elements are directories + status['both_are_dirs'] = from_is_dir and to_is_dir + # check if both elements are regular files + status['both_are_files'] = not from_is_dir and not to_is_dir + # check if one element is a directory, the other a regular file + status['file_and_dir'] = (not status['both_are_dirs'] and + not status['both_are_files']) + # check if the from element is the empty directory + status['from_is_empty_dir'] = (from_is_dir and + from_current_hash == _empty_dir_hash) + # check if the to element is the empty directory + status['to_is_empty_dir'] = (to_is_dir and + to_current_hash == _empty_dir_hash) + return status diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py --- a/swh/storage/api/client.py +++ b/swh/storage/api/client.py @@ -218,3 +218,8 @@ def metadata_provider_get_by(self, provider): return self.post('provider/getby', {'provider': provider}) + + def diff_revision(self, revision, track_renaming=False): + return self.post('utils/diff_revision', + {'revision': revision, + 'track_renaming': track_renaming}) diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py --- a/swh/storage/api/server.py +++ b/swh/storage/api/server.py @@ -348,6 +348,11 @@ return encode_data(g.storage.stat_counters()) +@app.route('/utils/diff_revision', methods=['POST']) +def diff_revision(): + return encode_data(g.storage.diff_revision(**decode_request(request))) + + def run_from_webserver(environ, start_response, config_path=DEFAULT_CONFIG_PATH): """Run the WSGI app from the webserver, loading the configuration.""" diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -15,6 +15,7 @@ from .common import db_transaction_generator, db_transaction from .db import Db from .exc import StorageDBError +from .algos import diff from swh.model.hashutil import ALGORITHMS from swh.objstorage import get_objstorage @@ -1586,3 +1587,19 @@ if not result: return None return dict(zip(self.db.metadata_provider_cols, result)) + + def diff_revision(self, revision, track_renaming=False): + """Compute the list of file changes introduced by a specific revision + (insertion/deletion/modification/renaming of files). + + Args: + revision (bytes): identifier of the revision from which to + compute the list of files changes + track_renaming (bool): whether or not to track files renaming + + Returns: + A list of dict describing the introduced file changes + (see :func:`swh.storage.utils.diff_revisions.diff_revisions` + for more details). + """ + return diff.diff_revision(self, revision, track_renaming) diff --git a/swh/storage/tests/algos/__init__.py b/swh/storage/tests/algos/__init__.py new file mode 100644 diff --git a/swh/storage/tests/algos/test_diff.py b/swh/storage/tests/algos/test_diff.py new file mode 100644 --- /dev/null +++ b/swh/storage/tests/algos/test_diff.py @@ -0,0 +1,368 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# flake8: noqa + +import unittest + +from nose.tools import istest, nottest +from unittest.mock import patch + +from swh.model.identifiers import directory_identifier +from swh.storage.algos import diff + + +class DirectoryModel(object): + """ + Quick and dirty directory model to ease the writing + of revision trees differential tests. + """ + def __init__(self, name=''): + self.data = {} + self.data['name'] = name + self.data['perms'] = 16384 + self.data['type'] = 'dir' + self.data['entries'] = [] + self.data['entry_idx'] = {} + + def __getitem__(self, item): + if item == 'target': + return directory_identifier(self) + else: + return self.data[item] + + def add_file(self, path, sha1=None): + path_parts = path.split(b'/') + if len(path_parts) == 1: + self['entry_idx'][path] = len(self['entries']) + self['entries'].append({ + 'target': sha1, + 'name': path, + 'perms': 33188, + 'type': 'file' + }) + else: + if not path_parts[0] in self['entry_idx']: + self['entry_idx'][path_parts[0]] = len(self['entries']) + self['entries'].append(DirectoryModel(path_parts[0])) + if path_parts[1]: + dir_idx = self['entry_idx'][path_parts[0]] + self['entries'][dir_idx].add_file(b'/'.join(path_parts[1:]), sha1) + + def get_hash_data(self, entry_hash): + if self['target'] == entry_hash: + ret = [] + for e in self['entries']: + ret.append({ + 'target': e['target'], + 'name': e['name'], + 'perms': e['perms'], + 'type': e['type'] + }) + return ret + else: + for e in self['entries']: + if e['type'] == 'file' and e['target'] == entry_hash: + return e + elif e['type'] == 'dir': + data = e.get_hash_data(entry_hash) + if data: + return data + return None + + def get_path_data(self, path): + path_parts = path.split(b'/') + entry_idx = self['entry_idx'][path_parts[0]] + entry = self['entries'][entry_idx] + if len(path_parts) == 1: + return { + 'target': entry['target'], + 'name': entry['name'], + 'perms': entry['perms'], + 'type': entry['type'] + } + else: + return entry.get_path_data(b'/'.join(path_parts[1:])) + + +@patch('swh.storage.algos.diff._get_rev') +@patch('swh.storage.algos.dir_iterators._get_dir') +class TestDiffRevisions(unittest.TestCase): + + @nottest + def diff_revisions(self, rev_from, rev_to, from_dir_model, to_dir_model, + expected_changes, mock_get_dir, mock_get_rev): + + def _get_rev(*args, **kwargs): + if args[1] == rev_from: + return {'directory': from_dir_model['target']} + else: + return {'directory': to_dir_model['target']} + + def _get_dir(*args, **kwargs): + return from_dir_model.get_hash_data(args[1]) or \ + to_dir_model.get_hash_data(args[1]) + + mock_get_rev.side_effect = _get_rev + mock_get_dir.side_effect = _get_dir + + changes = diff.diff_revisions(None, rev_from, rev_to, track_renaming=True) + + self.assertEqual(changes, expected_changes) + + @istest + def test_insert_delete(self, mock_get_dir, mock_get_rev): + rev_from = '898ff03e1e7925ecde3da66327d3cdc7e07625ba' + rev_to = '647c3d381e67490e82cdbbe6c96e46d5e1628ce2' + + from_dir_model = DirectoryModel() + + to_dir_model = DirectoryModel() + to_dir_model.add_file(b'file1', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + to_dir_model.add_file(b'file2', '3e5faecb3836ffcadf82cc160787e35d4e2bec6a') + to_dir_model.add_file(b'file3', '2ae33b2984974d35eababe4890d37fbf4bce6b2c') + + expected_changes = \ + [{ + 'type': 'insert', + 'from': None, + 'from_path': None, + 'to': to_dir_model.get_path_data(b'file1'), + 'to_path': b'file1' + }, + { + 'type': 'insert', + 'from': None, + 'from_path': None, + 'to': to_dir_model.get_path_data(b'file2'), + 'to_path': b'file2' + }, + { + 'type': 'insert', + 'from': None, + 'from_path': None, + 'to': to_dir_model.get_path_data(b'file3'), + 'to_path': b'file3' + }] + + + self.diff_revisions(rev_from, rev_to, from_dir_model, + to_dir_model, expected_changes, + mock_get_dir, mock_get_rev) + + from_dir_model = DirectoryModel() + from_dir_model.add_file(b'file1', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + from_dir_model.add_file(b'file2', '3e5faecb3836ffcadf82cc160787e35d4e2bec6a') + from_dir_model.add_file(b'file3', '2ae33b2984974d35eababe4890d37fbf4bce6b2c') + + to_dir_model = DirectoryModel() + + expected_changes = \ + [{ + 'type': 'delete', + 'from': from_dir_model.get_path_data(b'file1'), + 'from_path': b'file1', + 'to': None, + 'to_path': None + }, + { + 'type': 'delete', + 'from': from_dir_model.get_path_data(b'file2'), + 'from_path': b'file2', + 'to': None, + 'to_path': None + }, + { + 'type': 'delete', + 'from': from_dir_model.get_path_data(b'file3'), + 'from_path': b'file3', + 'to': None, + 'to_path': None + }] + + + self.diff_revisions(rev_from, rev_to, from_dir_model, + to_dir_model, expected_changes, + mock_get_dir, mock_get_rev) + + @istest + def test_onelevel_diff(self, mock_get_dir, mock_get_rev): + rev_from = '898ff03e1e7925ecde3da66327d3cdc7e07625ba' + rev_to = '647c3d381e67490e82cdbbe6c96e46d5e1628ce2' + + from_dir_model = DirectoryModel() + from_dir_model.add_file(b'file1', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + from_dir_model.add_file(b'file2', 'f4a96b2000be83b61254d107046fa9777b17eb34') + from_dir_model.add_file(b'file3', 'd3c00f9396c6d0277727cec522ff6ad1ea0bc2da') + + to_dir_model = DirectoryModel() + to_dir_model.add_file(b'file2', '3ee0f38ee0ea23cc2c8c0b9d66b27be4596b002b') + to_dir_model.add_file(b'file3', 'd3c00f9396c6d0277727cec522ff6ad1ea0bc2da') + to_dir_model.add_file(b'file4', '40460b9653b1dc507e1b6eb333bd4500634bdffc') + + expected_changes = \ + [{ + 'type': 'delete', + 'from': from_dir_model.get_path_data(b'file1'), + 'from_path': b'file1', + 'to': None, + 'to_path': None}, + { + 'type': 'modify', + 'from': from_dir_model.get_path_data(b'file2'), + 'from_path': b'file2', + 'to': to_dir_model.get_path_data(b'file2'), + 'to_path': b'file2'}, + { + 'type': 'insert', + 'from': None, + 'from_path': None, + 'to': to_dir_model.get_path_data(b'file4'), + 'to_path': b'file4' + }] + + self.diff_revisions(rev_from, rev_to, from_dir_model, + to_dir_model, expected_changes, + mock_get_dir, mock_get_rev) + + @istest + def test_twolevels_diff(self, mock_get_dir, mock_get_rev): + rev_from = '898ff03e1e7925ecde3da66327d3cdc7e07625ba' + rev_to = '647c3d381e67490e82cdbbe6c96e46d5e1628ce2' + + from_dir_model = DirectoryModel() + from_dir_model.add_file(b'file1', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + from_dir_model.add_file(b'dir1/file1', '8335fca266811bac7ae5c8e1621476b4cf4156b6') + from_dir_model.add_file(b'dir1/file2', 'a6127d909e79f1fcb28bbf220faf86e7be7831e5') + from_dir_model.add_file(b'dir1/file3', '18049b8d067ce1194a7e1cce26cfa3ae4242a43d') + from_dir_model.add_file(b'file2', 'd3c00f9396c6d0277727cec522ff6ad1ea0bc2da') + + to_dir_model = DirectoryModel() + to_dir_model.add_file(b'file1', '3ee0f38ee0ea23cc2c8c0b9d66b27be4596b002b') + to_dir_model.add_file(b'dir1/file2', 'de3548b32a8669801daa02143a66dae21fe852fd') + to_dir_model.add_file(b'dir1/file3', '18049b8d067ce1194a7e1cce26cfa3ae4242a43d') + to_dir_model.add_file(b'dir1/file4', 'f5c3f42aec5fe7b92276196c350cbadaf4c51f87') + to_dir_model.add_file(b'file2', 'd3c00f9396c6d0277727cec522ff6ad1ea0bc2da') + + expected_changes = \ + [{ + 'type': 'delete', + 'from': from_dir_model.get_path_data(b'dir1/file1'), + 'from_path': b'dir1/file1', + 'to': None, + 'to_path': None + }, + { + 'type': 'modify', + 'from': from_dir_model.get_path_data(b'dir1/file2'), + 'from_path': b'dir1/file2', + 'to': to_dir_model.get_path_data(b'dir1/file2'), + 'to_path': b'dir1/file2' + }, + { + 'type': 'insert', + 'from': None, + 'from_path': None, + 'to': to_dir_model.get_path_data(b'dir1/file4'), + 'to_path': b'dir1/file4' + }, + { + 'type': 'modify', + 'from': from_dir_model.get_path_data(b'file1'), + 'from_path': b'file1', + 'to': to_dir_model.get_path_data(b'file1'), + 'to_path': b'file1' + }] + + self.diff_revisions(rev_from, rev_to, from_dir_model, + to_dir_model, expected_changes, + mock_get_dir, mock_get_rev) + + @istest + def test_insert_delete_empty_dirs(self, mock_get_dir, mock_get_rev): + rev_from = '898ff03e1e7925ecde3da66327d3cdc7e07625ba' + rev_to = '647c3d381e67490e82cdbbe6c96e46d5e1628ce2' + + from_dir_model = DirectoryModel() + from_dir_model.add_file(b'dir3/file1', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + + to_dir_model = DirectoryModel() + to_dir_model.add_file(b'dir3/file1', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + to_dir_model.add_file(b'dir3/dir1/') + + expected_changes = \ + [{ + 'type': 'insert', + 'from': None, + 'from_path': None, + 'to': to_dir_model.get_path_data(b'dir3/dir1'), + 'to_path': b'dir3/dir1' + }] + + self.diff_revisions(rev_from, rev_to, from_dir_model, + to_dir_model, expected_changes, + mock_get_dir, mock_get_rev) + + from_dir_model = DirectoryModel() + from_dir_model.add_file(b'dir1/dir2/') + from_dir_model.add_file(b'dir1/file1', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + + to_dir_model = DirectoryModel() + to_dir_model.add_file(b'dir1/file1', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + + expected_changes = \ + [{ + 'type': 'delete', + 'from': from_dir_model.get_path_data(b'dir1/dir2'), + 'from_path': b'dir1/dir2', + 'to': None, + 'to_path': None + }] + + self.diff_revisions(rev_from, rev_to, from_dir_model, + to_dir_model, expected_changes, + mock_get_dir, mock_get_rev) + + @istest + def test_track_renaming(self, mock_get_dir, mock_get_rev): + rev_from = '898ff03e1e7925ecde3da66327d3cdc7e07625ba' + rev_to = '647c3d381e67490e82cdbbe6c96e46d5e1628ce2' + + from_dir_model = DirectoryModel() + from_dir_model.add_file(b'file1_oldname', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + from_dir_model.add_file(b'dir1/file1_oldname', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + from_dir_model.add_file(b'file2_oldname', 'd3c00f9396c6d0277727cec522ff6ad1ea0bc2da') + + to_dir_model = DirectoryModel() + to_dir_model.add_file(b'dir1/file1_newname', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + to_dir_model.add_file(b'dir2/file1_newname', 'ea15f54ca215e7920c60f564315ebb7f911a5204') + to_dir_model.add_file(b'file2_newname', 'd3c00f9396c6d0277727cec522ff6ad1ea0bc2da') + + expected_changes = \ + [{ + 'type': 'rename', + 'from': from_dir_model.get_path_data(b'dir1/file1_oldname'), + 'from_path': b'dir1/file1_oldname', + 'to': to_dir_model.get_path_data(b'dir1/file1_newname'), + 'to_path': b'dir1/file1_newname' + }, + { + 'type': 'rename', + 'from': from_dir_model.get_path_data(b'file1_oldname'), + 'from_path': b'file1_oldname', + 'to': to_dir_model.get_path_data(b'dir2/file1_newname'), + 'to_path': b'dir2/file1_newname' + }, + { + 'type': 'rename', + 'from': from_dir_model.get_path_data(b'file2_oldname'), + 'from_path': b'file2_oldname', + 'to': to_dir_model.get_path_data(b'file2_newname'), + 'to_path': b'file2_newname' + }] + + self.diff_revisions(rev_from, rev_to, from_dir_model, + to_dir_model, expected_changes, + mock_get_dir, mock_get_rev)