diff --git a/swh/storage/algos/revisions_walker.py b/swh/storage/algos/revisions_walker.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/algos/revisions_walker.py
@@ -0,0 +1,509 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import heapq
+
+from abc import ABCMeta, abstractmethod
+from collections import deque
+
+_revs_walker_classes = {}
+
+
+class _RevisionsWalkerMetaClass(ABCMeta):
+    def __new__(cls, clsname, bases, attrs):
+        newclass = super().__new__(cls, clsname, bases, attrs)
+        if 'rw_type' in attrs:
+            _revs_walker_classes[attrs['rw_type']] = newclass
+        return newclass
+
+
+class RevisionsWalker(metaclass=_RevisionsWalkerMetaClass):
+    """
+    Abstract base class encapsulating the logic to walk across
+    a revisions history starting from a given one.
+
+    It defines an iterator returning the revisions according
+    to a specific ordering implemented in derived classes.
+
+    The iteration step performs the following operations:
+
+    1) Check if the iteration is finished by calling method
+       :meth:`is_finished` and raise :exc:`StopIteration` if
+       it is the case
+
+    2) Get the next unseen revision by calling method
+       :meth:`get_next_rev_id`
+
+    3) Process parents of that revision by calling method
+       :meth:`process_parent_revs` for the next iteration
+       steps
+
+    4) Check if the revision should be returned by calling
+       method :meth:`should_return` and return it if
+       it is the case
+
+    In order to easily instantiate a specific type of revisions
+    walker, it is recommended to use the factory function
+    :func:`get_revisions_walker`.
+
+    Args:
+        storage (swh.storage.storage.Storage): instance of swh storage
+            (either local or remote)
+        rev_start (bytes): a revision identifier
+        max_revs (Optional[int]): maximum number of revisions to return
+        state (Optional[dict]): previous state of that revisions walker
+    """
+
+    def __init__(self, storage, rev_start, max_revs=None, state=None):
+        self._revs_to_visit = []
+        self._done = set()
+        self._revs = {}
+        self._last_rev = None
+        self._num_revs = 0
+        self._max_revs = max_revs
+        if state:
+            self._revs_to_visit = state['revs_to_visit']
+            self._done = state['done']
+            self._last_rev = state['last_rev']
+            self._num_revs = state['num_revs']
+        self.storage = storage
+        self.process_rev(rev_start)
+
+    @abstractmethod
+    def process_rev(self, rev_id):
+        """
+        Abstract method whose purpose is to process a newly visited
+        revision during the walk.
+        Derived classes must implement it according to the desired
+        method to walk across the revisions history (for instance
+        through a dfs on the revisions DAG).
+
+        Args:
+            rev_id (bytes): the newly visited revision identifier
+        """
+        pass
+
+    @abstractmethod
+    def get_next_rev_id(self):
+        """
+        Abstract method whose purpose is to return the next revision
+        identifier during the iteration.
+        Derived classes must implement it according to the desired
+        method to walk across the revisions history.
+
+        Returns:
+            bytes: the next revision identifier to process
+        """
+        pass
+
+    def process_parent_revs(self, rev):
+        """
+        Process the parents of a revision when it is iterated.
+        The default implementation simply calls :meth:`process_rev`
+        for each parent revision in the order they are declared.
+
+        Args:
+            rev (dict): A dict describing a revision as returned by
+                :meth:`swh.storage.storage.Storage.revision_get`
+        """
+        for parent_id in rev['parents']:
+            self.process_rev(parent_id)
+
+    def should_return(self, rev):
+        """
+        Check if a revision should be returned when iterating.
+        The default implementation returns all iterated revisions.
+
+        Args:
+            rev (dict): A dict describing a revision as returned by
+                :meth:`swh.storage.storage.Storage.revision_get`
+
+        Returns:
+            bool: Whether to return the revision in the iteration
+        """
+        return True
+
+    def is_finished(self):
+        """
+        Determine if the iteration is finished.
+        This method is called at the beginning of each iteration loop.
+
+        Returns:
+            bool: Whether the iteration is finished
+        """
+        if self._max_revs is not None and self._num_revs >= self._max_revs:
+            return True
+        if not self._revs_to_visit:
+            return True
+        return False
+
+    def _get_rev(self, rev_id):
+        rev = self._revs.get(rev_id, None)
+        if not rev:
+            # cache some revisions in advance to avoid sending too many
+            # requests to storage and thus speed up the revisions walk
+            for rev in self.storage.revision_log([rev_id], limit=100):
+                self._revs[rev['id']] = rev
+        return self._revs[rev_id]
+
+    def export_state(self):
+        """
+        Export the internal state of that revisions walker to a dict.
+        Its purpose is to allow resuming the iteration in a pagination
+        context.
+
+        Returns:
+            dict: A dict containing the internal state of that
+                revisions walker
+        """
+        return {
+            'revs_to_visit': self._revs_to_visit,
+            'done': self._done,
+            'last_rev': self._last_rev,
+            'num_revs': self._num_revs
+        }
+
+    def __next__(self):
+        if self.is_finished():
+            raise StopIteration
+        while self._revs_to_visit:
+            rev_id = self.get_next_rev_id()
+            if rev_id in self._done:
+                continue
+            self._done.add(rev_id)
+            rev = self._get_rev(rev_id)
+            self.process_parent_revs(rev)
+            if self.should_return(rev):
+                self._num_revs += 1
+                self._last_rev = rev
+                return rev
+        raise StopIteration
+
+    def __iter__(self):
+        return self
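+
+
+# A new walker type can be added by subclassing RevisionsWalker: the
+# metaclass registers any class defining a ``rw_type`` attribute,
+# making it available through :func:`get_revisions_walker`. A minimal
+# sketch (the ``author_date`` type below is hypothetical, not part of
+# this module):
+#
+#     class AuthorDateRevisionsWalker(RevisionsWalker):
+#         rw_type = 'author_date'
+#
+#         def process_rev(self, rev_id):
+#             if rev_id not in self._done:
+#                 rev = self._get_rev(rev_id)
+#                 author_time = rev['date']['timestamp']['seconds']
+#                 heapq.heappush(self._revs_to_visit,
+#                                (-author_time, rev_id))
+#
+#         def get_next_rev_id(self):
+#             _, rev_id = heapq.heappop(self._revs_to_visit)
+#             return rev_id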
+
+
+class CommitterDateRevisionsWalker(RevisionsWalker):
+    """
+    Revisions walker that returns revisions in reverse chronological
+    order according to committer date (same behaviour as ``git log``)
+    """
+
+    rw_type = 'committer_date'
+
+    def process_rev(self, rev_id):
+        """
+        Add the revision to a priority queue according to the committer
+        date.
+
+        Args:
+            rev_id (bytes): the newly visited revision identifier
+        """
+        if rev_id not in self._done:
+            rev = self._get_rev(rev_id)
+            commit_time = rev['committer_date']['timestamp']['seconds']
+            heapq.heappush(self._revs_to_visit, (-commit_time, rev_id))
+
+    def get_next_rev_id(self):
+        """
+        Return the next revision identifier from the priority queue,
+        i.e. the one with the highest committer date.
+
+        Returns:
+            bytes: the next revision identifier to process
+        """
+        _, rev_id = heapq.heappop(self._revs_to_visit)
+        return rev_id
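+
+
+# Note that heapq implements a min-heap, hence the negated timestamps
+# above: the most recent commit gets the smallest key and is popped
+# first. An illustrative sketch with arbitrary values:
+#
+#     heap = []
+#     heapq.heappush(heap, (-1380883849, b'newer rev id'))
+#     heapq.heappush(heap, (-1380878419, b'older rev id'))
+#     heapq.heappop(heap)  # -> (-1380883849, b'newer rev id')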
+
+
+class BFSRevisionsWalker(RevisionsWalker):
+    """
+    Revisions walker that returns revisions in the same order
+    as when performing a breadth-first search on the revisions
+    DAG.
+    """
+
+    rw_type = 'bfs'
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._revs_to_visit = deque(self._revs_to_visit)
+
+    def process_rev(self, rev_id):
+        """
+        Append the revision to a queue.
+
+        Args:
+            rev_id (bytes): the newly visited revision identifier
+        """
+        if rev_id not in self._done:
+            self._revs_to_visit.append(rev_id)
+
+    def get_next_rev_id(self):
+        """
+        Return the next revision identifier from the queue.
+
+        Returns:
+            bytes: the next revision identifier to process
+        """
+        return self._revs_to_visit.popleft()
+
+
+class DFSPostRevisionsWalker(RevisionsWalker):
+    """
+    Revisions walker that returns revisions in the same order
+    as when performing a depth-first search in post-order on the
+    revisions DAG (i.e. after visiting a merge commit,
+    the merged commit will be visited before the base commit it
+    was merged on).
+    """
+
+    rw_type = 'dfs_post'
+
+    def process_rev(self, rev_id):
+        """
+        Append the revision to a stack.
+
+        Args:
+            rev_id (bytes): the newly visited revision identifier
+        """
+        if rev_id not in self._done:
+            self._revs_to_visit.append(rev_id)
+
+    def get_next_rev_id(self):
+        """
+        Return the next revision identifier from the stack.
+
+        Returns:
+            bytes: the next revision identifier to process
+        """
+        return self._revs_to_visit.pop()
+
+
+class DFSRevisionsWalker(DFSPostRevisionsWalker):
+    """
+    Revisions walker that returns revisions in the same order
+    as when performing a depth-first search in pre-order on the
+    revisions DAG (i.e. after visiting a merge commit,
+    the base commit it was merged on will be visited before
+    the merged commit).
+    """
+
+    rw_type = 'dfs'
+
+    def process_parent_revs(self, rev):
+        """
+        Process the parents of an iterated revision in the reverse
+        order they are declared.
+
+        Args:
+            rev (dict): A dict describing a revision as returned by
+                :meth:`swh.storage.storage.Storage.revision_get`
+        """
+        for parent_id in reversed(rev['parents']):
+            self.process_rev(parent_id)
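+
+
+# To illustrate how the walkers above differ, consider a small
+# hypothetical history (not tied to this module's tests) where merge
+# commit M declares parents [P1, P2] and both P1 and P2 descend from
+# the root R:
+#
+#         M
+#        / \
+#      P1   P2
+#        \ /
+#         R
+#
+# Walking from M, 'bfs' yields M, P1, P2, R; 'dfs' yields M, P1, R, P2
+# (P1's subtree is exhausted before P2); and 'dfs_post' yields
+# M, P2, R, P1 (the merged branch P2 first).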
+
+
+class PathRevisionsWalker(CommitterDateRevisionsWalker):
+    """
+    Revisions walker that returns revisions where a specific
+    path in the source tree has been modified; in other words,
+    it allows getting the history of a specific file or directory.
+
+    It has a behaviour similar to what ``git log`` offers by default,
+    meaning the returned history is simplified in order to only
+    show relevant revisions (see the `History Simplification `_
+    section of the associated manual for more details).
+
+    Please note that to avoid walking the entire history, the iteration
+    will stop once a revision where the path has been added is found.
+
+    .. warning:: Due to client-side implementation, performance is
+        not optimal when the total number of revisions to walk is
+        large. This should only be used when the total number of
+        revisions does not exceed a couple of thousand.
+
+    Args:
+        storage (swh.storage.storage.Storage): instance of swh storage
+            (either local or remote)
+        rev_start (bytes): a revision identifier
+        path (str): the path in the source tree to retrieve the history
+            for
+        max_revs (Optional[int]): maximum number of revisions to return
+        state (Optional[dict]): previous state of that revisions walker
+    """
+
+    rw_type = 'path'
+
+    def __init__(self, storage, rev_start, path, **kwargs):
+        super().__init__(storage, rev_start, **kwargs)
+        paths = path.strip('/').split('/')
+        self._path = list(map(lambda p: p.encode('utf-8'), paths))
+        self._rev_dir_path = {}
+
+    def _get_path_id(self, rev_id):
+        """
+        Return the path checksum identifier in the source tree of the
+        provided revision. If the path corresponds to a directory, the
+        value computed by
+        :func:`swh.model.identifiers.directory_identifier`
+        will be returned. If the path corresponds to a file, its sha1
+        checksum will be returned.
+
+        Args:
+            rev_id (bytes): a revision identifier
+
+        Returns:
+            bytes: the path identifier
+        """
+
+        rev = self._get_rev(rev_id)
+
+        rev_dir_id = rev['directory']
+
+        if rev_dir_id not in self._rev_dir_path:
+            try:
+                dir_info = \
+                    self.storage.directory_entry_get_by_path(rev_dir_id,
+                                                             self._path)
+                self._rev_dir_path[rev_dir_id] = dir_info['target']
+            except Exception:
+                # the path does not exist in that revision's source tree
+                self._rev_dir_path[rev_dir_id] = None
+
+        return self._rev_dir_path[rev_dir_id]
+
+    def is_finished(self):
+        """
+        Check if the revisions iteration is finished.
+        This checks for the specified path's existence in the last
+        returned revision's parents' source trees.
+        If the path exists in none of them, the iteration is considered
+        finished.
+
+        Returns:
+            bool: Whether the iteration is finished
+        """
+        if self._path and self._last_rev:
+            last_rev_parents = self._last_rev['parents']
+            last_rev_parents_path_ids = [self._get_path_id(p_rev)
+                                         for p_rev in last_rev_parents]
+            no_path = all([path_id is None
+                           for path_id in last_rev_parents_path_ids])
+            if no_path:
+                return True
+        return super().is_finished()
+
+    def process_parent_revs(self, rev):
+        """
+        Process parents when a new revision is iterated.
+        It enables getting a simplified revisions history in the same
+        manner as ``git log``: when a revision has multiple parents
+        and shares its path identifier with at least one of them,
+        follow only the first such parent (even if several parents
+        have the same path identifier); otherwise, follow all parents.
+
+        Args:
+            rev (dict): A dict describing a revision as returned by
+                :meth:`swh.storage.storage.Storage.revision_get`
+        """
+        rev_path_id = self._get_path_id(rev['id'])
+
+        if rev_path_id:
+            if len(rev['parents']) == 1:
+                self.process_rev(rev['parents'][0])
+            else:
+                parent_rev_path_ids = [self._get_path_id(p_rev)
+                                       for p_rev in rev['parents']]
+                different_trees = all([path_id != rev_path_id
+                                       for path_id in parent_rev_path_ids])
+                for i, p_rev in enumerate(rev['parents']):
+                    if different_trees or \
+                            parent_rev_path_ids[i] == rev_path_id:
+                        self.process_rev(p_rev)
+                        if not different_trees:
+                            break
+        else:
+            super().process_parent_revs(rev)
+
+    def should_return(self, rev):
+        """
+        Check if a revision should be returned when iterating.
+        It verifies that the specified path has been modified by the
+        revision, i.e. that its path identifier differs from those of
+        all its parents, in order to get a simplified history.
+
+        Args:
+            rev (dict): A dict describing a revision as returned by
+                :meth:`swh.storage.storage.Storage.revision_get`
+
+        Returns:
+            bool: Whether to return the revision in the iteration
+        """
+        rev_path_id = self._get_path_id(rev['id'])
+
+        if not rev['parents']:
+            return rev_path_id is not None
+
+        parent_rev_path_ids = [self._get_path_id(p_rev)
+                               for p_rev in rev['parents']]
+        different_trees = all([path_id != rev_path_id
+                               for path_id in parent_rev_path_ids])
+
+        if rev_path_id != parent_rev_path_ids[0] and different_trees:
+            return True
+
+        return False
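+
+
+# A 'path' revisions walker is instantiated like the other walkers,
+# with an extra ``path`` argument, e.g. (``storage``, ``rev_id`` and
+# the path below are hypothetical):
+#
+#     revs_walker = get_revisions_walker('path', storage, rev_id,
+#                                        path='src/main/App.java')
+#     for rev in revs_walker:
+#         ...  # only revisions that modified that path are returned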
+
+
+def get_revisions_walker(rev_walker_type, *args, **kwargs):
+    """
+    Instantiate a revisions walker of a given type.
+
+    The following code snippet demonstrates how to use a revisions
+    walker for processing a whole revisions history::
+
+        revs_walker = get_revisions_walker('committer_date', storage,
+                                           rev_id)
+        for rev in revs_walker:
+            # process revision rev
+
+    It is also possible to walk a revisions history in a paginated
+    way as illustrated below::
+
+        def get_revs_history_page(rw_type, storage, rev_id, page_num,
+                                  page_size, rw_state):
+            max_revs = (page_num + 1) * page_size
+            revs_walker = get_revisions_walker(rw_type, storage, rev_id,
+                                               max_revs=max_revs,
+                                               state=rw_state)
+            revs = list(revs_walker)
+            rw_state = revs_walker.export_state()
+            return revs, rw_state
+
+        rev_start = ...
+        per_page = 50
+        rw_state = {}
+
+        for page in range(0, 10):
+            revs_page, rw_state = get_revs_history_page('dfs', storage,
+                                                        rev_start, page,
+                                                        per_page,
+                                                        rw_state)
+            # process revisions page
+
+    Args:
+        rev_walker_type (str): the type of revisions walker to return,
+            possible values are: *committer_date*, *dfs*, *dfs_post*,
+            *bfs* and *path*
+        args (list): positional arguments to pass to the revisions
+            walker constructor
+        kwargs (dict): keyword arguments to pass to the revisions
+            walker constructor
+
+    Returns:
+        RevisionsWalker: an instance of the revisions walker class
+            registered for the requested type
+
+    """
+    if rev_walker_type not in _revs_walker_classes:
+        raise Exception('No revisions walker found for type "%s"'
+                        % rev_walker_type)
+    revs_walker_class = _revs_walker_classes[rev_walker_type]
+    return revs_walker_class(*args, **kwargs)
diff --git a/swh/storage/tests/algos/test_revisions_walker.py b/swh/storage/tests/algos/test_revisions_walker.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/algos/test_revisions_walker.py
@@ -0,0 +1,367 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+
+from unittest.mock import patch
+
+from swh.model.hashutil import hash_to_bytes
+from swh.storage.algos.revisions_walker import get_revisions_walker
+
+# For those tests, we will walk the following revisions history
+# with different orderings:
+#
+# * commit b364f53155044e5308a0f73abb3b5f01995a5b7d
+# |\ Merge: 836d498 b94886c
+# | | Author: Adam <adam.janicki@roche.com>
+# | | AuthorDate: Fri Oct 4 12:50:49 2013 +0200
+# | | Commit: Adam <adam.janicki@roche.com>
+# | | CommitDate: Fri Oct 4 12:50:49 2013 +0200
+# | |
+# | |     Merge branch 'release/1.0'
+# | |
+# | * commit b94886c500c46e32dc3d7ebae8a5409accd592e5
+# | | Author: Adam <adam.janicki@roche.com>
+# | | AuthorDate: Fri Oct 4 12:50:38 2013 +0200
+# | | Commit: Adam <adam.janicki@roche.com>
+# | | CommitDate: Fri Oct 4 12:50:38 2013 +0200
+# | |
+# | |     updating poms for 1.0 release
+# | |
+# | * commit 0cb6b4611d65bee0f57821dac7f611e2f8a02433
+# | | Author: Adam <adam.janicki@roche.com>
+# | | AuthorDate: Fri Oct 4 12:50:38 2013 +0200
+# | | Commit: Adam <adam.janicki@roche.com>
+# | | CommitDate: Fri Oct 4 12:50:38 2013 +0200
+# | |
+# | |     updating poms for 1.0 release
+# | |
+# | * commit 2b0240c6d682bad51532eec15b8a7ed6b75c8d31
+# | | Author: Adam Janicki <janickia>
+# | | AuthorDate: Fri Oct 4 12:50:22 2013 +0200
+# | | Commit: Adam Janicki <janickia>
+# | | CommitDate: Fri Oct 4 12:50:22 2013 +0200
+# | |
+# | |     For 1.0 release. Allow untracked.
+# | |
+# | * commit b401c50863475db4440c85c10ac0b6423b61554d
+# | | Author: Adam <adam.janicki@roche.com>
+# | | AuthorDate: Fri Oct 4 12:48:12 2013 +0200
+# | | Commit: Adam <adam.janicki@roche.com>
+# | | CommitDate: Fri Oct 4 12:48:12 2013 +0200
+# | |
+# | |     updating poms for 1.0 release
+# | |
+# | * commit 9c5051397e5c2e0c258bb639c3dd34406584ca10
+# |/  Author: Adam Janicki <janickia>
+# |   AuthorDate: Fri Oct 4 12:47:48 2013 +0200
+# |   Commit: Adam Janicki <janickia>
+# |   CommitDate: Fri Oct 4 12:47:48 2013 +0200
+# |
+# |       For 1.0 release.
+# |
+# * commit 836d498396fb9b5d45c896885f84d8d60a5651dc
+# | Author: Adam Janicki <janickia>
+# | AuthorDate: Fri Oct 4 12:08:16 2013 +0200
+# | Commit: Adam Janicki <janickia>
+# | CommitDate: Fri Oct 4 12:08:16 2013 +0200
+# |
+# |     Add ignores
+# |
+# * commit ee96c2a2d397b79070d2b6fe3051290963748358
+# | Author: Adam <adam.janicki@roche.com>
+# | AuthorDate: Fri Oct 4 10:48:16 2013 +0100
+# | Commit: Adam <adam.janicki@roche.com>
+# | CommitDate: Fri Oct 4 10:48:16 2013 +0100
+# |
+# |     Reset author
+# |
+# * commit 8f89dda8e072383cf50d42532ae8f52ad89f8fdf
+#   Author: Adam <adam.janicki@roche.com>
+#   AuthorDate: Fri Oct 4 02:20:19 2013 -0700
+#   Commit: Adam <adam.janicki@roche.com>
+#   CommitDate: Fri Oct 4 02:20:19 2013 -0700
+#
+#       Initial commit
+
+# raw dump of the above history in swh format
+_revisions_list = \
+[{'author': {'email': b'adam.janicki@roche.com', # noqa
+             'fullname': b'Adam <adam.janicki@roche.com>',
+             'id': 8040905,
+             'name': b'Adam'},
+  'committer': {'email': b'adam.janicki@roche.com',
+                'fullname': b'Adam <adam.janicki@roche.com>',
+                'id': 8040905,
+                'name': b'Adam'},
+  'committer_date': {'negative_utc': None,
+                     'offset': 120,
+                     'timestamp': {'microseconds': 0, 'seconds': 1380883849}},
+  'date': {'negative_utc': None,
+           'offset': 120,
+           'timestamp': {'microseconds': 0, 'seconds': 1380883849}},
+  'directory': b'\xefX\xe7\xa6\\\xda\xdf\xfdH\xdbH\xfbq\x96@{\x98?9\xfe',
+  'id': b'\xb3d\xf51U\x04NS\x08\xa0\xf7:\xbb;_\x01\x99Z[}',
+  'message': b"Merge branch 'release/1.0'",
+  'metadata': None,
+  'parents': [b'\x83mI\x83\x96\xfb\x9b]E\xc8\x96\x88_\x84\xd8\xd6\nVQ\xdc',
+              b'\xb9H\x86\xc5\x00\xc4n2\xdc=~\xba\xe8\xa5@\x9a\xcc\xd5\x92\xe5'],  # noqa
+  'synthetic': False,
+  'type': 'git'},
+ {'author': {'email': b'adam.janicki@roche.com',
+             'fullname': b'Adam <adam.janicki@roche.com>',
+             'id': 8040905,
+             'name': b'Adam'},
+  'committer': {'email': b'adam.janicki@roche.com',
+                'fullname': b'Adam <adam.janicki@roche.com>',
+                'id': 8040905,
+                'name': b'Adam'},
+  'committer_date': {'negative_utc': None,
+                     'offset': 120,
+                     'timestamp': {'microseconds': 0, 'seconds': 1380883838}},
+  'date': {'negative_utc': None,
+           'offset': 120,
+           'timestamp': {'microseconds': 0, 'seconds': 1380883838}},
+  'directory': b'\xefX\xe7\xa6\\\xda\xdf\xfdH\xdbH\xfbq\x96@{\x98?9\xfe',
+  'id': b'\xb9H\x86\xc5\x00\xc4n2\xdc=~\xba\xe8\xa5@\x9a\xcc\xd5\x92\xe5',
+  'message': b'updating poms for 1.0 release',
+  'metadata': None,
+  'parents': [b'\x0c\xb6\xb4a\x1de\xbe\xe0\xf5x!\xda\xc7\xf6\x11\xe2\xf8\xa0$3'],  # noqa
+  'synthetic': False,
+  'type': 'git'},
+ {'author': {'email': b'adam.janicki@roche.com',
+             'fullname': b'Adam <adam.janicki@roche.com>',
+             'id': 8040905,
+             'name': b'Adam'},
+  'committer': {'email': b'adam.janicki@roche.com',
+                'fullname': b'Adam <adam.janicki@roche.com>',
+                'id': 8040905,
+                'name': b'Adam'},
+  'committer_date': {'negative_utc': None,
+                     'offset': 120,
+                     'timestamp': {'microseconds': 0, 'seconds': 1380883838}},
+  'date': {'negative_utc': None,
+           'offset': 120,
+           'timestamp': {'microseconds': 0, 'seconds': 1380883838}},
+  'directory': b'\xefX\xe7\xa6\\\xda\xdf\xfdH\xdbH\xfbq\x96@{\x98?9\xfe',
+  'id': b'\x0c\xb6\xb4a\x1de\xbe\xe0\xf5x!\xda\xc7\xf6\x11\xe2\xf8\xa0$3',
+  'message': b'updating poms for 1.0 release',
+  'metadata': None,
+  'parents': [b'+\x02@\xc6\xd6\x82\xba\xd5\x152\xee\xc1[\x8a~\xd6\xb7\\\x8d1'],  # noqa
+  'synthetic': False,
+  'type': 'git'},
+ {'author': {'email': b'janickia',
+             'fullname': b'Adam Janicki <janickia>',
+             'id': 8040906,
+             'name': b'Adam Janicki'},
+  'committer': {'email': b'janickia',
+                'fullname': b'Adam Janicki <janickia>',
+                'id': 8040906,
+                'name': b'Adam Janicki'},
+  'committer_date': {'negative_utc': None,
+                     'offset': 120,
+                     'timestamp': {'microseconds': 0, 'seconds': 1380883822}},
+  'date': {'negative_utc': None,
+           'offset': 120,
+           'timestamp': {'microseconds': 0, 'seconds': 1380883822}},
+  'directory': b'\xefX\xe7\xa6\\\xda\xdf\xfdH\xdbH\xfbq\x96@{\x98?9\xfe',
+  'id': b'+\x02@\xc6\xd6\x82\xba\xd5\x152\xee\xc1[\x8a~\xd6\xb7\\\x8d1',
+  'message': b'For 1.0 release. Allow untracked.\n',
+  'metadata': None,
+  'parents': [b'\xb4\x01\xc5\x08cG]\xb4D\x0c\x85\xc1\n\xc0\xb6B;aUM'],
+  'synthetic': False,
+  'type': 'git'},
+ {'author': {'email': b'adam.janicki@roche.com',
+             'fullname': b'Adam <adam.janicki@roche.com>',
+             'id': 8040905,
+             'name': b'Adam'},
+  'committer': {'email': b'adam.janicki@roche.com',
+                'fullname': b'Adam <adam.janicki@roche.com>',
+                'id': 8040905,
+                'name': b'Adam'},
+  'committer_date': {'negative_utc': None,
+                     'offset': 120,
+                     'timestamp': {'microseconds': 0, 'seconds': 1380883692}},
+  'date': {'negative_utc': None,
+           'offset': 120,
+           'timestamp': {'microseconds': 0, 'seconds': 1380883692}},
+  'directory': b'd@\xe7\x143w\xcb\xf7\xad\xae\x91\xd5\xec\xd8\x95\x82'
+               b'\x02\xa6=\x1b',
+  'id': b'\xb4\x01\xc5\x08cG]\xb4D\x0c\x85\xc1\n\xc0\xb6B;aUM',
+  'message': b'updating poms for 1.0 release',
+  'metadata': None,
+  'parents': [b'\x9cPQ9~\\.\x0c%\x8b\xb69\xc3\xdd4@e\x84\xca\x10'],
+  'synthetic': False,
+  'type': 'git'},
+ {'author': {'email': b'janickia',
+             'fullname': b'Adam Janicki <janickia>',
+             'id': 8040906,
+             'name': b'Adam Janicki'},
+  'committer': {'email': b'janickia',
+                'fullname': b'Adam Janicki <janickia>',
+                'id': 8040906,
+                'name': b'Adam Janicki'},
+  'committer_date': {'negative_utc': None,
+                     'offset': 120,
+                     'timestamp': {'microseconds': 0, 'seconds': 1380883668}},
+  'date': {'negative_utc': None,
+           'offset': 120,
+           'timestamp': {'microseconds': 0, 'seconds': 1380883668}},
+  'directory': b'\n\x857\x94r\xbe\xcc\x04=\xe9}\xe5\xfd\xdf?nR\xe6\xa7\x9e',
+  'id': b'\x9cPQ9~\\.\x0c%\x8b\xb69\xc3\xdd4@e\x84\xca\x10',
+  'message': b'For 1.0 release.\n',
+  'metadata': None,
+  'parents': [b'\x83mI\x83\x96\xfb\x9b]E\xc8\x96\x88_\x84\xd8\xd6\nVQ\xdc'],
+  'synthetic': False,
+  'type': 'git'},
+ {'author': {'email': b'janickia',
+             'fullname': b'Adam Janicki <janickia>',
+             'id': 8040906,
+             'name': b'Adam Janicki'},
+  'committer': {'email': b'janickia',
+                'fullname': b'Adam Janicki <janickia>',
+                'id': 8040906,
+                'name': b'Adam Janicki'},
+  'committer_date': {'negative_utc': None,
+                     'offset': 120,
+                     'timestamp': {'microseconds': 0, 'seconds': 1380881296}},
+  'date': {'negative_utc': None,
+           'offset': 120,
+           'timestamp': {'microseconds': 0, 'seconds': 1380881296}},
+  'directory': b'.\xf9\xa5\xcb\xb0\xd3\xdc\x9b{\xb8\x81\x03l\xe2P\x16c\x0b|\xe6',  # noqa
+  'id': b'\x83mI\x83\x96\xfb\x9b]E\xc8\x96\x88_\x84\xd8\xd6\nVQ\xdc',
+  'message': b'Add ignores\n',
+  'metadata': None,
+  'parents': [b'\xee\x96\xc2\xa2\xd3\x97\xb7\x90p\xd2\xb6\xfe0Q)\tct\x83X'],
+  'synthetic': False,
+  'type': 'git'},
+ {'author': {'email': b'adam.janicki@roche.com',
+             'fullname': b'Adam <adam.janicki@roche.com>',
+             'id': 8040905,
+             'name': b'Adam'},
+  'committer': {'email': b'adam.janicki@roche.com',
+                'fullname': b'Adam <adam.janicki@roche.com>',
+                'id': 8040905,
+                'name': b'Adam'},
+  'committer_date': {'negative_utc': None,
+                     'offset': 60,
+                     'timestamp': {'microseconds': 0, 'seconds': 1380880096}},
+  'date': {'negative_utc': None,
+           'offset': 60,
+           'timestamp': {'microseconds': 0, 'seconds': 1380880096}},
+  'directory': b'\xc7r\xc4\x9f\xc0$\xd4\xab\xff\xcb]\xf6<\xcb\x8b~\xec\xc4\xd1)',  # noqa
+  'id': b'\xee\x96\xc2\xa2\xd3\x97\xb7\x90p\xd2\xb6\xfe0Q)\tct\x83X',
+  'message': b'Reset author\n',
+  'metadata': None,
+  'parents': [b'\x8f\x89\xdd\xa8\xe0r8<\xf5\rBS*\xe8\xf5*\xd8\x9f\x8f\xdf'],
+  'synthetic': False,
+  'type': 'git'},
+ {'author': {'email': b'adam.janicki@roche.com',
+             'fullname': b'Adam <adam.janicki@roche.com>',
+             'id': 8040905,
+             'name': b'Adam'},
+  'committer': {'email': b'adam.janicki@roche.com',
+                'fullname': b'Adam <adam.janicki@roche.com>',
+                'id': 8040905,
+                'name': b'Adam'},
+  'committer_date': {'negative_utc': None,
+                     'offset': -420,
+                     'timestamp': {'microseconds': 0, 'seconds': 1380878419}},
+  'date': {'negative_utc': None,
+           'offset': -420,
+           'timestamp': {'microseconds': 0, 'seconds': 1380878419}},
+  'directory': b'WS\xbaX\xd6x{q\x8f\x020i\xc5\x95\xa01\xf7y\xb2\x80',
+  'id': b'\x8f\x89\xdd\xa8\xe0r8<\xf5\rBS*\xe8\xf5*\xd8\x9f\x8f\xdf',
+  'message': b'Initial commit\n',
+  'metadata': None,
+  'parents': [],
+  'synthetic': False,
+  'type': 'git'}]
+
+
+_rev_start = 'b364f53155044e5308a0f73abb3b5f01995a5b7d'
+
+
+class RevisionsWalkerTest(unittest.TestCase):
+
+    @patch('swh.storage.storage.Storage')
+    def check_revisions_ordering(self, rev_walker_type, expected_result,
+                                 MockStorage):
+        storage = MockStorage()
+        storage.revision_log.return_value = _revisions_list
+
+        revs_walker = \
+            get_revisions_walker(rev_walker_type, storage,
+                                 hash_to_bytes(_rev_start))
+
+        self.assertEqual(list(map(hash_to_bytes, expected_result)),
+                         [rev['id'] for rev in revs_walker])
+
+    def test_revisions_walker_committer_date(self):
+
+        # revisions should be returned in reverse chronological order
+        # of their committer date
+        expected_result = ['b364f53155044e5308a0f73abb3b5f01995a5b7d',
+                           'b94886c500c46e32dc3d7ebae8a5409accd592e5',
+                           '0cb6b4611d65bee0f57821dac7f611e2f8a02433',
+                           '2b0240c6d682bad51532eec15b8a7ed6b75c8d31',
+                           'b401c50863475db4440c85c10ac0b6423b61554d',
+                           '9c5051397e5c2e0c258bb639c3dd34406584ca10',
+                           '836d498396fb9b5d45c896885f84d8d60a5651dc',
+                           'ee96c2a2d397b79070d2b6fe3051290963748358',
+                           '8f89dda8e072383cf50d42532ae8f52ad89f8fdf']
+
+        self.check_revisions_ordering('committer_date', expected_result)
+
+    def test_revisions_walker_dfs(self):
+
+        # revisions should be returned in the same order they are
+        # visited when performing a depth-first search in pre-order
+        # on the revisions DAG
+        expected_result = ['b364f53155044e5308a0f73abb3b5f01995a5b7d',
+                           '836d498396fb9b5d45c896885f84d8d60a5651dc',
+                           'ee96c2a2d397b79070d2b6fe3051290963748358',
+                           '8f89dda8e072383cf50d42532ae8f52ad89f8fdf',
+                           'b94886c500c46e32dc3d7ebae8a5409accd592e5',
+                           '0cb6b4611d65bee0f57821dac7f611e2f8a02433',
+                           '2b0240c6d682bad51532eec15b8a7ed6b75c8d31',
+                           'b401c50863475db4440c85c10ac0b6423b61554d',
+                           '9c5051397e5c2e0c258bb639c3dd34406584ca10']
+
+        self.check_revisions_ordering('dfs', expected_result)
+
+    def test_revisions_walker_dfs_post(self):
+
+        # revisions should be returned in the same order they are
+        # visited when performing a depth-first search in post-order
+        # on the revisions DAG
+        expected_result = ['b364f53155044e5308a0f73abb3b5f01995a5b7d',
+                           'b94886c500c46e32dc3d7ebae8a5409accd592e5',
+                           '0cb6b4611d65bee0f57821dac7f611e2f8a02433',
+                           '2b0240c6d682bad51532eec15b8a7ed6b75c8d31',
+                           'b401c50863475db4440c85c10ac0b6423b61554d',
+                           '9c5051397e5c2e0c258bb639c3dd34406584ca10',
+                           '836d498396fb9b5d45c896885f84d8d60a5651dc',
+                           'ee96c2a2d397b79070d2b6fe3051290963748358',
+                           '8f89dda8e072383cf50d42532ae8f52ad89f8fdf']
+
+        self.check_revisions_ordering('dfs_post', expected_result)
+
+    def test_revisions_walker_bfs(self):
+
+        # revisions should be returned in the same order they are
+        # visited when performing a breadth-first search on the
+        # revisions DAG
+        expected_result = ['b364f53155044e5308a0f73abb3b5f01995a5b7d',
+                           '836d498396fb9b5d45c896885f84d8d60a5651dc',
+                           'b94886c500c46e32dc3d7ebae8a5409accd592e5',
+                           'ee96c2a2d397b79070d2b6fe3051290963748358',
+                           '0cb6b4611d65bee0f57821dac7f611e2f8a02433',
+                           '8f89dda8e072383cf50d42532ae8f52ad89f8fdf',
+                           '2b0240c6d682bad51532eec15b8a7ed6b75c8d31',
+                           'b401c50863475db4440c85c10ac0b6423b61554d',
+                           '9c5051397e5c2e0c258bb639c3dd34406584ca10']
+
+        self.check_revisions_ordering('bfs', expected_result)
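+
+    # A possible follow-up (illustrative sketch, not part of this
+    # change): exercising pagination through export_state() with the
+    # same mocked storage, e.g.:
+    #
+    #     walker = get_revisions_walker('committer_date', storage,
+    #                                   hash_to_bytes(_rev_start),
+    #                                   max_revs=4)
+    #     first_page = [rev['id'] for rev in walker]
+    #     state = walker.export_state()
+    #     walker = get_revisions_walker('committer_date', storage,
+    #                                   hash_to_bytes(_rev_start),
+    #                                   max_revs=9, state=state)
+    #     second_page = [rev['id'] for rev in walker]
+    #     # first_page + second_page gives the full committer_date order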