diff --git a/swh/storage/algos/revisions_walker.py b/swh/storage/algos/revisions_walker.py --- a/swh/storage/algos/revisions_walker.py +++ b/swh/storage/algos/revisions_walker.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -63,11 +63,13 @@ self._last_rev = None self._num_revs = 0 self._max_revs = max_revs + self._missing_revs = set() if state: self._revs_to_visit = state['revs_to_visit'] self._done = state['done'] self._last_rev = state['last_rev'] self._num_revs = state['num_revs'] + self._missing_revs = state['missing_revs'] self.storage = storage self.process_rev(rev_start) @@ -152,6 +154,28 @@ self._revs[rev['id']] = rev return self._revs.get(rev_id) + def missing_revisions(self): + """ + Return a set of revision identifiers whose associated data were + found missing from the archive content while walking on the + revisions graph. + + Returns: + Set[bytes]: a set of revision identifiers + """ + return self._missing_revs + + def is_history_truncated(self): + """ + Return if the revision history generated so far has been truncated + or not. A revision history might end up truncated if some revision + data were found missing from the archive content. + + Returns: + bool: Whether the history got truncated or not + """ + return len(self.missing_revisions()) > 0 + def export_state(self): """ Export the internal state of that revision walker to a dict. 
@@ -164,7 +188,8 @@ 'revs_to_visit': self._revs_to_visit, 'done': self._done, 'last_rev': self._last_rev, - 'num_revs': self._num_revs + 'num_revs': self._num_revs, + 'missing_revs': self._missing_revs } def __next__(self): @@ -178,6 +203,7 @@ rev = self._get_rev(rev_id) # revision data is missing, returned history will be truncated if rev is None: + self._missing_revs.add(rev_id) continue self.process_parent_revs(rev) if self.should_return(rev): @@ -210,6 +236,8 @@ if rev is not None: commit_time = rev['committer_date']['timestamp']['seconds'] heapq.heappush(self._revs_to_visit, (-commit_time, rev_id)) + else: + self._missing_revs.add(rev_id) def get_next_rev_id(self): """ diff --git a/swh/storage/tests/algos/test_revisions_walker.py b/swh/storage/tests/algos/test_revisions_walker.py --- a/swh/storage/tests/algos/test_revisions_walker.py +++ b/swh/storage/tests/algos/test_revisions_walker.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,7 +7,7 @@ from unittest.mock import patch -from swh.model.hashutil import hash_to_bytes +from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.storage.algos.revisions_walker import get_revisions_walker # For those tests, we will walk the following revisions history @@ -283,21 +283,39 @@ _rev_start = 'b364f53155044e5308a0f73abb3b5f01995a5b7d' +_rev_missing = '836d498396fb9b5d45c896885f84d8d60a5651dc' + class RevisionsWalkerTest(unittest.TestCase): - @patch('swh.storage.storage.Storage') def check_revisions_ordering(self, rev_walker_type, expected_result, - MockStorage): - storage = MockStorage() - storage.revision_log.return_value = _revisions_list + truncated_history): + with patch('swh.storage.storage.Storage') as MockStorage: + storage = 
MockStorage() + if not truncated_history: + storage.revision_log.return_value = _revisions_list + else: + revs_lists_truncated = [ + None if hash_to_hex(rev['id']) == _rev_missing else rev + for rev in _revisions_list + ] + + storage.revision_log.return_value = revs_lists_truncated + + revs_walker = get_revisions_walker(rev_walker_type, storage, + hash_to_bytes(_rev_start)) + + self.assertEqual(list(map(hash_to_bytes, expected_result)), + [rev['id'] for rev in revs_walker]) - revs_walker = \ - get_revisions_walker(rev_walker_type, storage, - hash_to_bytes(_rev_start)) + self.assertEqual(revs_walker.is_history_truncated(), + truncated_history) - self.assertEqual(list(map(hash_to_bytes, expected_result)), - [rev['id'] for rev in revs_walker]) + if truncated_history: + missing_revs = revs_walker.missing_revisions() + self.assertEqual(missing_revs, {hash_to_bytes(_rev_missing)}) + else: + self.assertEqual(revs_walker.missing_revisions(), set()) def test_revisions_walker_committer_date(self): @@ -313,7 +331,8 @@ 'ee96c2a2d397b79070d2b6fe3051290963748358', '8f89dda8e072383cf50d42532ae8f52ad89f8fdf'] - self.check_revisions_ordering('committer_date', expected_result) + self.check_revisions_ordering('committer_date', expected_result, + truncated_history=False) def test_revisions_walker_dfs(self): @@ -330,7 +349,8 @@ 'b401c50863475db4440c85c10ac0b6423b61554d', '9c5051397e5c2e0c258bb639c3dd34406584ca10'] - self.check_revisions_ordering('dfs', expected_result) + self.check_revisions_ordering('dfs', expected_result, + truncated_history=False) def test_revisions_walker_dfs_post(self): @@ -347,7 +367,8 @@ 'ee96c2a2d397b79070d2b6fe3051290963748358', '8f89dda8e072383cf50d42532ae8f52ad89f8fdf'] - self.check_revisions_ordering('dfs_post', expected_result) + self.check_revisions_ordering('dfs_post', expected_result, + truncated_history=False) def test_revisions_walker_bfs(self): @@ -364,4 +385,19 @@ 'b401c50863475db4440c85c10ac0b6423b61554d', 
'9c5051397e5c2e0c258bb639c3dd34406584ca10'] - self.check_revisions_ordering('bfs', expected_result) + self.check_revisions_ordering('bfs', expected_result, + truncated_history=False) + + def test_revisions_walker_truncated_history(self): + + expected_result = ['b364f53155044e5308a0f73abb3b5f01995a5b7d', + 'b94886c500c46e32dc3d7ebae8a5409accd592e5', + '0cb6b4611d65bee0f57821dac7f611e2f8a02433', + '2b0240c6d682bad51532eec15b8a7ed6b75c8d31', + 'b401c50863475db4440c85c10ac0b6423b61554d', + '9c5051397e5c2e0c258bb639c3dd34406584ca10'] + + for revs_walker_type in ('committer_date', 'bfs', 'dfs', 'dfs_post'): + + self.check_revisions_ordering(revs_walker_type, expected_result, + truncated_history=True)