Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/algos/revisions_walker.py
# Copyright (C) 2018 The Software Heritage developers | # Copyright (C) 2018-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import heapq | import heapq | ||||
from abc import ABCMeta, abstractmethod | from abc import ABCMeta, abstractmethod | ||||
from collections import deque | from collections import deque | ||||
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | class RevisionsWalker(metaclass=_RevisionsWalkerMetaClass): | ||||
def __init__(self, storage, rev_start, max_revs=None, state=None): | def __init__(self, storage, rev_start, max_revs=None, state=None): | ||||
self._revs_to_visit = [] | self._revs_to_visit = [] | ||||
self._done = set() | self._done = set() | ||||
self._revs = {} | self._revs = {} | ||||
self._last_rev = None | self._last_rev = None | ||||
self._num_revs = 0 | self._num_revs = 0 | ||||
self._max_revs = max_revs | self._max_revs = max_revs | ||||
self._missing_revs = set() | |||||
if state: | if state: | ||||
self._revs_to_visit = state['revs_to_visit'] | self._revs_to_visit = state['revs_to_visit'] | ||||
self._done = state['done'] | self._done = state['done'] | ||||
self._last_rev = state['last_rev'] | self._last_rev = state['last_rev'] | ||||
self._num_revs = state['num_revs'] | self._num_revs = state['num_revs'] | ||||
self._missing_revs = state['missing_revs'] | |||||
self.storage = storage | self.storage = storage | ||||
self.process_rev(rev_start) | self.process_rev(rev_start) | ||||
@abstractmethod | @abstractmethod | ||||
def process_rev(self, rev_id): | def process_rev(self, rev_id): | ||||
""" | """ | ||||
Abstract method whose purpose is to process a newly visited | Abstract method whose purpose is to process a newly visited | ||||
revision during the walk. | revision during the walk. | ||||
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines | def _get_rev(self, rev_id): | ||||
# requests to storage and thus speedup the revisions walk | # requests to storage and thus speedup the revisions walk | ||||
for rev in self.storage.revision_log([rev_id], limit=100): | for rev in self.storage.revision_log([rev_id], limit=100): | ||||
# revision data is missing, returned history will be truncated | # revision data is missing, returned history will be truncated | ||||
if rev is None: | if rev is None: | ||||
continue | continue | ||||
self._revs[rev['id']] = rev | self._revs[rev['id']] = rev | ||||
return self._revs.get(rev_id) | return self._revs.get(rev_id) | ||||
def missing_revisions(self): | |||||
""" | |||||
Return a set of revision identifiers whose associated data were | |||||
found missing into the archive content while walking on the | |||||
revisions graph. | |||||
Returns: | |||||
Set[bytes]: a set of revision identifiers | |||||
""" | |||||
return self._missing_revs | |||||
def is_history_truncated(self): | |||||
""" | |||||
Return if the revision history generated so far has been truncated | |||||
of not. A revision history might end up truncated if some revision | |||||
data were found missing into the archive content. | |||||
Returns: | |||||
bool: Whether the history got truncated or not | |||||
""" | |||||
return len(self.missing_revisions()) > 0 | |||||
def export_state(self): | def export_state(self): | ||||
""" | """ | ||||
Export the internal state of that revision walker to a dict. | Export the internal state of that revision walker to a dict. | ||||
Its purpose is to continue the iteration in a pagination context. | Its purpose is to continue the iteration in a pagination context. | ||||
Returns: | Returns: | ||||
dict: A dict containing the internal state of that revisions walker | dict: A dict containing the internal state of that revisions walker | ||||
""" | """ | ||||
return { | return { | ||||
'revs_to_visit': self._revs_to_visit, | 'revs_to_visit': self._revs_to_visit, | ||||
'done': self._done, | 'done': self._done, | ||||
'last_rev': self._last_rev, | 'last_rev': self._last_rev, | ||||
'num_revs': self._num_revs | 'num_revs': self._num_revs, | ||||
'missing_revs': self._missing_revs | |||||
} | } | ||||
def __next__(self): | def __next__(self): | ||||
if self.is_finished(): | if self.is_finished(): | ||||
raise StopIteration | raise StopIteration | ||||
while self._revs_to_visit: | while self._revs_to_visit: | ||||
rev_id = self.get_next_rev_id() | rev_id = self.get_next_rev_id() | ||||
if rev_id in self._done: | if rev_id in self._done: | ||||
continue | continue | ||||
self._done.add(rev_id) | self._done.add(rev_id) | ||||
rev = self._get_rev(rev_id) | rev = self._get_rev(rev_id) | ||||
# revision data is missing, returned history will be truncated | # revision data is missing, returned history will be truncated | ||||
if rev is None: | if rev is None: | ||||
self._missing_revs.add(rev_id) | |||||
continue | continue | ||||
self.process_parent_revs(rev) | self.process_parent_revs(rev) | ||||
if self.should_return(rev): | if self.should_return(rev): | ||||
self._num_revs += 1 | self._num_revs += 1 | ||||
self._last_rev = rev | self._last_rev = rev | ||||
return rev | return rev | ||||
raise StopIteration | raise StopIteration | ||||
Show All 16 Lines | def process_rev(self, rev_id): | ||||
Args: | Args: | ||||
rev_id (bytes): the newly visited revision identifier | rev_id (bytes): the newly visited revision identifier | ||||
""" | """ | ||||
if rev_id not in self._done: | if rev_id not in self._done: | ||||
rev = self._get_rev(rev_id) | rev = self._get_rev(rev_id) | ||||
if rev is not None: | if rev is not None: | ||||
commit_time = rev['committer_date']['timestamp']['seconds'] | commit_time = rev['committer_date']['timestamp']['seconds'] | ||||
heapq.heappush(self._revs_to_visit, (-commit_time, rev_id)) | heapq.heappush(self._revs_to_visit, (-commit_time, rev_id)) | ||||
else: | |||||
self._missing_revs.add(rev_id) | |||||
def get_next_rev_id(self): | def get_next_rev_id(self): | ||||
""" | """ | ||||
Return the smallest revision from the priority queue, i.e. | Return the smallest revision from the priority queue, i.e. | ||||
the one with highest committer date. | the one with highest committer date. | ||||
Returns: | Returns: | ||||
dict: A dict describing a revision as returned by | dict: A dict describing a revision as returned by | ||||
▲ Show 20 Lines • Show All 300 Lines • Show Last 20 Lines |