diff --git a/swh/vault/cookers/revision_flat.py b/swh/vault/cookers/revision_flat.py
--- a/swh/vault/cookers/revision_flat.py
+++ b/swh/vault/cookers/revision_flat.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2017 The Software Heritage developers
+# Copyright (C) 2016-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -9,6 +9,7 @@
 from swh.model import hashutil
 
 from swh.vault.cookers.base import BaseVaultCooker
+from swh.vault.cookers.utils import revision_log
 from swh.vault.to_disk import DirectoryBuilder
 
 
@@ -22,7 +23,7 @@
     def prepare_bundle(self):
         with tempfile.TemporaryDirectory(prefix='tmp-vault-revision-') as td:
             root = Path(td)
-            for revision in self.storage.revision_log([self.obj_id]):
+            for revision in revision_log(self.storage, self.obj_id):
                 revdir = root / hashutil.hash_to_hex(revision['id'])
                 revdir.mkdir()
                 directory_builder = DirectoryBuilder(
diff --git a/swh/vault/cookers/revision_gitfast.py b/swh/vault/cookers/revision_gitfast.py
--- a/swh/vault/cookers/revision_gitfast.py
+++ b/swh/vault/cookers/revision_gitfast.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -15,6 +15,7 @@
 from swh.model.toposort import toposort
 from swh.model.from_disk import mode_to_perms
 from swh.vault.cookers.base import BaseVaultCooker
+from swh.vault.cookers.utils import revision_log
 from swh.vault.to_disk import get_filtered_files_content
 
 
@@ -26,7 +27,7 @@
         return not list(self.storage.revision_missing([self.obj_id]))
 
     def prepare_bundle(self):
-        self.log = list(toposort(self.storage.revision_log([self.obj_id])))
+        self.log = list(toposort(revision_log(self.storage, self.obj_id)))
         self.gzobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
         self.fastexport()
         self.write(self.gzobj.flush())
diff --git a/swh/vault/cookers/utils.py b/swh/vault/cookers/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/vault/cookers/utils.py
@@ -0,0 +1,54 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.storage.algos.revisions_walker import get_revisions_walker
+
+
+def revision_log(storage, rev_id, per_page=1000):
+    """Retrieve a revision log in a paginated way in order to avoid storage
+    timeouts when the total number of revisions to fetch is large.
+
+    Args:
+        storage (swh.storage.storage.Storage): instance of swh storage
+            (either local or remote)
+        rev_id (bytes): a revision identifier
+        per_page (int): the maximum number of revisions to fetch in each
+            page, must be strictly positive
+
+    Yields:
+        dict: Revision information as a dictionary
+
+    Raises:
+        ValueError: if per_page is lower than 1 (raised on first iteration)
+    """
+    # With per_page <= 0, max_revs would never exceed nb_revs and the loop
+    # below could not terminate, so reject such values upfront.
+    if per_page < 1:
+        raise ValueError('per_page must be a strictly positive integer')
+    rw_state = {}
+    nb_revs = 0
+    max_revs = per_page
+    while True:
+        # Get an iterator returning the commits log from rev_id.
+        # At most max_revs visited revisions from rev_id in the commits graph
+        # will be returned.
+        revs_walker = get_revisions_walker('bfs', storage, rev_id,
+                                           max_revs=max_revs,
+                                           state=rw_state)
+        # Iterate on at most per_page revisions in the commits log.
+        for rev in revs_walker:
+            nb_revs += 1
+            yield rev
+        # If the total number of iterated revisions is less than the
+        # maximum requested one, it means that we hit the initial revision
+        # in the log.
+        if nb_revs < max_revs:
+            break
+        # Backup iterator state to continue the revisions iteration
+        # from where we left it.
+        rw_state = revs_walker.export_state()
+        # Increment the maximum of revisions to iterate from rev_id
+        # to get next revisions in the log.
+        max_revs += per_page