Skip to content
Snippets Groups Projects
Commit 9f673806 authored by Antoine Lambert's avatar Antoine Lambert
Browse files

cookers.revision_*: Fix storage timeouts when fetching a large revision log

Similar to directory cooking (T1177), fetching a revision log must be performed
client-side in a paginated way to avoid storage timeouts and cooking errors.

Closes T1934
parent 6188be58
No related branches found
Tags v0.0.30
1 merge request!169Fix revision cooking errors with the vault for large revision log
# Copyright (C) 2016-2017 The Software Heritage developers # Copyright (C) 2016-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution # See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version # License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information # See top-level LICENSE file for more information
...@@ -9,6 +9,7 @@ from pathlib import Path ...@@ -9,6 +9,7 @@ from pathlib import Path
from swh.model import hashutil from swh.model import hashutil
from swh.vault.cookers.base import BaseVaultCooker from swh.vault.cookers.base import BaseVaultCooker
from swh.vault.cookers.utils import revision_log
from swh.vault.to_disk import DirectoryBuilder from swh.vault.to_disk import DirectoryBuilder
...@@ -22,7 +23,7 @@ class RevisionFlatCooker(BaseVaultCooker): ...@@ -22,7 +23,7 @@ class RevisionFlatCooker(BaseVaultCooker):
def prepare_bundle(self): def prepare_bundle(self):
with tempfile.TemporaryDirectory(prefix='tmp-vault-revision-') as td: with tempfile.TemporaryDirectory(prefix='tmp-vault-revision-') as td:
root = Path(td) root = Path(td)
for revision in self.storage.revision_log([self.obj_id]): for revision in revision_log(self.storage, self.obj_id):
revdir = root / hashutil.hash_to_hex(revision['id']) revdir = root / hashutil.hash_to_hex(revision['id'])
revdir.mkdir() revdir.mkdir()
directory_builder = DirectoryBuilder( directory_builder = DirectoryBuilder(
......
# Copyright (C) 2017 The Software Heritage developers # Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution # See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version # License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information # See top-level LICENSE file for more information
...@@ -15,6 +15,7 @@ from swh.model import hashutil ...@@ -15,6 +15,7 @@ from swh.model import hashutil
from swh.model.toposort import toposort from swh.model.toposort import toposort
from swh.model.from_disk import mode_to_perms from swh.model.from_disk import mode_to_perms
from swh.vault.cookers.base import BaseVaultCooker from swh.vault.cookers.base import BaseVaultCooker
from swh.vault.cookers.utils import revision_log
from swh.vault.to_disk import get_filtered_files_content from swh.vault.to_disk import get_filtered_files_content
...@@ -26,7 +27,7 @@ class RevisionGitfastCooker(BaseVaultCooker): ...@@ -26,7 +27,7 @@ class RevisionGitfastCooker(BaseVaultCooker):
return not list(self.storage.revision_missing([self.obj_id])) return not list(self.storage.revision_missing([self.obj_id]))
def prepare_bundle(self): def prepare_bundle(self):
self.log = list(toposort(self.storage.revision_log([self.obj_id]))) self.log = list(toposort(revision_log(self.storage, self.obj_id)))
self.gzobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16) self.gzobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
self.fastexport() self.fastexport()
self.write(self.gzobj.flush()) self.write(self.gzobj.flush())
......
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.storage.algos.revisions_walker import get_revisions_walker
def revision_log(storage, rev_id, per_page=1000):
    """Fetch the full revision log rooted at ``rev_id``, page by page.

    Walking the whole log in a single storage call can time out on large
    histories, so the walk is split into client-side pages of at most
    ``per_page`` revisions each, resuming from the walker's saved state.

    Args:
        storage (swh.storage.storage.Storage): instance of swh storage
            (either local or remote)
        rev_id (bytes): a revision identifier
        per_page (Optional[int]): the maximum number of revisions to return
            in each page

    Yields:
        dict: Revision information as a dictionary
    """
    walker_state = {}
    fetched = 0
    limit = per_page
    while True:
        # Build a BFS walker over the commit graph starting at rev_id,
        # capped at `limit` total visited revisions, resuming from the
        # state saved at the end of the previous page (empty on first pass).
        walker = get_revisions_walker('bfs', storage, rev_id,
                                      max_revs=limit,
                                      state=walker_state)
        # Stream at most one page worth of revisions to the caller.
        for revision in walker:
            fetched += 1
            yield revision
        # Fewer revisions than the cap means the walk reached the root of
        # the history: there is nothing left to fetch.
        if fetched < limit:
            return
        # Persist the walker's position so the next page continues exactly
        # where this one stopped, and raise the overall cap by one page.
        walker_state = walker.export_state()
        limit += per_page
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment