Skip to content
Snippets Groups Projects
Commit 9f673806 authored by Antoine Lambert's avatar Antoine Lambert
Browse files

cookers.revision_*: Fix storage timeouts when fetching a large revision log

Similar to directory cooking (T1177), fetching a revision log must be performed
client-side in a paginated way to avoid storage timeouts and cooking errors.

Closes T1934
parent 6188be58
No related branches found
Tags v0.0.30
1 merge request!169Fix revision cooking errors with the vault for large revision log
# Copyright (C) 2016-2017 The Software Heritage developers # Copyright (C) 2016-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution # See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version # License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information # See top-level LICENSE file for more information
...@@ -9,6 +9,7 @@ from pathlib import Path ...@@ -9,6 +9,7 @@ from pathlib import Path
from swh.model import hashutil from swh.model import hashutil
from swh.vault.cookers.base import BaseVaultCooker from swh.vault.cookers.base import BaseVaultCooker
from swh.vault.cookers.utils import revision_log
from swh.vault.to_disk import DirectoryBuilder from swh.vault.to_disk import DirectoryBuilder
...@@ -22,7 +23,7 @@ class RevisionFlatCooker(BaseVaultCooker): ...@@ -22,7 +23,7 @@ class RevisionFlatCooker(BaseVaultCooker):
def prepare_bundle(self): def prepare_bundle(self):
with tempfile.TemporaryDirectory(prefix='tmp-vault-revision-') as td: with tempfile.TemporaryDirectory(prefix='tmp-vault-revision-') as td:
root = Path(td) root = Path(td)
for revision in self.storage.revision_log([self.obj_id]): for revision in revision_log(self.storage, self.obj_id):
revdir = root / hashutil.hash_to_hex(revision['id']) revdir = root / hashutil.hash_to_hex(revision['id'])
revdir.mkdir() revdir.mkdir()
directory_builder = DirectoryBuilder( directory_builder = DirectoryBuilder(
......
# Copyright (C) 2017 The Software Heritage developers # Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution # See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version # License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information # See top-level LICENSE file for more information
...@@ -15,6 +15,7 @@ from swh.model import hashutil ...@@ -15,6 +15,7 @@ from swh.model import hashutil
from swh.model.toposort import toposort from swh.model.toposort import toposort
from swh.model.from_disk import mode_to_perms from swh.model.from_disk import mode_to_perms
from swh.vault.cookers.base import BaseVaultCooker from swh.vault.cookers.base import BaseVaultCooker
from swh.vault.cookers.utils import revision_log
from swh.vault.to_disk import get_filtered_files_content from swh.vault.to_disk import get_filtered_files_content
...@@ -26,7 +27,7 @@ class RevisionGitfastCooker(BaseVaultCooker): ...@@ -26,7 +27,7 @@ class RevisionGitfastCooker(BaseVaultCooker):
return not list(self.storage.revision_missing([self.obj_id])) return not list(self.storage.revision_missing([self.obj_id]))
def prepare_bundle(self): def prepare_bundle(self):
self.log = list(toposort(self.storage.revision_log([self.obj_id]))) self.log = list(toposort(revision_log(self.storage, self.obj_id)))
self.gzobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16) self.gzobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
self.fastexport() self.fastexport()
self.write(self.gzobj.flush()) self.write(self.gzobj.flush())
......
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.storage.algos.revisions_walker import get_revisions_walker
def revision_log(storage, rev_id, per_page=1000):
    """Fetch the full revision log rooted at ``rev_id``, page by page.

    Walking the whole log in a single storage call can time out on large
    histories, so the walk is split into client-side pages of at most
    ``per_page`` revisions each, resuming from the walker's saved state.

    Args:
        storage (swh.storage.storage.Storage): instance of swh storage
            (either local or remote)
        rev_id (bytes): a revision identifier
        per_page (Optional[int]): the maximum number of revisions to return
            in each page

    Yields:
        dict: Revision information as a dictionary
    """
    walker_state = {}
    fetched = 0
    limit = per_page
    while True:
        # Build a BFS walker over the commit graph starting at rev_id,
        # capped at `limit` total visited revisions, resuming from the
        # state saved at the end of the previous page (empty on first pass).
        walker = get_revisions_walker('bfs', storage, rev_id,
                                      max_revs=limit,
                                      state=walker_state)
        # Stream at most one page worth of revisions to the caller.
        for revision in walker:
            fetched += 1
            yield revision
        # Fewer revisions than the cap means the walk reached the root of
        # the history: there is nothing left to fetch.
        if fetched < limit:
            return
        # Persist the walker's position so the next page continues exactly
        # where this one stopped, and raise the overall cap by one page.
        walker_state = walker.export_state()
        limit += per_page
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment