Skip to content
Snippets Groups Projects
Commit 4bf91cff authored by Antoine Cezar's avatar Antoine Cezar
Browse files

Add tree diffing in HgLoaderFromDisk

By looking at the differences between revisions, the repository tree is
updated incrementally rather than fully rebuilt for each one.

Observed load time improvement on https://www.mercurial-scm.org/repo/hg/
1:11:02 -> 47:58
parent d3885c7f
No related branches found
No related tags found
No related merge requests found
......@@ -8,7 +8,7 @@ from collections import deque
from datetime import datetime, timezone
from shutil import rmtree
from tempfile import mkdtemp
from typing import Any, Deque, Dict, Optional, Tuple, Union
from typing import Any, Deque, Dict, Optional, Tuple, TypeVar, Union
import dateutil
......@@ -49,6 +49,9 @@ DEFAULT_CONFIG: Dict[str, Any] = {
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk"
T = TypeVar("T")
def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]:
"""Convert visit date from Optional[Union[str, datetime]] to Optional[datetime].
......@@ -71,14 +74,18 @@ def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[dat
class HgDirectory(Directory):
"""A directory that creates parent directories if missing."""
"""A more practical directory.
- creates missing parent directories
- removes empty directories
"""
def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None:
if b"/" in path:
head, tail = path.split(b"/", 1)
directory = self.get(head)
if directory is None:
if directory is None or isinstance(directory, Content):
directory = HgDirectory()
self[head] = directory
......@@ -86,6 +93,25 @@ class HgDirectory(Directory):
else:
super().__setitem__(path, value)
def __delitem__(self, path: bytes) -> None:
    """Delete the entry at ``path``, then prune parents that became empty.

    After the entry itself is removed, each ancestor directory is checked
    from the deepest one upwards; ancestors are removed as long as they
    contain nothing, and the walk stops at the first non-empty one.
    """
    super().__delitem__(path)

    # rpartition yields an empty separator once no b"/" is left, which
    # ends the walk at the top-level component.
    parent, separator, _ = path.rpartition(b"/")
    while separator:
        if len(self[parent]) != 0:
            break
        super().__delitem__(parent)
        parent, separator, _ = parent.rpartition(b"/")
def get(
    self, path: bytes, default: Optional[T] = None
) -> Optional[Union[Content, "HgDirectory", T]]:
    """Return the entry stored at ``path``, or ``default`` if the lookup fails.

    Args:
        path: path of the entry to look up.
        default: value returned when indexing with ``path`` raises
            ``KeyError``.

    Returns:
        The entry found at ``path``, otherwise ``default``.
    """
    # TODO move to swh.model.from_disk.Directory
    try:
        entry = self[path]
    except KeyError:
        return default
    return entry
class HgLoaderFromDisk(BaseLoader):
"""Load a mercurial repository from a local repository."""
......@@ -125,6 +151,15 @@ class HgLoaderFromDisk(BaseLoader):
self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {}
self._repo_directory: Optional[str] = None
# keeps the last processed hg nodeid
# it is used for differential tree update by store_directories
# NULLID is the parent of the first revision
self._last_hg_nodeid = hgutil.NULLID
# keeps the last revision tree
# it is used for differential tree update by store_directories
self._last_root = HgDirectory()
# Cache the content hash across revisions to avoid recalculation.
self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict(
self.config["content_cache_size"],
......@@ -409,12 +444,26 @@ class HgLoaderFromDisk(BaseLoader):
Returns:
the swhid of the top level directory.
"""
root = HgDirectory()
for file_path in rev_ctx.manifest():
repo: hgutil.Repository = self._repo # mypy can't infer that repo is not None
prev_ctx = repo[self._last_hg_nodeid]
# TODO maybe do diff on parents
status = prev_ctx.status(rev_ctx)
for file_path in status.removed:
del self._last_root[file_path]
for file_path in status.added:
content = self.store_content(rev_ctx, file_path)
root[file_path] = content
self._last_root[file_path] = content
for file_path in status.modified:
content = self.store_content(rev_ctx, file_path)
self._last_root[file_path] = content
self._last_hg_nodeid = rev_ctx.node()
directories: Deque[Directory] = deque([root])
directories: Deque[Directory] = deque([self._last_root])
while directories:
directory = directories.pop()
self.storage.directory_add([directory.to_model()])
......@@ -422,7 +471,7 @@ class HgLoaderFromDisk(BaseLoader):
[item for item in directory.values() if isinstance(item, Directory)]
)
return root.hash
return self._last_root.hash
class HgArchiveLoaderFromDisk(HgLoaderFromDisk):
......
......@@ -4,6 +4,8 @@
# See top-level LICENSE file for more information
import os
from datetime import datetime
from hashlib import sha1
from swh.loader.tests import (
assert_last_visit_matches,
......@@ -11,7 +13,7 @@ from swh.loader.tests import (
get_stats,
prepare_repository_from_archive,
)
from swh.model.from_disk import Content
from swh.model.from_disk import Content, DentryPerms
from swh.model.hashutil import hash_to_bytes
from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType
from swh.storage.algos.snapshot import snapshot_get_latest
......@@ -20,9 +22,45 @@ from ..from_disk import HgDirectory, HgLoaderFromDisk
from .loader_checker import ExpectedSwhids, LoaderChecker
def random_content() -> Content:
    """Create a minimal content object with a random identifier.

    The ``sha1_git`` is computed over freshly drawn random bytes rather than
    over the current timestamp, so two calls made within the same clock tick
    do not end up with the same identifier.
    """
    data = os.urandom(16)
    return Content({"sha1_git": sha1(data).digest(), "perms": DentryPerms.content})
def test_hg_directory_creates_missing_directories():
directory = HgDirectory()
directory[b"path/to/some/content"] = Content()
directory[b"path/to/some/content"] = random_content()
def test_hg_directory_get():
    """``get`` returns the default for a missing path and the entry once stored."""
    sample = random_content()
    tree = HgDirectory()
    target = b"path/to/content"

    # Missing path: None unless the caller supplies a default.
    assert tree.get(target) is None
    assert tree.get(target, sample) == sample

    # Once stored, the entry itself is returned.
    tree[target] = sample
    assert tree.get(target) == sample
def test_hg_directory_deletes_empty_directories():
    """Deleting the only entry of a subtree removes the emptied parents."""
    tree = HgDirectory()
    kept = random_content()
    tree[b"path/to/content"] = kept
    tree[b"path/to/some/deep/content"] = random_content()

    del tree[b"path/to/some/deep/content"]

    # Both directories that only led to the deleted entry are gone ...
    for pruned in (b"path/to/some/deep", b"path/to/some"):
        assert tree.get(pruned) is None
    # ... while the sibling entry is still reachable.
    assert tree.get(b"path/to/content") == kept
def test_hg_directory_when_directory_replaces_file():
    """Storing an entry below an existing file turns that file into a directory."""
    directory = HgDirectory()
    nested = random_content()

    directory[b"path/to/some"] = random_content()
    # "path/to/some" is a file at this point; storing below it must not raise.
    directory[b"path/to/some/content"] = nested

    # Check the resulting tree instead of only relying on "no exception".
    assert isinstance(directory.get(b"path/to/some"), HgDirectory)
    assert directory.get(b"path/to/some/content") == nested
# Those tests assert expectations on repository loading
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment