Skip to content
Snippets Groups Projects
Commit 97b0d9f8 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

Improve git hash update behavior

Decrease the number of paths to compute to 1 common ancestor (if any):
- Scan only that directory and rehash with new results (data changed)
- Update the resulting objects with those new hashes.
- Update from that directory to the rootdir the existing hashes computation
parent cd881639
No related branches found
No related tags found
No related merge requests found
......@@ -339,56 +339,95 @@ def recompute_sha1_in_memory(root, deeper_rootdir, objects):
return objects
def commonpath(paths):
"""Given a sequence of path names, returns the longest common sub-path.
Copied from Python3.5
"""
if not paths:
raise ValueError('commonpath() arg is an empty sequence')
if isinstance(paths[0], bytes):
sep = b'/'
curdir = b'.'
else:
sep = '/'
curdir = '.'
try:
split_paths = [path.split(sep) for path in paths]
try:
isabs, = set(p[:1] == sep for p in paths)
except ValueError:
raise ValueError("Can't mix absolute and relative paths")
split_paths = [
[c for c in s if c and c != curdir] for s in split_paths]
s1 = min(split_paths)
s2 = max(split_paths)
common = s1
for i, c in enumerate(s1):
if c != s2[i]:
common = s1[:i]
break
prefix = sep if isabs else sep[:0]
return prefix + sep.join(common)
except (TypeError, AttributeError):
raise
def update_checksums_from(changed_paths, objects,
dir_ok_fn=lambda dirpath: True):
"""Given a list of changed paths, recompute the checksums only where needed.
"""Given a list of changed paths, recompute the checksums only where
needed.
Args:
changed_paths: List of dictionary representing path changes.
The dictionary has the form:
changed_paths: Dictionary list representing path changes.
A dictionary has the form:
- path: the full path to the file Added, Modified or Deleted
- action: A, M or D
objects: dictionary returned by `walk_and_compute_sha1_from_directory`.
Returns:
Dictionary returned by `walk_and_compute_sha1_from_directory`
updated (mutated) according to necessary modifications.
updated (mutated) according to latest filesystem modifications.
"""
root = objects[ROOT_TREE_KEY][0]['path']
# FIXME: Compute the lowest common ancestor to reduce the computations
# a first round-trip to ensure we don't need to recompute everything anyway
for parent in (os.path.dirname(p['path']) for p in changed_paths):
if parent == root:
return walk_and_compute_sha1_from_directory(root, dir_ok_fn)
# Recompute only what's needed
paths = []
# a first round-trip to ensure we don't need to...
for changed_path in changed_paths:
path = changed_path['path']
if changed_path['action'] == 'D':
new_objects = {k: objects[k] for k in objects.keys()
if not k.startswith(path)}
objects = new_objects
rootdir = os.path.dirname(path)
if not os.path.exists(rootdir):
objects.pop(rootdir, None)
continue
# recompute from disk the checksums from impacted rootdir changes
# and update the original objects with new checksums for the
# arborescence tree below rootdir
hashes = walk_and_compute_sha1_from_directory(rootdir, dir_ok_fn,
with_root_tree=False)
objects.update(hashes)
# FIXME: In some cases, there will be too much computations
# here. We cannot postpone the computations though since the
# keys in the dict are not sorted.
# Recompute the hashes in memory from rootdir to root
objects = recompute_sha1_in_memory(root, rootdir, objects)
parent = os.path.dirname(path)
if parent == root: # ... recompute everything anyway
return walk_and_compute_sha1_from_directory(root, dir_ok_fn)
return objects
if changed_path['action'] == 'D': # (D)elete
k = objects.pop(path, None)
if k: # it's a dir, we need to remove the descendant paths
prefix_path = path + b'/'
new_objects = {k: objects[k] for k in objects.keys()
if not k.startswith(prefix_path)}
objects = new_objects
paths.append(parent)
if not paths: # no modification on paths
return objects
rootdir = commonpath(paths)
# Recompute from disk the checksums from impacted common ancestor
# rootdir changes. Then update the original objects with new
# checksums for the arborescence tree below rootdir
hashes = walk_and_compute_sha1_from_directory(rootdir, dir_ok_fn,
with_root_tree=False)
objects.update(hashes)
# Recompute the hashes in memory from rootdir to root
return recompute_sha1_in_memory(root, rootdir, objects)
......@@ -137,7 +137,7 @@ blah
self.assertEqual(checksum, self.checksums['tag_sha1_git'])
class GitHashArborescenceTree(unittest.TestCase):
class GitHashWalkArborescenceTree(unittest.TestCase):
"""Root class to ease walk and git hash testing without side-effecty problems.
"""
......@@ -161,6 +161,12 @@ class GitHashArborescenceTree(unittest.TestCase):
if os.path.exists(self.tmp_root_path):
shutil.rmtree(self.tmp_root_path)
class GitHashFromScratch(GitHashWalkArborescenceTree):
"""Test the main `walk_and_compute_sha1_from_directory` algorithm that
scans and compute the disk for checksums.
"""
@istest
def walk_and_compute_sha1_from_directory(self):
# make a temporary arborescence tree to hash without ignoring anything
......@@ -234,7 +240,10 @@ class GitHashArborescenceTree(unittest.TestCase):
self.assertEquals(actual_hashes, expected_hashes)
class GitHashUpdate(GitHashArborescenceTree):
class GitHashUpdate(GitHashWalkArborescenceTree):
"""Test `walk and git hash only on modified fs` functions.
"""
@istest
def update_checksums_from_add_new_file(self):
# make a temporary arborescence tree to hash without ignoring anything
......@@ -290,6 +299,19 @@ class GitHashUpdate(GitHashArborescenceTree):
self.assertEquals(expected_dict, actual_dict)
@istest
def update_checksums_no_change(self):
# when
expected_dict = git.walk_and_compute_sha1_from_directory(
self.tmp_root_path)
# nothing changes on disk
# then
actual_dict = git.update_checksums_from([], expected_dict)
self.assertEquals(actual_dict, expected_dict)
@istest
def update_checksums_delete_existing_file(self):
# make a temporary arborescence tree to hash without ignoring anything
......@@ -314,10 +336,10 @@ class GitHashUpdate(GitHashArborescenceTree):
[{'path': changed_path, 'action': 'D'}],
objects)
self.assertEquals(expected_dict, actual_dict)
self.assertEquals(actual_dict, expected_dict)
@istest
def update_checksums_from_multiple_fs_modification(self):
def update_checksums_from_multiple_fs_modifications(self):
# make a temporary arborescence tree to hash without ignoring anything
# update the disk in some way (modify a file, add a new, delete one)
# update the actual git checksums from the deeper tree modified
......@@ -359,19 +381,27 @@ class GitHashUpdate(GitHashArborescenceTree):
self.assertEquals(expected_dict, actual_dict)
class GitHashUpdateWithCommonAncestor(GitHashArborescenceTree):
@istest
def update_checksums_from_common_ancestor(self):
# make a temporary arborescence tree to hash without ignoring anything
# update the disk in some way
# update the actual git checksums where only the modification is needed
# when
# Add some new arborescence below a folder destined to be removed
# want to check that old keys does not remain
future_folder_to_remove = os.path.join(self.tmp_root_path,
b'sample-folder/bar/barfoo')
# add .../barfoo/hello/world under (.../barfoo which will be destroyed)
new_folder = os.path.join(future_folder_to_remove, b'hello')
os.makedirs(new_folder, exist_ok=True)
with open(os.path.join(future_folder_to_remove, b'world'), 'wb') as f:
f.write(b"i'm sad 'cause i'm destined to be removed...")
# now we scan the disk
objects = git.walk_and_compute_sha1_from_directory(
self.tmp_root_path)
# Actions on disk (imagine a checkout of some form)
assert objects[future_folder_to_remove]
# Actions on disk (to simulate a checkout of some sort)
# 1. Create a new file
changed_path = os.path.join(self.tmp_root_path,
......@@ -386,14 +416,8 @@ class GitHashUpdateWithCommonAncestor(GitHashArborescenceTree):
with open(changed_path1, 'wb') as f:
f.write(b'new line')
# 3. Remove some folder
changed_path2 = os.path.join(self.tmp_root_path,
b'sample-folder/foo')
# 3. Remove some folder
changed_path2 = os.path.join(self.tmp_root_path,
b'sample-folder/bar/barfoo')
shutil.rmtree(changed_path2)
# 3. Remove folder
shutil.rmtree(future_folder_to_remove)
# Actually walking the fs will be the resulting expectation
expected_dict = git.walk_and_compute_sha1_from_directory(
......@@ -403,7 +427,7 @@ class GitHashUpdateWithCommonAncestor(GitHashArborescenceTree):
actual_dict = git.update_checksums_from(
[{'path': changed_path, 'action': 'A'},
{'path': changed_path1, 'action': 'M'},
{'path': changed_path2, 'action': 'D'}],
{'path': future_folder_to_remove, 'action': 'D'}],
objects)
self.assertEquals(expected_dict, actual_dict)
......@@ -450,3 +474,25 @@ class GitHashUpdateWithCommonAncestor(GitHashArborescenceTree):
objects)
self.assertEquals(expected_dict, actual_dict)
@istest
def commonpath(self):
paths = ['r/0/h',
'r/1/d', 'r/1/i/a', 'r/1/i/b', 'r/1/i/c',
'r/2/e', 'r/2/f', 'r/2/g']
self.assertEquals(git.commonpath(paths), 'r')
paths = ['r/1/d', 'r/1/i/a', 'r/1/i/b', 'r/1/i/c']
self.assertEquals(git.commonpath(paths), 'r/1')
paths = ['/a/r/2/g', '/a/r/1/i/c', '/a/r/0/h']
self.assertEquals(git.commonpath(paths), '/a/r')
paths = [b'/a/r/2/g', b'/b/r/1/i/c', b'/c/r/0/h']
self.assertEquals(git.commonpath(paths), b'/')
paths = ['a/z', 'a/z', 'a/z']
self.assertEquals(git.commonpath(paths), 'a/z')
paths = ['0']
self.assertEquals(git.commonpath(paths), '0')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment