diff --git a/swh/model/git.py b/swh/model/git.py index 1f5807423b66e703c5a6df69c70d5b5c077ea861..15da81e37a7864ac47d568da2e46186a1185012d 100644 --- a/swh/model/git.py +++ b/swh/model/git.py @@ -339,56 +339,95 @@ def recompute_sha1_in_memory(root, deeper_rootdir, objects): return objects +def commonpath(paths): + """Given a sequence of path names, returns the longest common sub-path. + + Copied from Python3.5 + + """ + + if not paths: + raise ValueError('commonpath() arg is an empty sequence') + + if isinstance(paths[0], bytes): + sep = b'/' + curdir = b'.' + else: + sep = '/' + curdir = '.' + + try: + split_paths = [path.split(sep) for path in paths] + + try: + isabs, = set(p[:1] == sep for p in paths) + except ValueError: + raise ValueError("Can't mix absolute and relative paths") + + split_paths = [ + [c for c in s if c and c != curdir] for s in split_paths] + s1 = min(split_paths) + s2 = max(split_paths) + common = s1 + for i, c in enumerate(s1): + if c != s2[i]: + common = s1[:i] + break + + prefix = sep if isabs else sep[:0] + return prefix + sep.join(common) + except (TypeError, AttributeError): + raise + + def update_checksums_from(changed_paths, objects, dir_ok_fn=lambda dirpath: True): - """Given a list of changed paths, recompute the checksums only where needed. + """Given a list of changed paths, recompute the checksums only where + needed. Args: - changed_paths: List of dictionary representing path changes. - The dictionary has the form: + changed_paths: Dictionary list representing path changes. + A dictionary has the form: - path: the full path to the file Added, Modified or Deleted - action: A, M or D objects: dictionary returned by `walk_and_compute_sha1_from_directory`. Returns: Dictionary returned by `walk_and_compute_sha1_from_directory` - updated (mutated) according to necessary modifications. + updated (mutated) according to latest filesystem modifications. """ root = objects[ROOT_TREE_KEY][0]['path'] - # FIXME: Compute the lowest common ancestor to reduce the computations - - # a first round-trip to ensure we don't need to recompute everything anyway - for parent in (os.path.dirname(p['path']) for p in changed_paths): - if parent == root: - return walk_and_compute_sha1_from_directory(root, dir_ok_fn) - - # Recompute only what's needed + paths = [] + # a first round-trip to ensure we don't need to... for changed_path in changed_paths: path = changed_path['path'] - if changed_path['action'] == 'D': - new_objects = {k: objects[k] for k in objects.keys() - if not k.startswith(path)} - objects = new_objects - - rootdir = os.path.dirname(path) - if not os.path.exists(rootdir): - objects.pop(rootdir, None) - continue - - # recompute from disk the checksums from impacted rootdir changes - # and update the original objects with new checksums for the - # arborescence tree below rootdir - hashes = walk_and_compute_sha1_from_directory(rootdir, dir_ok_fn, - with_root_tree=False) - objects.update(hashes) - - # FIXME: In some cases, there will be too much computations - # here. We cannot postpone the computations though since the - # keys in the dict are not sorted. - - # Recompute the hashes in memory from rootdir to root - objects = recompute_sha1_in_memory(root, rootdir, objects) + parent = os.path.dirname(path) + if parent == root: # ... recompute everything anyway + return walk_and_compute_sha1_from_directory(root, dir_ok_fn) - return objects + if changed_path['action'] == 'D': # (D)elete + k = objects.pop(path, None) + if k: # it's a dir, we need to remove the descendant paths + prefix_path = path + b'/' + new_objects = {k: objects[k] for k in objects.keys() + if not k.startswith(prefix_path)} + objects = new_objects + + paths.append(parent) + + if not paths: # no modification on paths + return objects + + rootdir = commonpath(paths) + + # Recompute from disk the checksums from impacted common ancestor + # rootdir changes. Then update the original objects with new + # checksums for the arborescence tree below rootdir + hashes = walk_and_compute_sha1_from_directory(rootdir, dir_ok_fn, + with_root_tree=False) + objects.update(hashes) + + # Recompute the hashes in memory from rootdir to root + return recompute_sha1_in_memory(root, rootdir, objects) diff --git a/swh/model/tests/test_git.py b/swh/model/tests/test_git.py index 7d1c093c67a2844a838e69accd51a083839d27ee..a630835eeb8371d7e85457c7b382024edbe31fbd 100644 --- a/swh/model/tests/test_git.py +++ b/swh/model/tests/test_git.py @@ -137,7 +137,7 @@ blah self.assertEqual(checksum, self.checksums['tag_sha1_git']) -class GitHashArborescenceTree(unittest.TestCase): +class GitHashWalkArborescenceTree(unittest.TestCase): """Root class to ease walk and git hash testing without side-effecty problems. """ @@ -161,6 +161,12 @@ class GitHashArborescenceTree(unittest.TestCase): if os.path.exists(self.tmp_root_path): shutil.rmtree(self.tmp_root_path) + +class GitHashFromScratch(GitHashWalkArborescenceTree): + """Test the main `walk_and_compute_sha1_from_directory` algorithm that + scans and compute the disk for checksums. + + """ @istest def walk_and_compute_sha1_from_directory(self): # make a temporary arborescence tree to hash without ignoring anything @@ -234,7 +240,10 @@ class GitHashArborescenceTree(unittest.TestCase): self.assertEquals(actual_hashes, expected_hashes) -class GitHashUpdate(GitHashArborescenceTree): +class GitHashUpdate(GitHashWalkArborescenceTree): + """Test `walk and git hash only on modified fs` functions. + + """ @istest def update_checksums_from_add_new_file(self): # make a temporary arborescence tree to hash without ignoring anything @@ -290,6 +299,19 @@ class GitHashUpdate(GitHashArborescenceTree): self.assertEquals(expected_dict, actual_dict) + @istest + def update_checksums_no_change(self): + # when + expected_dict = git.walk_and_compute_sha1_from_directory( + self.tmp_root_path) + + # nothing changes on disk + + # then + actual_dict = git.update_checksums_from([], expected_dict) + + self.assertEquals(actual_dict, expected_dict) + @istest def update_checksums_delete_existing_file(self): # make a temporary arborescence tree to hash without ignoring anything @@ -314,10 +336,10 @@ class GitHashUpdate(GitHashArborescenceTree): [{'path': changed_path, 'action': 'D'}], objects) - self.assertEquals(expected_dict, actual_dict) + self.assertEquals(actual_dict, expected_dict) @istest - def update_checksums_from_multiple_fs_modification(self): + def update_checksums_from_multiple_fs_modifications(self): # make a temporary arborescence tree to hash without ignoring anything # update the disk in some way (modify a file, add a new, delete one) # update the actual git checksums from the deeper tree modified @@ -359,19 +381,27 @@ class GitHashUpdate(GitHashArborescenceTree): self.assertEquals(expected_dict, actual_dict) - -class GitHashUpdateWithCommonAncestor(GitHashArborescenceTree): @istest def update_checksums_from_common_ancestor(self): - # make a temporary arborescence tree to hash without ignoring anything - # update the disk in some way - # update the actual git checksums where only the modification is needed - # when + # Add some new arborescence below a folder destined to be removed + # want to check that old keys does not remain + future_folder_to_remove = os.path.join(self.tmp_root_path, + b'sample-folder/bar/barfoo') + + # add .../barfoo/hello/world under (.../barfoo which will be destroyed) + new_folder = os.path.join(future_folder_to_remove, b'hello') + os.makedirs(new_folder, exist_ok=True) + with open(os.path.join(future_folder_to_remove, b'world'), 'wb') as f: + f.write(b"i'm sad 'cause i'm destined to be removed...") + + # now we scan the disk objects = git.walk_and_compute_sha1_from_directory( self.tmp_root_path) - # Actions on disk (imagine a checkout of some form) + assert objects[future_folder_to_remove] + + # Actions on disk (to simulate a checkout of some sort) # 1. Create a new file changed_path = os.path.join(self.tmp_root_path, @@ -386,14 +416,8 @@ class GitHashUpdateWithCommonAncestor(GitHashArborescenceTree): with open(changed_path1, 'wb') as f: f.write(b'new line') - # 3. Remove some folder - changed_path2 = os.path.join(self.tmp_root_path, - b'sample-folder/foo') - - # 3. Remove some folder - changed_path2 = os.path.join(self.tmp_root_path, - b'sample-folder/bar/barfoo') - shutil.rmtree(changed_path2) + # 3. Remove folder + shutil.rmtree(future_folder_to_remove) # Actually walking the fs will be the resulting expectation expected_dict = git.walk_and_compute_sha1_from_directory( @@ -403,7 +427,7 @@ class GitHashUpdateWithCommonAncestor(GitHashArborescenceTree): actual_dict = git.update_checksums_from( [{'path': changed_path, 'action': 'A'}, {'path': changed_path1, 'action': 'M'}, - {'path': changed_path2, 'action': 'D'}], + {'path': future_folder_to_remove, 'action': 'D'}], objects) self.assertEquals(expected_dict, actual_dict) @@ -450,3 +474,25 @@ class GitHashUpdateWithCommonAncestor(GitHashArborescenceTree): objects) self.assertEquals(expected_dict, actual_dict) + + @istest + def commonpath(self): + paths = ['r/0/h', + 'r/1/d', 'r/1/i/a', 'r/1/i/b', 'r/1/i/c', + 'r/2/e', 'r/2/f', 'r/2/g'] + self.assertEquals(git.commonpath(paths), 'r') + + paths = ['r/1/d', 'r/1/i/a', 'r/1/i/b', 'r/1/i/c'] + self.assertEquals(git.commonpath(paths), 'r/1') + + paths = ['/a/r/2/g', '/a/r/1/i/c', '/a/r/0/h'] + self.assertEquals(git.commonpath(paths), '/a/r') + + paths = [b'/a/r/2/g', b'/b/r/1/i/c', b'/c/r/0/h'] + self.assertEquals(git.commonpath(paths), b'/') + + paths = ['a/z', 'a/z', 'a/z'] + self.assertEquals(git.commonpath(paths), 'a/z') + + paths = ['0'] + self.assertEquals(git.commonpath(paths), '0')