From d124e6e9b93d9903f0884478ebbc60f23e533bd9 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <antoine.romain.dumont@gmail.com>
Date: Thu, 26 May 2016 12:33:12 +0200
Subject: [PATCH] Optimize walk for edge cases

---
 swh/model/git.py | 154 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 118 insertions(+), 36 deletions(-)

diff --git a/swh/model/git.py b/swh/model/git.py
index f2c12c7a..3918f96e 100644
--- a/swh/model/git.py
+++ b/swh/model/git.py
@@ -186,8 +186,116 @@ def compute_tree_metadata(dirname, ls_hashes):
     }
 
 
+def default_validation_dir(dirpath):
+    """Default validation function.
+       It accepts every path: a predicate that always returns True.
+
+    Args:
+        dirpath: Path to validate
+
+    Returns: True
+
+    """
+    return True
+
+
+def __walk(rootdir,
+           dir_ok_fn=default_validation_dir,
+           remove_empty_folder=False):
+    """Walk the filesystem and yield 3-tuples (dirpath, dirnames as set
+    of absolute paths, filenames as set of absolute paths).
+
+       Ignore directories which do not pass the dir_ok_fn validation.
+
+       If remove_empty_folder is True, remove and ignore any
+       encountered empty folder.
+
+    Args:
+        - rootdir: starting walk root directory path
+        - dir_ok_fn: validation function. Folders for which it
+          returns a falsy value are ignored. Defaults to
+          default_validation_dir, which accepts every folder.
+        - remove_empty_folder: Flag to remove and ignore any
+          encountered empty folders.
+
+    Yields:
+        3-tuples (dirpath, set of absolute child directory paths, set
+        of absolute file paths).
+
+    """
+    def basic_gen_dir(rootdir):
+        for dp, dns, fns in os.walk(rootdir, topdown=False):
+            yield (dp,
+                   set((os.path.join(dp, dn) for dn in dns)),
+                   set((os.path.join(dp, fn) for fn in fns)))
+
+    if dir_ok_fn == default_validation_dir:
+        if not remove_empty_folder:  # os.walk
+            yield from basic_gen_dir(rootdir)
+        else:                        # os.walk + empty dir cleanup
+            empty_folders = set()
+            for dp, dns, fns in basic_gen_dir(rootdir):
+                if not dns and not fns:
+                    empty_folders.add(dp)
+                    # remove it right away: a folder that only holds empty
+                    # folders is itself empty
+                    if os.path.islink(dp):
+                        os.remove(dp)
+                    else:
+                        os.rmdir(dp)
+                    parent = os.path.dirname(dp)
+                    # edge case: a parent whose only entry was this empty
+                    # folder has now become empty as well
+                    while not os.listdir(parent):
+                        empty_folders.add(parent)
+                        if os.path.islink(parent):
+                            os.remove(parent)
+                        else:
+                            os.rmdir(parent)
+                        parent = os.path.dirname(parent)
+                    continue
+                yield (dp, dns - empty_folders, fns)
+    else:
+        def filtfn(dirnames):
+            return set(filter(dir_ok_fn, dirnames))
+
+        gen_dir = ((dp, dns, fns) for dp, dns, fns
+                   in basic_gen_dir(rootdir) if dir_ok_fn(dp))
+
+        if not remove_empty_folder:  # os.walk + filtering
+            for dp, dns, fns in gen_dir:
+                yield (dp, filtfn(dns), fns)
+        else:                        # os.walk + filtering + empty dir cleanup
+            empty_folders = set()
+            for dp, dns, fns in gen_dir:
+                dps = filtfn(dns)
+
+                if not dps and not fns:
+                    empty_folders.add(dp)
+                    # remove it right away: a folder that only holds empty
+                    # folders is itself empty
+                    if os.path.islink(dp):
+                        print('remove link to empty folder')
+                        os.remove(dp)
+                    else:
+                        print('remove empty folder')
+                        os.rmdir(dp)
+                    parent = os.path.dirname(dp)
+                    # edge case: a parent whose only entry was this empty
+                    # folder has now become empty as well
+                    while not os.listdir(parent):
+                        empty_folders.add(parent)
+                        if os.path.islink(parent):
+                            os.remove(parent)
+                        else:
+                            os.rmdir(parent)
+                        parent = os.path.dirname(parent)
+                    continue
+                yield dp, dps - empty_folders, fns
+
+
 def walk_and_compute_sha1_from_directory(rootdir,
-                                         dir_ok_fn=lambda dirpath: True,
+                                         dir_ok_fn=default_validation_dir,
                                          with_root_tree=True,
                                          remove_empty_folder=False):
     """Compute git sha1 from directory rootdir.
@@ -228,53 +336,27 @@ def walk_and_compute_sha1_from_directory(rootdir,
     if rootdir.endswith(b'/'):
         rootdir = rootdir.rstrip(b'/')
 
-    def filtfn(dirpath, dirnames):
-        return list(filter(lambda dirname: dir_ok_fn(os.path.join(dirpath,
-                                                                  dirname)),
-                           dirnames))
-
-    if remove_empty_folder:  # round-trip to remove empty folders
-        gen_dir = ((dp, filtfn(dp, dns), fns) for (dp, dns, fns)
-                   in os.walk(rootdir, topdown=False)
-                   if dir_ok_fn(dp))
-        for dirpath, dirnames, filenames in gen_dir:
-            if dirnames == [] and filenames == []:
-                if os.path.islink(dirpath):
-                    os.remove(dirpath)
-                else:
-                    os.removedirs(dirpath)
-
-    gen_dir = ((dp, filtfn(dp, dns), fns) for (dp, dns, fns)
-               in os.walk(rootdir, topdown=False)
-               if dir_ok_fn(dp))
-
-    for dirpath, dirnames, filenames in gen_dir:
+    for dirpath, dirnames, filenames in __walk(
+            rootdir, dir_ok_fn, remove_empty_folder):
         hashes = []
 
-        links = (os.path.join(dirpath, file)
-                 for file in (filenames+dirnames)
-                 if os.path.islink(os.path.join(dirpath, file)))
+        links = (file
+                 for file in filenames.union(dirnames)
+                 if os.path.islink(file))
 
         for linkpath in links:
             all_links.add(linkpath)
             m_hashes = compute_link_metadata(linkpath)
             hashes.append(m_hashes)
 
-        only_files = (os.path.join(dirpath, file)
-                      for file in filenames
-                      if os.path.join(dirpath, file) not in all_links)
-        for filepath in only_files:
+        for filepath in (file for file in filenames if file not in all_links):
             m_hashes = compute_blob_metadata(filepath)
             hashes.append(m_hashes)
 
         ls_hashes[dirpath] = hashes
 
         dir_hashes = []
-        subdirs = (os.path.join(dirpath, dir)
-                   for dir in dirnames
-                   if os.path.join(dirpath, dir)
-                   not in all_links)
-        for fulldirname in subdirs:
+        for fulldirname in (dir for dir in dirnames if dir not in all_links):
             tree_hash = compute_tree_metadata(fulldirname, ls_hashes)
             dir_hashes.append(tree_hash)
 
@@ -398,7 +480,7 @@ def commonpath(paths):
 
 
 def __remove_paths_from_objects(objects, rootpaths,
-                                dir_ok_fn=lambda dirpath: True):
+                                dir_ok_fn=default_validation_dir):
     """Given top paths to remove, remove all paths and descendants from
     objects.
 
@@ -441,7 +523,7 @@ def __remove_paths_from_objects(objects, rootpaths,
 
 
 def update_checksums_from(changed_paths, objects,
-                          dir_ok_fn=lambda dirpath: True,
+                          dir_ok_fn=default_validation_dir,
                           remove_empty_folder=False):
     """Given a list of changed paths, recompute the checksums only where
     needed.
-- 
GitLab