From 34f61010062469418db8d115c2d993bde1377cb2 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Tue, 11 Jun 2024 12:16:51 +0200
Subject: [PATCH] from_disk: Do not recurse in ignored directories

Using os.walk() does not make much sense when we want to control what
directories to recurse into.

Additionally, this uses os.scandir directly, which allows us to directly
sort symlinks and files apart from directories (while os.walk groups
symlinks with directories) without two extra system calls.
---
 swh/model/from_disk.py | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
index 13e4c113..33352c96 100644
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -472,29 +472,33 @@ class Directory(MerkleNode):
         dirs: Dict[bytes, Directory] = {}
         dirs[top_path] = cls({"name": os.path.basename(top_path), "path": top_path})
         filtered = []
-        for root, dentries, fentries in os.walk(top_path):
+        to_visit = [path]
+        while to_visit:
+            root = to_visit.pop()
             path, name = os.path.split(root)
-            if root != top_path and not path_filter(path, name, dentries + fentries):
-                # we should not traverse the current directory, so stop right now
-                dentries[:] = []
+            with os.scandir(root) as it:
+                entries_list = list(it)
+            if root != top_path and not path_filter(
+                path, name, [entry.path for entry in entries_list]
+            ):
+                # we should not traverse the current directory, so stop right now,
                 # but also mark it as removed (for later cleanup)
                 filtered.append(root)
                 continue
+
             entries = {}
-            # Join fentries and dentries in the same processing, as symbolic
-            # links to directories appear in dentries...
-            for name in dentries + fentries:
-                path = os.path.join(root, name)
-                if not os.path.isdir(path) or os.path.islink(path):
-                    if not path_filter(root, name, None):
+            for entry in entries_list:
+                if not entry.is_dir(follow_symlinks=False):
+                    if not path_filter(root, entry.name, None):
                         continue
                     content = Content.from_file(
-                        path=path, max_content_length=max_content_length
+                        path=entry.path, max_content_length=max_content_length
                     )
-                    entries[name] = content
+                    entries[entry.name] = content
                 else:
-                    entries[name] = cls({"name": os.path.basename(path), "path": path})
-                    dirs[path] = entries[name]
+                    entries[entry.name] = cls({"name": entry.name, "path": entry.path})
+                    dirs[entry.path] = entries[entry.name]
+                    to_visit.append(entry.path)
 
             dirs[root].update(entries)
             if progress_callback is not None:
-- 
GitLab