From ece18350b6ac4eecaad18a7d4ca65d0d67b6ab83 Mon Sep 17 00:00:00 2001
From: Pierre-Yves David <pierre-yves.david@octobus.net>
Date: Mon, 9 Sep 2024 11:21:36 +0200
Subject: [PATCH] from-disk-exclude: handle top_path with a final slash

Before this patch giving `from_disk` and directory path with final '/'s
would confuses the exclusion code as it would assume it need to exclude
one extra character to get the basename. This would swallow the first
char of the directory to exclude, leading to a righful crash.

This would affect directory excluded in the first directory only as sub
directory would be called without a '/'.

We fix the path at the top of the function as trailing '/'s could
confuse other code too in the future.
---
 swh/model/from_disk.py            | 10 +++++++++-
 swh/model/tests/test_from_disk.py | 17 +++++++++++++----
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
index b24c4351..51dec7b7 100644
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -467,7 +467,15 @@ class Directory(MerkleNode):
           progress_callback (Optional function): if given, returns for each
             non empty directories traversed the number of computed entries.
         """
+        # the top might have been specified with a final slash. This will
+        # confuse various code.
+        #
+        # We should prevent '/' as is however
+        if 1 < len(path) and path[-1:] == b"/":
+            path = path[0:1] + path[1:].rstrip(b"/")
+        assert len(path) <= 1 or path[-1:] != b"/"
         top_path = path
+        top_path_prefix_size = len(top_path) + 1
 
         dirs: Dict[bytes, Directory] = {}
         dirs[top_path] = cls({"name": os.path.basename(top_path), "path": top_path})
@@ -507,7 +515,7 @@ class Directory(MerkleNode):
         top_dir = dirs[top_path]
 
         for path in reversed(filtered):
-            path = path[len(top_path) + 1 :]
+            path = path[top_path_prefix_size:]
             del top_dir[path]
         # a bit sad but now we have to traverse the gathered tree structure to
         # filter it again (e.g. for the ignore_empty_directory filter to work
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
index 71a8ed49..c51f0468 100644
--- a/swh/model/tests/test_from_disk.py
+++ b/swh/model/tests/test_from_disk.py
@@ -1,4 +1,3 @@
-# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -950,6 +949,10 @@ class DirectoryToObjects(DataMixin, unittest.TestCase):
             expected_content_count=len(self.contents) + 1,
         )
 
+    def test_directory_to_objects_ignore_name_with_slash(self):
+        self.tmpdir_name = self.tmpdir_name + b"/"
+        self.test_directory_to_objects_ignore_name()
+
     def test_directory_to_objects_ignore_name_case(self):
         directory = Directory.from_disk(
             path=self.tmpdir_name,
@@ -1042,7 +1045,10 @@ class DirectoryToObjects(DataMixin, unittest.TestCase):
         # Corresponds to the deeper files and directories plus the four top level ones
         assert total == [4, 1, 1, 1, 1]
 
-    def test_exclude(self):
+    def test_exclude_trailing(self):
+        self.test_exclude(trailing_slash=True)
+
+    def test_exclude(self, trailing_slash=False):
         """exclude patterns"""
         with tempfile.TemporaryDirectory() as dirname:
             dirname = os.fsencode(dirname)
@@ -1063,7 +1069,10 @@ class DirectoryToObjects(DataMixin, unittest.TestCase):
             )
 
             # no filter
-            directory = Directory.from_disk(path=dirname)
+            dir_path = dirname
+            if trailing_slash:
+                dir_path += b"/"
+            directory = Directory.from_disk(path=dir_path)
             assert set(directory.keys()) == {
                 b"baz",
                 b"foo",
@@ -1088,7 +1097,7 @@ class DirectoryToObjects(DataMixin, unittest.TestCase):
 
             exclude_patterns = [b"excluded_*"]
             path_filter = ignore_directories_patterns(dirname, exclude_patterns)
-            directory_f = Directory.from_disk(path=dirname, path_filter=path_filter)
+            directory_f = Directory.from_disk(path=dir_path, path_filter=path_filter)
             assert set(directory_f.keys()) == {b"baz", b"foo", b"foofile", b"file"}
             # XXX should foo/excluded_dir and foo/excluded_dir2 be excluded as
             # well? Currently they are not
-- 
GitLab