From cebe917a85d427355e837a69e3c28248a496d277 Mon Sep 17 00:00:00 2001
From: Franck Bret <franck.bret@octobus.net>
Date: Tue, 13 Feb 2024 10:33:24 +0100
Subject: [PATCH] from_disk: Add optional progress callback

Add an optional progress callback to the `Directory.from_disk` method.
When given, it is called once for each non-empty directory traversed,
with the number of entries computed for that directory. This is useful
for CLIs, in particular to display progress information in SWH Scanner.
---
 swh/model/cli.py                  |  7 +++++--
 swh/model/from_disk.py            |  7 +++++++
 swh/model/tests/test_from_disk.py | 11 +++++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/swh/model/cli.py b/swh/model/cli.py
index 7fd99ed0..9fd4c50f 100644
--- a/swh/model/cli.py
+++ b/swh/model/cli.py
@@ -5,7 +5,7 @@
 
 import os
 import sys
-from typing import Dict, Iterable, Optional
+from typing import Callable, Dict, Iterable, Optional
 
 # WARNING: do not import unnecessary things here to keep cli startup time under
 # control
@@ -74,6 +74,7 @@ def swhid_of_file_content(data) -> CoreSWHID:
 def model_of_dir(
     path: bytes,
     exclude_patterns: Optional[Iterable[bytes]] = None,
+    update_info: Optional[Callable[[int], None]] = None,
 ) -> Directory:
     from swh.model.from_disk import accept_all_paths, ignore_directories_patterns
 
@@ -83,7 +84,9 @@ def model_of_dir(
         else accept_all_paths
     )
 
-    return Directory.from_disk(path=path, path_filter=path_filter)
+    return Directory.from_disk(
+        path=path, path_filter=path_filter, progress_callback=update_info
+    )
 
 
 def swhid_of_dir(
diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
index 48d70161..26bb442b 100644
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -453,6 +453,7 @@ class Directory(MerkleNode):
             Callable[[bytes, bytes, Optional[List[bytes]]], bool]
         ] = None,
         max_content_length: Optional[int] = None,
+        progress_callback: Optional[Callable[[int], None]] = None,
     ) -> "Directory":
         """Compute the Software Heritage objects for a given directory tree
 
@@ -472,6 +473,8 @@ class Directory(MerkleNode):
             directory should be ignored.
           max_content_length (Optional[int]): if given, all contents larger
             than this will be skipped.
+          progress_callback (Optional function): if given, called for each
+            non-empty directory traversed with its number of computed entries.
         """
         top_path = path
         dirs: Dict[bytes, Directory] = {}
@@ -504,6 +507,10 @@ class Directory(MerkleNode):
             dirs[root] = cls({"name": os.path.basename(root), "path": root})
             dirs[root].update(entries)
 
+            if progress_callback is not None:
+                if len(entries) > 0:
+                    progress_callback(len(entries))
+
         return dirs[top_path]
 
     def __init__(self, data=None):
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
index 946fd533..e1b455ac 100644
--- a/swh/model/tests/test_from_disk.py
+++ b/swh/model/tests/test_from_disk.py
@@ -914,6 +914,17 @@ class DirectoryToObjects(DataMixin, unittest.TestCase):
                     b"foofile",
                 ]
 
+    def test_directory_progress_callback(self):
+        total = []
+
+        def update_info(arg):
+            assert type(arg) is int
+            total.append(arg)
+
+        Directory.from_disk(path=self.tmpdir_name, progress_callback=update_info)
+        # Corresponds to the deeper files and directories plus the four top level ones
+        assert total == [1, 1, 1, 1, 4]
+
 
 @pytest.mark.fs
 class TarballTest(DataMixin, unittest.TestCase):
-- 
GitLab