From 2eced78da85d9d10d9efe5ac514ff3b369cee0e1 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <antoine.lambert@inria.fr>
Date: Wed, 26 Sep 2018 14:14:36 +0200
Subject: [PATCH] algos.dir_iterators: Implement iterator protocol in
 DirectoryIterator

  - add Python iterator protocol support in the DirectoryIterator class in
    order to easily visit in a recursive way any directory stored in the
    archive.

  - add convenient function dir_iterator wrapping the instantiation of
    the DirectoryIterator class

  - add tests

Related T1177
---
 swh/storage/algos/dir_iterators.py           |  39 ++++-
 swh/storage/tests/algos/test_diff.py         |  76 +--------
 swh/storage/tests/algos/test_dir_iterator.py | 153 +++++++++++++++++++
 3 files changed, 190 insertions(+), 78 deletions(-)
 create mode 100644 swh/storage/tests/algos/test_dir_iterator.py

diff --git a/swh/storage/algos/dir_iterators.py b/swh/storage/algos/dir_iterators.py
index 897d0a82c..e96ee68cf 100644
--- a/swh/storage/algos/dir_iterators.py
+++ b/swh/storage/algos/dir_iterators.py
@@ -72,11 +72,10 @@ class DirectoryIterator(object):
         """
         # get directory entries
         dir_data = _get_dir(self.storage, dir_id)
-        # sort them in lexicographical order
-        dir_data = sorted(dir_data, key=lambda e: e['name'])
-        # reverse the ordering in order to unstack the "smallest"
-        # entry each time the iterator advances
-        dir_data.reverse()
+        # sort them in lexicographical order and reverse the ordering
+        # in order to unstack the "smallest" entry each time the
+        # iterator advances
+        dir_data = sorted(dir_data, key=lambda e: e['name'], reverse=True)
         # push the directory frame to the main stack
         self.frames.append(dir_data)
 
@@ -199,6 +198,36 @@ class DirectoryIterator(object):
             self.frames.pop()
             self.drop()
 
+    def __next__(self):
+        entry = self.step()
+        if not entry:
+            raise StopIteration
+        entry['path'] = self.current_path()
+        return entry
+
+    def __iter__(self):
+        return DirectoryIterator(self.storage, self.root_dir_id,
+                                 self.base_path)
+
+
+def dir_iterator(storage, dir_id):
+    """
+    Return an iterator for recursively visiting a directory and
+    its sub-directories. The associated paths are visited in
+    lexicographic depth-first search order.
+
+    Args:
+        storage (swh.storage.Storage): an instance of a swh storage
+        dir_id (bytes): a directory identifier
+
+    Returns:
+        swh.storage.algos.dir_iterators.DirectoryIterator: an iterator
+            returning a dict at each iteration step describing a directory
+            entry. A 'path' field is added in that dict to store the
+            absolute path of the entry.
+    """
+    return DirectoryIterator(storage, dir_id)
+
 
 class Remaining(Enum):
     """
diff --git a/swh/storage/tests/algos/test_diff.py b/swh/storage/tests/algos/test_diff.py
index 6551e9b39..c25e0f31b 100644
--- a/swh/storage/tests/algos/test_diff.py
+++ b/swh/storage/tests/algos/test_diff.py
@@ -7,85 +7,15 @@
 
 import unittest
 
-from nose.tools import istest, nottest
 from unittest.mock import patch
 
+from nose.tools import istest, nottest
+
 from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import directory_identifier
 from swh.storage.algos import diff
 
-
-class DirectoryModel(object):
-    """
-    Quick and dirty directory model to ease the writing
-    of revision trees differential tests.
-    """
-    def __init__(self, name=''):
-        self.data = {}
-        self.data['name'] = name
-        self.data['perms'] = 16384
-        self.data['type'] = 'dir'
-        self.data['entries'] = []
-        self.data['entry_idx'] = {}
-
-    def __getitem__(self, item):
-        if item == 'target':
-            return hash_to_bytes(directory_identifier(self))
-        else:
-            return self.data[item]
-
-    def add_file(self, path, sha1=None):
-        path_parts = path.split(b'/')
-        if len(path_parts) == 1:
-            self['entry_idx'][path] = len(self['entries'])
-            self['entries'].append({
-                'target': hash_to_bytes(sha1),
-                'name': path,
-                'perms': 33188,
-                'type': 'file'
-            })
-        else:
-            if not path_parts[0] in self['entry_idx']:
-                self['entry_idx'][path_parts[0]] = len(self['entries'])
-                self['entries'].append(DirectoryModel(path_parts[0]))
-            if path_parts[1]:
-                dir_idx = self['entry_idx'][path_parts[0]]
-                self['entries'][dir_idx].add_file(b'/'.join(path_parts[1:]), sha1)
-
-    def get_hash_data(self, entry_hash):
-        if self['target'] == entry_hash:
-            ret = []
-            for e in self['entries']:
-                ret.append({
-                    'target': e['target'],
-                    'name': e['name'],
-                    'perms': e['perms'],
-                    'type': e['type']
-                })
-            return ret
-        else:
-            for e in self['entries']:
-                if e['type'] == 'file' and e['target'] == entry_hash:
-                    return e
-                elif e['type'] == 'dir':
-                    data = e.get_hash_data(entry_hash)
-                    if data:
-                        return data
-            return None
-
-    def get_path_data(self, path):
-        path_parts = path.split(b'/')
-        entry_idx = self['entry_idx'][path_parts[0]]
-        entry = self['entries'][entry_idx]
-        if len(path_parts) == 1:
-            return {
-                'target': entry['target'],
-                'name': entry['name'],
-                'perms': entry['perms'],
-                'type': entry['type']
-            }
-        else:
-            return entry.get_path_data(b'/'.join(path_parts[1:]))
+from .test_dir_iterator import DirectoryModel
 
 
 @patch('swh.storage.algos.diff._get_rev')
diff --git a/swh/storage/tests/algos/test_dir_iterator.py b/swh/storage/tests/algos/test_dir_iterator.py
new file mode 100644
index 000000000..44830c7b0
--- /dev/null
+++ b/swh/storage/tests/algos/test_dir_iterator.py
@@ -0,0 +1,153 @@
+
+# Copyright (C) 2018  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+
+from unittest.mock import patch
+
+from nose.tools import istest, nottest
+
+from swh.model.from_disk import DentryPerms
+from swh.model.hashutil import hash_to_bytes, MultiHash
+from swh.model.identifiers import directory_identifier
+from swh.storage.algos.dir_iterators import dir_iterator
+
+# flake8: noqa
+
+class DirectoryModel(object):
+    """
+    Quick and dirty directory model to ease the writing
+    of directory iterators and revision trees differential tests.
+    """
+    def __init__(self, name=''):
+        self.data = {}
+        self.data['name'] = name
+        self.data['perms'] = DentryPerms.directory
+        self.data['type'] = 'dir'
+        self.data['entries'] = []
+        self.data['entry_idx'] = {}
+
+    def __getitem__(self, item):
+        if item == 'target':
+            return hash_to_bytes(directory_identifier(self))
+        else:
+            return self.data[item]
+
+    def add_file(self, path, sha1=None):
+        path_parts = path.split(b'/')
+        sha1 = hash_to_bytes(sha1) if sha1 \
+            else MultiHash.from_data(path).digest()['sha1']
+        if len(path_parts) == 1:
+            self['entry_idx'][path] = len(self['entries'])
+            self['entries'].append({
+                'target': sha1,
+                'name': path,
+                'perms': DentryPerms.content,
+                'type': 'file'
+            })
+        else:
+            if not path_parts[0] in self['entry_idx']:
+                self['entry_idx'][path_parts[0]] = len(self['entries'])
+                self['entries'].append(DirectoryModel(path_parts[0]))
+            if path_parts[1]:
+                dir_idx = self['entry_idx'][path_parts[0]]
+                self['entries'][dir_idx].add_file(b'/'.join(path_parts[1:]), sha1)
+
+    def get_hash_data(self, entry_hash):
+        if self['target'] == entry_hash:
+            ret = []
+            for e in self['entries']:
+                ret.append({
+                    'target': e['target'],
+                    'name': e['name'],
+                    'perms': e['perms'],
+                    'type': e['type']
+                })
+            return ret
+        else:
+            for e in self['entries']:
+                if e['type'] == 'file' and e['target'] == entry_hash:
+                    return e
+                elif e['type'] == 'dir':
+                    data = e.get_hash_data(entry_hash)
+                    if data:
+                        return data
+            return None
+
+    def get_path_data(self, path):
+        path_parts = path.split(b'/')
+        entry_idx = self['entry_idx'][path_parts[0]]
+        entry = self['entries'][entry_idx]
+        if len(path_parts) == 1:
+            return {
+                'target': entry['target'],
+                'name': entry['name'],
+                'perms': entry['perms'],
+                'type': entry['type']
+            }
+        else:
+            return entry.get_path_data(b'/'.join(path_parts[1:]))
+
+
+@patch('swh.storage.algos.dir_iterators._get_dir')
+class TestDirectoryIterator(unittest.TestCase):
+
+    @nottest
+    def check_iterated_paths(self, dir_model, expected_paths_order,
+                             mock_get_dir):
+
+        def _get_dir(*args, **kwargs):
+            return dir_model.get_hash_data(args[1])
+
+        mock_get_dir.side_effect = _get_dir # noqa
+        paths_order = [e['path'] for e in dir_iterator(None, dir_model['target'])]
+        self.assertEqual(paths_order, expected_paths_order)
+
+    @istest
+    def test_dir_iterator_empty_dir(self, mock_get_dir):
+        dir_model = DirectoryModel()
+        expected_paths_order = []
+        self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir)
+
+    @istest
+    def test_dir_iterator_no_empty_dirs(self, mock_get_dir):
+        dir_model = DirectoryModel()
+        dir_model.add_file(b'xyz/gtr/uhb')
+        dir_model.add_file(b'bca/ef')
+        dir_model.add_file(b'abc/ab')
+        dir_model.add_file(b'abc/bc')
+        dir_model.add_file(b'xyz/ouy/poi')
+
+        expected_paths_order = [b'abc',
+                                b'abc/ab',
+                                b'abc/bc',
+                                b'bca',
+                                b'bca/ef',
+                                b'xyz',
+                                b'xyz/gtr',
+                                b'xyz/gtr/uhb',
+                                b'xyz/ouy',
+                                b'xyz/ouy/poi']
+
+        self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir)
+
+    @istest
+    def test_dir_iterator_with_empty_dirs(self, mock_get_dir):
+        dir_model = DirectoryModel()
+        dir_model.add_file(b'xyz/gtr/')
+        dir_model.add_file(b'bca/ef')
+        dir_model.add_file(b'abc/')
+        dir_model.add_file(b'xyz/ouy/poi')
+
+        expected_paths_order = [b'abc',
+                                b'bca',
+                                b'bca/ef',
+                                b'xyz',
+                                b'xyz/gtr',
+                                b'xyz/ouy',
+                                b'xyz/ouy/poi']
+
+        self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir)
-- 
GitLab