From 2eced78da85d9d10d9efe5ac514ff3b369cee0e1 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <antoine.lambert@inria.fr> Date: Wed, 26 Sep 2018 14:14:36 +0200 Subject: [PATCH] algos.dir_iterators: Implement iterator protocol in DirectoryIterator - add Python iterator protocol support in the DirectoryIterator class in order to easily visit in a recursive way any directory stored in the archive. - add convenient function dir_iterator wrapping the instantiation of the DirectoryIterator class - add tests Related T1177 --- swh/storage/algos/dir_iterators.py | 39 ++++- swh/storage/tests/algos/test_diff.py | 76 +-------- swh/storage/tests/algos/test_dir_iterator.py | 153 +++++++++++++++++++ 3 files changed, 190 insertions(+), 78 deletions(-) create mode 100644 swh/storage/tests/algos/test_dir_iterator.py diff --git a/swh/storage/algos/dir_iterators.py b/swh/storage/algos/dir_iterators.py index 897d0a82c..e96ee68cf 100644 --- a/swh/storage/algos/dir_iterators.py +++ b/swh/storage/algos/dir_iterators.py @@ -72,11 +72,10 @@ class DirectoryIterator(object): """ # get directory entries dir_data = _get_dir(self.storage, dir_id) - # sort them in lexicographical order - dir_data = sorted(dir_data, key=lambda e: e['name']) - # reverse the ordering in order to unstack the "smallest" - # entry each time the iterator advances - dir_data.reverse() + # sort them in lexicographical order and reverse the ordering + # in order to unstack the "smallest" entry each time the + # iterator advances + dir_data = sorted(dir_data, key=lambda e: e['name'], reverse=True) # push the directory frame to the main stack self.frames.append(dir_data) @@ -199,6 +198,36 @@ class DirectoryIterator(object): self.frames.pop() self.drop() + def __next__(self): + entry = self.step() + if not entry: + raise StopIteration + entry['path'] = self.current_path() + return entry + + def __iter__(self): + return DirectoryIterator(self.storage, self.root_dir_id, + self.base_path) + + +def dir_iterator(storage, dir_id): + """ + Return an iterator for recursively visiting a directory and + its sub-directories. The associated paths are visited in + lexicographic depth-first search order. + + Args: + storage (swh.storage.Storage): an instance of a swh storage + dir_id (bytes): a directory identifier + + Returns: + swh.storage.algos.dir_iterators.DirectoryIterator: an iterator + returning a dict at each iteration step describing a directory + entry. A 'path' field is added in that dict to store the + absolute path of the entry. + """ + return DirectoryIterator(storage, dir_id) + class Remaining(Enum): """ diff --git a/swh/storage/tests/algos/test_diff.py b/swh/storage/tests/algos/test_diff.py index 6551e9b39..c25e0f31b 100644 --- a/swh/storage/tests/algos/test_diff.py +++ b/swh/storage/tests/algos/test_diff.py @@ -7,85 +7,15 @@ import unittest -from nose.tools import istest, nottest from unittest.mock import patch +from nose.tools import istest, nottest + from swh.model.hashutil import hash_to_bytes from swh.model.identifiers import directory_identifier from swh.storage.algos import diff - -class DirectoryModel(object): - """ - Quick and dirty directory model to ease the writing - of revision trees differential tests. - """ - def __init__(self, name=''): - self.data = {} - self.data['name'] = name - self.data['perms'] = 16384 - self.data['type'] = 'dir' - self.data['entries'] = [] - self.data['entry_idx'] = {} - - def __getitem__(self, item): - if item == 'target': - return hash_to_bytes(directory_identifier(self)) - else: - return self.data[item] - - def add_file(self, path, sha1=None): - path_parts = path.split(b'/') - if len(path_parts) == 1: - self['entry_idx'][path] = len(self['entries']) - self['entries'].append({ - 'target': hash_to_bytes(sha1), - 'name': path, - 'perms': 33188, - 'type': 'file' - }) - else: - if not path_parts[0] in self['entry_idx']: - self['entry_idx'][path_parts[0]] = len(self['entries']) - self['entries'].append(DirectoryModel(path_parts[0])) - if path_parts[1]: - dir_idx = self['entry_idx'][path_parts[0]] - self['entries'][dir_idx].add_file(b'/'.join(path_parts[1:]), sha1) - - def get_hash_data(self, entry_hash): - if self['target'] == entry_hash: - ret = [] - for e in self['entries']: - ret.append({ - 'target': e['target'], - 'name': e['name'], - 'perms': e['perms'], - 'type': e['type'] - }) - return ret - else: - for e in self['entries']: - if e['type'] == 'file' and e['target'] == entry_hash: - return e - elif e['type'] == 'dir': - data = e.get_hash_data(entry_hash) - if data: - return data - return None - - def get_path_data(self, path): - path_parts = path.split(b'/') - entry_idx = self['entry_idx'][path_parts[0]] - entry = self['entries'][entry_idx] - if len(path_parts) == 1: - return { - 'target': entry['target'], - 'name': entry['name'], - 'perms': entry['perms'], - 'type': entry['type'] - } - else: - return entry.get_path_data(b'/'.join(path_parts[1:])) +from .test_dir_iterator import DirectoryModel @patch('swh.storage.algos.diff._get_rev') diff --git a/swh/storage/tests/algos/test_dir_iterator.py b/swh/storage/tests/algos/test_dir_iterator.py new file mode 100644 index 000000000..44830c7b0 --- /dev/null +++ b/swh/storage/tests/algos/test_dir_iterator.py @@ -0,0 +1,153 @@ + +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest + +from unittest.mock import patch + +from nose.tools import istest, nottest + +from swh.model.from_disk import DentryPerms +from swh.model.hashutil import hash_to_bytes, MultiHash +from swh.model.identifiers import directory_identifier +from swh.storage.algos.dir_iterators import dir_iterator + +# flake8: noqa + +class DirectoryModel(object): + """ + Quick and dirty directory model to ease the writing + of directory iterators and revision trees differential tests. + """ + def __init__(self, name=''): + self.data = {} + self.data['name'] = name + self.data['perms'] = DentryPerms.directory + self.data['type'] = 'dir' + self.data['entries'] = [] + self.data['entry_idx'] = {} + + def __getitem__(self, item): + if item == 'target': + return hash_to_bytes(directory_identifier(self)) + else: + return self.data[item] + + def add_file(self, path, sha1=None): + path_parts = path.split(b'/') + sha1 = hash_to_bytes(sha1) if sha1 \ + else MultiHash.from_data(path).digest()['sha1'] + if len(path_parts) == 1: + self['entry_idx'][path] = len(self['entries']) + self['entries'].append({ + 'target': sha1, + 'name': path, + 'perms': DentryPerms.content, + 'type': 'file' + }) + else: + if not path_parts[0] in self['entry_idx']: + self['entry_idx'][path_parts[0]] = len(self['entries']) + self['entries'].append(DirectoryModel(path_parts[0])) + if path_parts[1]: + dir_idx = self['entry_idx'][path_parts[0]] + self['entries'][dir_idx].add_file(b'/'.join(path_parts[1:]), sha1) + + def get_hash_data(self, entry_hash): + if self['target'] == entry_hash: + ret = [] + for e in self['entries']: + ret.append({ + 'target': e['target'], + 'name': e['name'], + 'perms': e['perms'], + 'type': e['type'] + }) + return ret + else: + for e in self['entries']: + if e['type'] == 'file' and e['target'] == entry_hash: + return e + elif e['type'] == 'dir': + data = e.get_hash_data(entry_hash) + if data: + return data + return None + + def get_path_data(self, path): + path_parts = path.split(b'/') + entry_idx = self['entry_idx'][path_parts[0]] + entry = self['entries'][entry_idx] + if len(path_parts) == 1: + return { + 'target': entry['target'], + 'name': entry['name'], + 'perms': entry['perms'], + 'type': entry['type'] + } + else: + return entry.get_path_data(b'/'.join(path_parts[1:])) + + +@patch('swh.storage.algos.dir_iterators._get_dir') +class TestDirectoryIterator(unittest.TestCase): + + @nottest + def check_iterated_paths(self, dir_model, expected_paths_order, + mock_get_dir): + + def _get_dir(*args, **kwargs): + return dir_model.get_hash_data(args[1]) + + mock_get_dir.side_effect = _get_dir # noqa + paths_order = [e['path'] for e in dir_iterator(None, dir_model['target'])] + self.assertEqual(paths_order, expected_paths_order) + + @istest + def test_dir_iterator_empty_dir(self, mock_get_dir): + dir_model = DirectoryModel() + expected_paths_order = [] + self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir) + + @istest + def test_dir_iterator_no_empty_dirs(self, mock_get_dir): + dir_model = DirectoryModel() + dir_model.add_file(b'xyz/gtr/uhb') + dir_model.add_file(b'bca/ef') + dir_model.add_file(b'abc/ab') + dir_model.add_file(b'abc/bc') + dir_model.add_file(b'xyz/ouy/poi') + + expected_paths_order = [b'abc', + b'abc/ab', + b'abc/bc', + b'bca', + b'bca/ef', + b'xyz', + b'xyz/gtr', + b'xyz/gtr/uhb', + b'xyz/ouy', + b'xyz/ouy/poi'] + + self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir) + + @istest + def test_dir_iterator_with_empty_dirs(self, mock_get_dir): + dir_model = DirectoryModel() + dir_model.add_file(b'xyz/gtr/') + dir_model.add_file(b'bca/ef') + dir_model.add_file(b'abc/') + dir_model.add_file(b'xyz/ouy/poi') + + expected_paths_order = [b'abc', + b'bca', + b'bca/ef', + b'xyz', + b'xyz/gtr', + b'xyz/ouy', + b'xyz/ouy/poi'] + + self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir) -- GitLab