From d54c0666d009372fb2aaea7b4eb0d2cfcaaea987 Mon Sep 17 00:00:00 2001
From: Nicolas Dandrimont <nicolas@dandrimont.eu>
Date: Tue, 19 Sep 2017 13:40:21 +0200
Subject: [PATCH] from_disk: convert on-disk data to Software Heritage archive
 objects

Summary:
This module is a reimplementation of swh.model.git, with the underlying goal of
replacing it and fixing T709 in the process.

Reviewers: #reviewers!

Differential Revision: https://forge.softwareheritage.org/D248
---
 swh/model/from_disk.py                        | 341 +++++++++
 .../tests/generate_testdata_from_disk.py      |  92 +++
 swh/model/tests/test_from_disk.py             | 658 ++++++++++++++++++
 3 files changed, 1091 insertions(+)
 create mode 100644 swh/model/from_disk.py
 create mode 100644 swh/model/tests/generate_testdata_from_disk.py
 create mode 100644 swh/model/tests/test_from_disk.py

diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
new file mode 100644
index 00000000..2fec859a
--- /dev/null
+++ b/swh/model/from_disk.py
@@ -0,0 +1,341 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import enum
+import os
+import stat
+
+from . import hashutil
+from .merkle import MerkleLeaf, MerkleNode
+from .identifiers import (
+    directory_identifier,
+    identifier_to_bytes as id_to_bytes,
+    identifier_to_str as id_to_str,
+)
+
+
+class DentryPerms(enum.IntEnum):
+    """Admissible permissions for directory entries."""
+    content = 0o100644
+    """Content"""
+    executable_content = 0o100755
+    """Executable content (e.g. executable script)"""
+    symlink = 0o120000
+    """Symbolic link"""
+    directory = 0o040000
+    """Directory"""
+    revision = 0o160000
+    """Revision (e.g. submodule)"""
+
+
+def mode_to_perms(mode):
+    """Convert a file mode to a permission compatible with Software Heritage
+    directory entries
+
+    Args:
+      mode (int): a file mode as returned by :func:`os.stat` in
+                  :attr:`os.stat_result.st_mode`
+
+    Returns:
+      DentryPerms: one of the following values:
+        :const:`DentryPerms.content`: plain file
+        :const:`DentryPerms.executable_content`: executable file
+        :const:`DentryPerms.symlink`: symbolic link
+        :const:`DentryPerms.directory`: directory
+
+    """
+    if stat.S_ISLNK(mode):
+        return DentryPerms.symlink
+    if stat.S_ISDIR(mode):
+        return DentryPerms.directory
+    else:
+        # file is executable in any way
+        if mode & (0o111):
+            return DentryPerms.executable_content
+        else:
+            return DentryPerms.content
+
+
+class Content(MerkleLeaf):
+    """Representation of a Software Heritage content as a node in a Merkle tree.
+
+    The current Merkle hash for the Content nodes is the `sha1_git`, which
+    makes it consistent with what :class:`Directory` uses for its own hash
+    computation.
+
+    """
+    __slots__ = []
+    type = 'content'
+
+    @classmethod
+    def from_bytes(cls, *, mode, data):
+        """Convert data (raw :class:`bytes`) to a Software Heritage content entry
+
+        Args:
+          mode (int): a file mode (passed to :func:`mode_to_perms`)
+          data (bytes): raw contents of the file
+        """
+        ret = hashutil.hash_data(data)
+        ret['length'] = len(data)
+        ret['perms'] = mode_to_perms(mode)
+        ret['data'] = data
+
+        return cls(ret)
+
+    @classmethod
+    def from_symlink(cls, *, path, mode):
+        """Convert a symbolic link to a Software Heritage content entry"""
+        return cls.from_bytes(mode=mode, data=os.readlink(path))
+
+    @classmethod
+    def from_file(cls, *, path, data=False):
+        """Compute the Software Heritage content entry corresponding to an on-disk
+        file.
+
+        The returned dictionary contains keys useful for both:
+        - loading the content in the archive (hashes, `length`)
+        - using the content as a directory entry in a directory
+
+        Args:
+          path (bytes): path to the file for which we're computing the
+            content entry
+          data (bool): add the file data to the entry
+        """
+        file_stat = os.lstat(path)
+        mode = file_stat.st_mode
+
+        if stat.S_ISLNK(mode):
+            # Symbolic link: return a file whose contents are the link target
+            return cls.from_symlink(path=path, mode=mode)
+        elif not stat.S_ISREG(mode):
+            # not a regular file: return the empty file instead
+            return cls.from_bytes(mode=mode, data=b'')
+
+        length = file_stat.st_size
+
+        if not data:
+            ret = hashutil.hash_path(path)
+        else:
+            chunks = []
+
+            def append_chunk(x, chunks=chunks):
+                chunks.append(x)
+
+            with open(path, 'rb') as fobj:
+                ret = hashutil.hash_file(fobj, length=length,
+                                         chunk_cb=append_chunk)
+
+            ret['data'] = b''.join(chunks)
+
+        ret['perms'] = mode_to_perms(mode)
+        ret['length'] = length
+
+        obj = cls(ret)
+        return obj
+
+    def __repr__(self):
+        return 'Content(id=%s)' % id_to_str(self.hash)
+
+    def compute_hash(self):
+        return self.data['sha1_git']
+
+
+def accept_all_directories(dirname, entries):
+    """Default filter for :func:`Directory.from_disk` accepting all
+    directories
+
+    Args:
+      dirname (bytes): directory name
+      entries (list): directory entries
+    """
+    return True
+
+
+def ignore_empty_directories(dirname, entries):
+    """Filter for :func:`directory_to_objects` ignoring empty directories
+
+    Args:
+      dirname (bytes): directory name
+      entries (list): directory entries
+    Returns:
+      True if the directory is not empty, false if the directory is empty
+    """
+    return bool(entries)
+
+
+def ignore_named_directories(names, *, case_sensitive=True):
+    """Filter for :func:`directory_to_objects` to ignore directories named one
+    of names.
+
+    Args:
+      names (list of bytes): names to ignore
+      case_sensitive (bool): whether to do the filtering in a case sensitive
+        way
+    Returns:
+      a directory filter for :func:`directory_to_objects`
+    """
+    if not case_sensitive:
+        names = [name.lower() for name in names]
+
+    def named_filter(dirname, entries,
+                     names=names, case_sensitive=case_sensitive):
+        if case_sensitive:
+            return dirname not in names
+        else:
+            return dirname.lower() not in names
+
+    return named_filter
+
+
+class Directory(MerkleNode):
+    """Representation of a Software Heritage directory as a node in a Merkle Tree.
+
+    This class can be used to generate, from an on-disk directory, all the
+    objects that need to be sent to the Software Heritage archive.
+
+    The :func:`from_disk` constructor allows you to generate the data structure
+    from a directory on disk. The resulting :class:`Directory` can then be
+    manipulated as a dictionary, using the path as key.
+
+    The :func:`collect` method is used to retrieve all the objects that need to
+    be added to the Software Heritage archive since the last collection, by
+    class (contents and directories).
+
+    When using the dict-like methods to update the contents of the directory,
+    the affected levels of hierarchy are reset and can be collected again using
+    the same method. This enables the efficient collection of updated nodes,
+    for instance when the client is applying diffs.
+    """
+    __slots__ = ['__entries']
+    type = 'directory'
+
+    @classmethod
+    def from_disk(cls, *, path, data=False,
+                  dir_filter=accept_all_directories):
+        """Compute the Software Heritage objects for a given directory tree
+
+        Args:
+          path (bytes): the directory to traverse
+          data (bool): whether to add the data to the content objects
+          dir_filter (function): a filter to ignore some directories by
+            name or contents. Takes two arguments: dirname and entries, and
+            returns True if the directory should be added, False if the
+            directory should be ignored.
+        """
+
+        top_path = path
+        dirs = {}
+
+        for root, dentries, fentries in os.walk(top_path, topdown=False):
+            entries = {}
+            # Join fentries and dentries in the same processing, as symbolic
+            # links to directories appear in dentries...
+            for name in fentries + dentries:
+                path = os.path.join(root, name)
+                if not os.path.isdir(path) or os.path.islink(path):
+                    content = Content.from_file(path=path, data=data)
+                    entries[name] = content
+                else:
+                    if dir_filter(name, dirs[path].entries):
+                        entries[name] = dirs[path]
+
+            dirs[root] = cls({'name': os.path.basename(root)})
+            dirs[root].update(entries)
+
+        return dirs[top_path]
+
+    def __init__(self, data=None):
+        super().__init__(data=data)
+        self.__entries = None
+
+    def invalidate_hash(self):
+        self.__entries = None
+        super().invalidate_hash()
+
+    @staticmethod
+    def child_to_directory_entry(name, child):
+        if isinstance(child, Directory):
+            return {
+                'type': 'dir',
+                'perms': DentryPerms.directory,
+                'target': child.hash,
+                'name': name,
+            }
+        elif isinstance(child, Content):
+            return {
+                'type': 'file',
+                'perms': child.data['perms'],
+                'target': child.hash,
+                'name': name,
+            }
+        else:
+            raise ValueError('unknown child')
+
+    def get_data(self, **kwargs):
+        return {
+            'id': self.hash,
+            'entries': self.entries,
+        }
+
+    @property
+    def entries(self):
+        if self.__entries is None:
+            self.__entries = [
+                self.child_to_directory_entry(name, child)
+                for name, child in self.items()
+            ]
+
+        return self.__entries
+
+    def compute_hash(self):
+        return id_to_bytes(directory_identifier({'entries': self.entries}))
+
+    def __getitem__(self, key):
+        if not isinstance(key, bytes):
+            raise ValueError('Can only get a bytes from directory')
+
+        # Convenience shortcut
+        if key == b'':
+            return self
+
+        if b'/' not in key:
+            return super().__getitem__(key)
+        else:
+            key1, key2 = key.split(b'/', 1)
+            return super().__getitem__(key1)[key2]
+
+    def __setitem__(self, key, value):
+        if not isinstance(key, bytes):
+            raise ValueError('Can only set a bytes directory entry')
+        if not isinstance(value, (Content, Directory)):
+            raise ValueError('Can only set a directory entry to a Content or '
+                             'Directory')
+
+        if key == b'':
+            raise ValueError('Directory entry must have a name')
+        if b'\x00' in key:
+            raise ValueError('Directory entry name must not contain nul bytes')
+
+        if b'/' not in key:
+            return super().__setitem__(key, value)
+        else:
+            key1, key2 = key.rsplit(b'/', 1)
+            self[key1].add_child(key2, value)
+
+    def __delitem__(self, key):
+        if not isinstance(key, bytes):
+            raise ValueError('Can only delete a bytes directory entry')
+
+        if b'/' not in key:
+            super().__delitem__(key)
+        else:
+            key1, key2 = key.rsplit(b'/', 1)
+            del super().__getitem__(key1)[key2]
+
+    def __repr__(self):
+        return 'Directory(%s, id=%s)' % (
+            self.data['name'],
+            id_to_str(self.hash) if self.hash else '?',
+        )
diff --git a/swh/model/tests/generate_testdata_from_disk.py b/swh/model/tests/generate_testdata_from_disk.py
new file mode 100644
index 00000000..35d4f480
--- /dev/null
+++ b/swh/model/tests/generate_testdata_from_disk.py
@@ -0,0 +1,92 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from operator import itemgetter
+import os
+import sys
+
+from swh.model.from_disk import Directory, DentryPerms
+from swh.model.hashutil import ALGORITHMS, hash_to_hex
+
+
+def generate_from_directory(varname, directory, indent=0):
+    """Generate test data from a given directory"""
+    def get_data(member, path):
+        yield (path, member.get_data())
+        if isinstance(member, Directory):
+            for name, child in member.items():
+                yield from get_data(child, os.path.join(path, name))
+
+    data = dict(get_data(directory, b''))
+    out = []
+
+    def format_hash(h, indent=0):
+        spindent = ' ' * indent
+        if len(h) > 20:
+            cutoff = len(h)//2
+            parts = h[:cutoff], h[cutoff:]
+        else:
+            parts = [h]
+
+        out.append('hash_to_bytes(\n')
+        for part in parts:
+            out.append(spindent + '    %s\n' % repr(hash_to_hex(part)))
+        out.append(spindent + ')')
+
+    def format_dict_items(d, indent=0):
+        spindent = ' ' * indent
+        for key, value in sorted(d.items()):
+            if isinstance(key, bytes):
+                out.append(spindent + repr(key) + ': {\n')
+                format_dict_items(value, indent=indent + 4)
+                out.append(spindent + '}')
+            else:
+                out.append(spindent + repr(key) + ': ')
+                if key == 'entries':
+                    if not value:
+                        out.append('[]')
+                    else:
+                        out.append('[')
+                        last_index = len(value) - 1
+                        for i, entry in enumerate(
+                                sorted(value, key=itemgetter('name'))):
+                            if i:
+                                out.append(' ')
+                            out.append('{\n')
+                            format_dict_items(entry, indent=indent + 4)
+                            if i != last_index:
+                                out.append(spindent + '},')
+                        out.append(spindent + '}]')
+                elif key in ALGORITHMS | {'id', 'target'}:
+                    format_hash(value, indent=indent)
+                elif isinstance(value, DentryPerms):
+                    out.append(str(value))
+                else:
+                    out.append(repr(value))
+            out.append(',\n')
+
+    spindent = ' ' * indent
+    out.append(spindent + '%s = {\n' % varname)
+    format_dict_items(data, indent=4 + indent)
+    out.append(spindent + '}')
+
+    return ''.join(out)
+
+
+if __name__ == '__main__':
+    if not sys.argv[1:]:
+        print("Usage: %s dir1 dir2" % sys.argv[0], file=sys.stderr)
+        exit(2)
+
+    for dirname in sys.argv[1:]:
+        basename = os.path.basename(dirname)
+        varname = 'expected_%s' % basename
+        testdata = generate_from_directory(
+            varname,
+            Directory.from_disk(path=os.fsencode(dirname)),
+            indent=8
+        )
+        print(testdata)
+        print()
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
new file mode 100644
index 00000000..3a8e76e0
--- /dev/null
+++ b/swh/model/tests/test_from_disk.py
@@ -0,0 +1,658 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import tarfile
+import tempfile
+import unittest
+
+from swh.model import from_disk
+from swh.model.from_disk import Content, Directory, DentryPerms
+from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes
+
+
+class ModeToPerms(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+
+        # Generate a full permissions map
+        self.perms_map = {}
+
+        # Symlinks
+        for i in range(0o120000, 0o127777 + 1):
+            self.perms_map[i] = DentryPerms.symlink
+
+        # Directories
+        for i in range(0o040000, 0o047777 + 1):
+            self.perms_map[i] = DentryPerms.directory
+
+        # Other file types: socket, regular file, block device, character
+        # device, fifo all map to regular files
+        for ft in [0o140000, 0o100000, 0o060000, 0o020000, 0o010000]:
+            for i in range(ft, ft + 0o7777 + 1):
+                if i & 0o111:
+                    # executable bits are set
+                    self.perms_map[i] = DentryPerms.executable_content
+                else:
+                    self.perms_map[i] = DentryPerms.content
+
+    def test_exhaustive_mode_to_perms(self):
+        for fmode, perm in self.perms_map.items():
+            self.assertEqual(perm, from_disk.mode_to_perms(fmode))
+
+
+class DataMixin:
+    maxDiff = None
+
+    def setUp(self):
+        self.tmpdir = tempfile.TemporaryDirectory(
+            prefix=b'swh.model.from_disk'
+        )
+        self.contents = {
+            b'file': {
+                'data': b'42\n',
+                'sha1': hash_to_bytes(
+                    '34973274ccef6ab4dfaaf86599792fa9c3fe4689'
+                ),
+                'sha256': hash_to_bytes(
+                    '084c799cd551dd1d8d5c5f9a5d593b2e'
+                    '931f5e36122ee5c793c1d08a19839cc0'
+                ),
+                'sha1_git': hash_to_bytes(
+                    'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'),
+                'blake2s256': hash_to_bytes(
+                    'd5fe1939576527e42cfd76a9455a2432'
+                    'fe7f56669564577dd93c4280e76d661d'
+                ),
+                'length': 3,
+                'mode': 0o100644
+            },
+        }
+
+        self.symlinks = {
+            b'symlink': {
+                'data': b'target',
+                'blake2s256': hash_to_bytes(
+                    '595d221b30fdd8e10e2fdf18376e688e'
+                    '9f18d56fd9b6d1eb6a822f8c146c6da6'
+                ),
+                'sha1': hash_to_bytes(
+                    '0e8a3ad980ec179856012b7eecf4327e99cd44cd'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '1de565933b05f74c75ff9a6520af5f9f8a5a2f1d'
+                ),
+                'sha256': hash_to_bytes(
+                    '34a04005bcaf206eec990bd9637d9fdb'
+                    '6725e0a0c0d4aebf003f17f4c956eb5c'
+                ),
+                'length': 6,
+                'perms': DentryPerms.symlink,
+            }
+        }
+
+        self.specials = {
+            b'fifo': os.mkfifo,
+            b'devnull': lambda path: os.mknod(path, device=os.makedev(1, 3)),
+        }
+
+        self.empty_content = {
+            'data': b'',
+            'length': 0,
+            'blake2s256': hash_to_bytes(
+                '69217a3079908094e11121d042354a7c'
+                '1f55b6482ca1a51e1b250dfd1ed0eef9'
+            ),
+            'sha1': hash_to_bytes(
+                'da39a3ee5e6b4b0d3255bfef95601890afd80709'
+            ),
+            'sha1_git': hash_to_bytes(
+                'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
+            ),
+            'sha256': hash_to_bytes(
+                'e3b0c44298fc1c149afbf4c8996fb924'
+                '27ae41e4649b934ca495991b7852b855'
+            ),
+            'perms': DentryPerms.content,
+        }
+
+        self.empty_directory = {
+            'id': hash_to_bytes(
+                '4b825dc642cb6eb9a060e54bf8d69288fbee4904'
+            ),
+            'entries': [],
+        }
+
+        # Generated with generate_testdata_from_disk
+        self.tarball_contents = {
+            b'': {
+                'entries': [{
+                    'name': b'bar',
+                    'perms': DentryPerms.directory,
+                    'target': hash_to_bytes(
+                        '3c1f578394f4623f74a0ba7fe761729f59fc6ec4'
+                    ),
+                    'type': 'dir',
+                }, {
+                    'name': b'empty-folder',
+                    'perms': DentryPerms.directory,
+                    'target': hash_to_bytes(
+                        '4b825dc642cb6eb9a060e54bf8d69288fbee4904'
+                    ),
+                    'type': 'dir',
+                }, {
+                    'name': b'foo',
+                    'perms': DentryPerms.directory,
+                    'target': hash_to_bytes(
+                        '2b41c40f0d1fbffcba12497db71fba83fcca96e5'
+                    ),
+                    'type': 'dir',
+                }, {
+                    'name': b'link-to-another-quote',
+                    'perms': DentryPerms.symlink,
+                    'target': hash_to_bytes(
+                        '7d5c08111e21c8a9f71540939998551683375fad'
+                    ),
+                    'type': 'file',
+                }, {
+                    'name': b'link-to-binary',
+                    'perms': DentryPerms.symlink,
+                    'target': hash_to_bytes(
+                        'e86b45e538d9b6888c969c89fbd22a85aa0e0366'
+                    ),
+                    'type': 'file',
+                }, {
+                    'name': b'link-to-foo',
+                    'perms': DentryPerms.symlink,
+                    'target': hash_to_bytes(
+                        '19102815663d23f8b75a47e7a01965dcdc96468c'
+                    ),
+                    'type': 'file',
+                }, {
+                    'name': b'some-binary',
+                    'perms': DentryPerms.executable_content,
+                    'target': hash_to_bytes(
+                        '68769579c3eaadbe555379b9c3538e6628bae1eb'
+                    ),
+                    'type': 'file',
+                }],
+                'id': hash_to_bytes(
+                    'e8b0f1466af8608c8a3fb9879db172b887e80759'
+                ),
+            },
+            b'bar': {
+                'entries': [{
+                    'name': b'barfoo',
+                    'perms': DentryPerms.directory,
+                    'target': hash_to_bytes(
+                        'c3020f6bf135a38c6df3afeb5fb38232c5e07087'
+                    ),
+                    'type': 'dir',
+                }],
+                'id': hash_to_bytes(
+                    '3c1f578394f4623f74a0ba7fe761729f59fc6ec4'
+                ),
+            },
+            b'bar/barfoo': {
+                'entries': [{
+                    'name': b'another-quote.org',
+                    'perms': DentryPerms.content,
+                    'target': hash_to_bytes(
+                        '133693b125bad2b4ac318535b84901ebb1f6b638'
+                    ),
+                    'type': 'file',
+                }],
+                'id': hash_to_bytes(
+                    'c3020f6bf135a38c6df3afeb5fb38232c5e07087'
+                ),
+            },
+            b'bar/barfoo/another-quote.org': {
+                'blake2s256': hash_to_bytes(
+                    'd26c1cad82d43df0bffa5e7be11a60e3'
+                    '4adb85a218b433cbce5278b10b954fe8'
+                ),
+                'length': 72,
+                'perms': DentryPerms.content,
+                'sha1': hash_to_bytes(
+                    '90a6138ba59915261e179948386aa1cc2aa9220a'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '133693b125bad2b4ac318535b84901ebb1f6b638'
+                ),
+                'sha256': hash_to_bytes(
+                    '3db5ae168055bcd93a4d08285dc99ffe'
+                    'e2883303b23fac5eab850273a8ea5546'
+                ),
+            },
+            b'empty-folder': {
+                'entries': [],
+                'id': hash_to_bytes(
+                    '4b825dc642cb6eb9a060e54bf8d69288fbee4904'
+                ),
+            },
+            b'foo': {
+                'entries': [{
+                    'name': b'barfoo',
+                    'perms': DentryPerms.symlink,
+                    'target': hash_to_bytes(
+                        '8185dfb2c0c2c597d16f75a8a0c37668567c3d7e'
+                    ),
+                    'type': 'file',
+                }, {
+                    'name': b'quotes.md',
+                    'perms': DentryPerms.content,
+                    'target': hash_to_bytes(
+                        '7c4c57ba9ff496ad179b8f65b1d286edbda34c9a'
+                    ),
+                    'type': 'file',
+                }, {
+                    'name': b'rel-link-to-barfoo',
+                    'perms': DentryPerms.symlink,
+                    'target': hash_to_bytes(
+                        'acac326ddd63b0bc70840659d4ac43619484e69f'
+                    ),
+                    'type': 'file',
+                }],
+                'id': hash_to_bytes(
+                    '2b41c40f0d1fbffcba12497db71fba83fcca96e5'
+                ),
+            },
+            b'foo/barfoo': {
+                'blake2s256': hash_to_bytes(
+                    'e1252f2caa4a72653c4efd9af871b62b'
+                    'f2abb7bb2f1b0e95969204bd8a70d4cd'
+                ),
+                'data': b'bar/barfoo',
+                'length': 10,
+                'perms': DentryPerms.symlink,
+                'sha1': hash_to_bytes(
+                    '9057ee6d0162506e01c4d9d5459a7add1fedac37'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '8185dfb2c0c2c597d16f75a8a0c37668567c3d7e'
+                ),
+                'sha256': hash_to_bytes(
+                    '29ad3f5725321b940332c78e403601af'
+                    'ff61daea85e9c80b4a7063b6887ead68'
+                ),
+            },
+            b'foo/quotes.md': {
+                'blake2s256': hash_to_bytes(
+                    'bf7ce4fe304378651ee6348d3e9336ed'
+                    '5ad603d33e83c83ba4e14b46f9b8a80b'
+                ),
+                'length': 66,
+                'perms': DentryPerms.content,
+                'sha1': hash_to_bytes(
+                    '1bf0bb721ac92c18a19b13c0eb3d741cbfadebfc'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '7c4c57ba9ff496ad179b8f65b1d286edbda34c9a'
+                ),
+                'sha256': hash_to_bytes(
+                    'caca942aeda7b308859eb56f909ec96d'
+                    '07a499491690c453f73b9800a93b1659'
+                ),
+            },
+            b'foo/rel-link-to-barfoo': {
+                'blake2s256': hash_to_bytes(
+                    'd9c327421588a1cf61f316615005a2e9'
+                    'c13ac3a4e96d43a24138d718fa0e30db'
+                ),
+                'data': b'../bar/barfoo',
+                'length': 13,
+                'perms': DentryPerms.symlink,
+                'sha1': hash_to_bytes(
+                    'dc51221d308f3aeb2754db48391b85687c2869f4'
+                ),
+                'sha1_git': hash_to_bytes(
+                    'acac326ddd63b0bc70840659d4ac43619484e69f'
+                ),
+                'sha256': hash_to_bytes(
+                    '8007d20db2af40435f42ddef4b8ad76b'
+                    '80adbec26b249fdf0473353f8d99df08'
+                ),
+            },
+            b'link-to-another-quote': {
+                'blake2s256': hash_to_bytes(
+                    '2d0e73cea01ba949c1022dc10c8a43e6'
+                    '6180639662e5dc2737b843382f7b1910'
+                ),
+                'data': b'bar/barfoo/another-quote.org',
+                'length': 28,
+                'perms': DentryPerms.symlink,
+                'sha1': hash_to_bytes(
+                    'cbeed15e79599c90de7383f420fed7acb48ea171'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '7d5c08111e21c8a9f71540939998551683375fad'
+                ),
+                'sha256': hash_to_bytes(
+                    'e6e17d0793aa750a0440eb9ad5b80b25'
+                    '8076637ef0fb68f3ac2e59e4b9ac3ba6'
+                ),
+            },
+            b'link-to-binary': {
+                'blake2s256': hash_to_bytes(
+                    '9ce18b1adecb33f891ca36664da676e1'
+                    '2c772cc193778aac9a137b8dc5834b9b'
+                ),
+                'data': b'some-binary',
+                'length': 11,
+                'perms': DentryPerms.symlink,
+                'sha1': hash_to_bytes(
+                    'd0248714948b3a48a25438232a6f99f0318f59f1'
+                ),
+                'sha1_git': hash_to_bytes(
+                    'e86b45e538d9b6888c969c89fbd22a85aa0e0366'
+                ),
+                'sha256': hash_to_bytes(
+                    '14126e97d83f7d261c5a6889cee73619'
+                    '770ff09e40c5498685aba745be882eff'
+                ),
+            },
+            b'link-to-foo': {
+                'blake2s256': hash_to_bytes(
+                    '08d6cad88075de8f192db097573d0e82'
+                    '9411cd91eb6ec65e8fc16c017edfdb74'
+                ),
+                'data': b'foo',
+                'length': 3,
+                'perms': DentryPerms.symlink,
+                'sha1': hash_to_bytes(
+                    '0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '19102815663d23f8b75a47e7a01965dcdc96468c'
+                ),
+                'sha256': hash_to_bytes(
+                    '2c26b46b68ffc68ff99b453c1d304134'
+                    '13422d706483bfa0f98a5e886266e7ae'
+                ),
+            },
+            b'some-binary': {
+                'blake2s256': hash_to_bytes(
+                    '922e0f7015035212495b090c27577357'
+                    'a740ddd77b0b9e0cd23b5480c07a18c6'
+                ),
+                'length': 5,
+                'perms': DentryPerms.executable_content,
+                'sha1': hash_to_bytes(
+                    '0bbc12d7f4a2a15b143da84617d95cb223c9b23c'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '68769579c3eaadbe555379b9c3538e6628bae1eb'
+                ),
+                'sha256': hash_to_bytes(
+                    'bac650d34a7638bb0aeb5342646d24e3'
+                    'b9ad6b44c9b383621faa482b990a367d'
+                ),
+            },
+        }
+
+    def tearDown(self):
+        self.tmpdir.cleanup()
+
+    def assertContentEqual(self, left, right, check_data=False):  # NoQA
+        if not isinstance(left, Content):
+            raise ValueError('%s is not a Content' % left)
+        if isinstance(right, Content):
+            right = right.get_data()
+
+        keys = DEFAULT_ALGORITHMS | {
+            'length',
+            'perms',
+        }
+        if check_data:
+            keys |= {'data'}
+
+        failed = []
+        for key in keys:
+            try:
+                lvalue = left.data[key]
+                if key == 'perms' and 'perms' not in right:
+                    rvalue = from_disk.mode_to_perms(right['mode'])
+                else:
+                    rvalue = right[key]
+            except KeyError:
+                failed.append(key)
+                continue
+
+            if lvalue != rvalue:
+                failed.append(key)
+
+        if failed:
+            raise self.failureException(
+                'Content mismatched:\n' +
+                '\n'.join(
+                    'content[%s] = %r != %r' % (
+                        key, left.data.get(key), right.get(key))
+                    for key in failed
+                )
+            )
+
+    def assertDirectoryEqual(self, left, right):  # NoQA
+        if not isinstance(left, Directory):
+            raise ValueError('%s is not a Directory' % left)
+        if isinstance(right, Directory):
+            right = right.get_data()
+
+        return self.assertCountEqual(left.entries, right['entries'])
+
+    def make_contents(self, directory):
+        for filename, content in self.contents.items():
+            path = os.path.join(directory, filename)
+            with open(path, 'wb') as f:
+                f.write(content['data'])
+            os.chmod(path, content['mode'])
+
+    def make_symlinks(self, directory):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(directory, filename)
+            os.symlink(symlink['data'], path)
+
+    def make_specials(self, directory):
+        for filename, fn in self.specials.items():
+            path = os.path.join(directory, filename)
+            fn(path)
+
+    def make_from_tarball(self, directory):
+        tarball = os.path.join(os.fsencode(os.path.dirname(__file__)),
+                               b'../../../..',
+                               b'swh-storage-testdata',
+                               b'dir-folders',
+                               b'sample-folder.tgz')
+
+        with tarfile.open(tarball, 'r:gz') as f:
+            f.extractall(os.fsdecode(directory))
+
+
+class TestContent(DataMixin, unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+
+    def test_data_to_content(self):
+        for filename, content in self.contents.items():
+            conv_content = Content.from_bytes(mode=content['mode'],
+                                              data=content['data'])
+            self.assertContentEqual(conv_content, content)
+
+
+class SymlinkToContent(DataMixin, unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.make_symlinks(self.tmpdir.name)
+
+    def test_symlink_to_content(self):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir.name, filename)
+            perms = 0o120000
+            conv_content = Content.from_symlink(path=path, mode=perms)
+            self.assertContentEqual(conv_content, symlink)
+
+
+class FileToContent(DataMixin, unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.make_contents(self.tmpdir.name)
+        self.make_symlinks(self.tmpdir.name)
+        self.make_specials(self.tmpdir.name)
+
+    def test_file_to_content(self):
+        for data in [False, True]:
+            for filename, symlink in self.symlinks.items():
+                path = os.path.join(self.tmpdir.name, filename)
+                conv_content = Content.from_file(path=path, data=data)
+                self.assertContentEqual(conv_content, symlink, check_data=data)
+
+            for filename, content in self.contents.items():
+                path = os.path.join(self.tmpdir.name, filename)
+                conv_content = Content.from_file(path=path, data=data)
+                self.assertContentEqual(conv_content, content, check_data=data)
+
+            for filename in self.specials:
+                path = os.path.join(self.tmpdir.name, filename)
+                conv_content = Content.from_file(path=path, data=data)
+                self.assertContentEqual(conv_content, self.empty_content,
+                                        check_data=data)
+
+
+class DirectoryToObjects(DataMixin, unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        contents = os.path.join(self.tmpdir.name, b'contents')
+        os.mkdir(contents)
+        self.make_contents(contents)
+        symlinks = os.path.join(self.tmpdir.name, b'symlinks')
+        os.mkdir(symlinks)
+        self.make_symlinks(symlinks)
+        specials = os.path.join(self.tmpdir.name, b'specials')
+        os.mkdir(specials)
+        self.make_specials(specials)
+        empties = os.path.join(self.tmpdir.name, b'empty1', b'empty2')
+        os.makedirs(empties)
+
+    def test_directory_to_objects(self):
+        directory = Directory.from_disk(path=self.tmpdir.name)
+
+        for name, value in self.contents.items():
+            self.assertContentEqual(directory[b'contents/' + name], value)
+
+        for name, value in self.symlinks.items():
+            self.assertContentEqual(directory[b'symlinks/' + name], value)
+
+        for name in self.specials:
+            self.assertContentEqual(
+                directory[b'specials/' + name],
+                self.empty_content,
+            )
+
+        self.assertEqual(
+            directory[b'empty1/empty2'].get_data(),
+            self.empty_directory,
+        )
+
+        # Raise on non existent file
+        with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
+            directory[b'empty1/nonexistent']
+
+        # Raise on non existent directory
+        with self.assertRaisesRegex(KeyError, "b'nonexistentdir'"):
+            directory[b'nonexistentdir/file']
+
+        objs = directory.collect()
+
+        self.assertCountEqual(['content', 'directory'], objs)
+
+        self.assertEqual(len(objs['directory']), 6)
+        self.assertEqual(len(objs['content']),
+                         len(self.contents)
+                         + len(self.symlinks)
+                         + 1)
+
+    def test_directory_to_objects_ignore_empty(self):
+        directory = Directory.from_disk(
+            path=self.tmpdir.name,
+            dir_filter=from_disk.ignore_empty_directories
+        )
+
+        for name, value in self.contents.items():
+            self.assertContentEqual(directory[b'contents/' + name], value)
+
+        for name, value in self.symlinks.items():
+            self.assertContentEqual(directory[b'symlinks/' + name], value)
+
+        for name in self.specials:
+            self.assertContentEqual(
+                directory[b'specials/' + name],
+                self.empty_content,
+            )
+
+        # empty directories have been ignored recursively
+        with self.assertRaisesRegex(KeyError, "b'empty1'"):
+            directory[b'empty1']
+        with self.assertRaisesRegex(KeyError, "b'empty1'"):
+            directory[b'empty1/empty2']
+
+        objs = directory.collect()
+
+        self.assertCountEqual(['content', 'directory'], objs)
+
+        self.assertEqual(len(objs['directory']), 4)
+        self.assertEqual(len(objs['content']),
+                         len(self.contents)
+                         + len(self.symlinks)
+                         + 1)
+
+    def test_directory_to_objects_ignore_name(self):
+        directory = Directory.from_disk(
+            path=self.tmpdir.name,
+            dir_filter=from_disk.ignore_named_directories([b'symlinks'])
+        )
+        for name, value in self.contents.items():
+            self.assertContentEqual(directory[b'contents/' + name], value)
+
+        for name in self.specials:
+            self.assertContentEqual(
+                directory[b'specials/' + name],
+                self.empty_content,
+            )
+
+        self.assertEqual(
+            directory[b'empty1/empty2'].get_data(),
+            self.empty_directory,
+        )
+
+        with self.assertRaisesRegex(KeyError, "b'symlinks'"):
+            directory[b'symlinks']
+
+        objs = directory.collect()
+
+        self.assertCountEqual(['content', 'directory'], objs)
+
+        self.assertEqual(len(objs['directory']), 5)
+        self.assertEqual(len(objs['content']),
+                         len(self.contents)
+                         + 1)
+
+
+class TarballTest(DataMixin, unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.make_from_tarball(self.tmpdir.name)
+
+    def test_contents_match(self):
+        directory = Directory.from_disk(
+            path=os.path.join(self.tmpdir.name, b'sample-folder')
+        )
+
+        for name, data in self.tarball_contents.items():
+            obj = directory[name]
+            if isinstance(obj, Content):
+                self.assertContentEqual(obj, data)
+            elif isinstance(obj, Directory):
+                self.assertDirectoryEqual(obj, data)
+            else:
+                raise self.failureException('Unknown type for %s' % obj)
-- 
GitLab