From 60c3aa16cf8778c7413606e2a532cfc47966d63c Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Thu, 20 Feb 2020 16:42:48 +0100
Subject: [PATCH] Add support for skipping large contents in from_disk.

It will be useful to loaders, as they currently load the entire
content in memory before deciding to skip it.
---
 swh/model/from_disk.py            | 52 +++++++++++++++++++++++++------
 swh/model/tests/test_from_disk.py | 44 ++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 9 deletions(-)

diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
index 64a6ef7a..45c60ca9 100644
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -83,6 +83,7 @@ class Content(MerkleLeaf):
         ret['length'] = len(data)
         ret['perms'] = mode_to_perms(mode)
         ret['data'] = data
+        ret['status'] = 'visible'
 
         return cls(ret)
 
@@ -92,7 +93,9 @@ class Content(MerkleLeaf):
         return cls.from_bytes(mode=mode, data=os.readlink(path))
 
     @classmethod
-    def from_file(cls, *, path, data=False, save_path=False):
+    def from_file(
+            cls, *, path, data=False, save_path=False,
+            max_content_length=None):
         """Compute the Software Heritage content entry corresponding to an
         on-disk file.
 
@@ -105,22 +108,46 @@ class Content(MerkleLeaf):
             content entry
           data (bool): add the file data to the entry
           save_path (bool): add the file path to the entry
+          max_content_length (Optional[int]): if given, all contents larger
+            than this will be skipped.
 
         """
         file_stat = os.lstat(path)
         mode = file_stat.st_mode
+        length = file_stat.st_size
+        too_large = max_content_length is not None \
+            and length > max_content_length
 
         if stat.S_ISLNK(mode):
             # Symbolic link: return a file whose contents are the link target
+
+            if too_large:
+                # Unlike large contents, we can't stream symlinks to
+                # MultiHash, and we don't want to fit them in memory if
+                # they exceed max_content_length either.
+                # Thankfully, this should not happen for reasonable values of
+                # max_content_length because of OS/filesystem limitations,
+                # so let's just raise an error.
+                raise Exception(f'Symlink too large ({length} bytes)')
+
             return cls.from_symlink(path=path, mode=mode)
         elif not stat.S_ISREG(mode):
             # not a regular file: return the empty file instead
             return cls.from_bytes(mode=mode, data=b'')
 
-        length = file_stat.st_size
+        if too_large:
+            skip_reason = 'Content too large'
+        elif not data:
+            skip_reason = 'Skipping file content'
+        else:
+            skip_reason = None
 
-        if not data:
-            ret = MultiHash.from_path(path).digest()
+        if skip_reason:
+            ret = {
+                **MultiHash.from_path(path).digest(),
+                'status': 'absent',
+                'reason': skip_reason,
+            }
         else:
             h = MultiHash(length=length)
             chunks = []
@@ -132,8 +159,11 @@ class Content(MerkleLeaf):
                     h.update(chunk)
                     chunks.append(chunk)
 
-            ret = h.digest()
-            ret['data'] = b''.join(chunks)
+            ret = {
+                **h.digest(),
+                'status': 'visible',
+                'data': b''.join(chunks),
+            }
 
         if save_path:
             ret['path'] = path
@@ -221,7 +251,8 @@ class Directory(MerkleNode):
 
     @classmethod
     def from_disk(cls, *, path, data=False, save_path=False,
-                  dir_filter=accept_all_directories):
+                  dir_filter=accept_all_directories,
+                  max_content_length=None):
         """Compute the Software Heritage objects for a given directory tree
 
         Args:
@@ -232,6 +263,8 @@ class Directory(MerkleNode):
             name or contents. Takes two arguments: dirname and entries, and
             returns True if the directory should be added, False if the
             directory should be ignored.
+          max_content_length (Optional[int]): if given, all contents larger
+            than this will be skipped.
         """
 
         top_path = path
@@ -244,8 +277,9 @@ class Directory(MerkleNode):
             for name in fentries + dentries:
                 path = os.path.join(root, name)
                 if not os.path.isdir(path) or os.path.islink(path):
-                    content = Content.from_file(path=path, data=data,
-                                                save_path=save_path)
+                    content = Content.from_file(
+                        path=path, data=data, save_path=save_path,
+                        max_content_length=max_content_length)
                     entries[name] = content
                 else:
                     if dir_filter(name, dirs[path].entries):
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
index 7b21d20e..1c747370 100644
--- a/swh/model/tests/test_from_disk.py
+++ b/swh/model/tests/test_from_disk.py
@@ -525,6 +525,50 @@ class FileToContent(DataMixin, unittest.TestCase):
                 conv_content = Content.from_file(path=path, data=data)
                 self.assertContentEqual(conv_content, self.empty_content)
 
+    def test_symlink_max_length(self):
+        for max_content_length in [4, 10]:
+            for filename, symlink in self.symlinks.items():
+                path = os.path.join(self.tmpdir_name, filename)
+                content = Content.from_file(path=path, data=True)
+                if content.data['length'] > max_content_length:
+                    with pytest.raises(Exception, match='too large'):
+                        Content.from_file(
+                            path=path, data=True,
+                            max_content_length=max_content_length)
+                else:
+                    limited_content = Content.from_file(
+                        path=path, data=True,
+                        max_content_length=max_content_length)
+                    assert content == limited_content
+
+    def test_file_max_length(self):
+        for max_content_length in [2, 4]:
+            for filename, content in self.contents.items():
+                path = os.path.join(self.tmpdir_name, filename)
+                content = Content.from_file(path=path, data=True)
+                limited_content = Content.from_file(
+                    path=path, data=True,
+                    max_content_length=max_content_length)
+                assert content.data['length'] == limited_content.data['length']
+                assert content.data['status'] == 'visible'
+                if content.data['length'] > max_content_length:
+                    assert limited_content.data['status'] == 'absent'
+                    assert limited_content.data['reason'] \
+                        == 'Content too large'
+                else:
+                    assert limited_content.data['status'] == 'visible'
+                    assert limited_content.data['data'] == content.data['data']
+
+    def test_special_file_max_length(self):
+        for max_content_length in [None, 0, 1]:
+            for filename in self.specials:
+                path = os.path.join(self.tmpdir_name, filename)
+                content = Content.from_file(path=path, data=True)
+                limited_content = Content.from_file(
+                    path=path, data=True,
+                    max_content_length=max_content_length)
+                assert limited_content == content
+
     def test_file_to_content_with_path(self):
         for filename, content in self.contents.items():
             content_w_path = content.copy()
-- 
GitLab