From d65a844ac33f6e61de4bde07b08ce62ab642b64b Mon Sep 17 00:00:00 2001
From: Pierre-Yves David <pierre-yves.david@ens-lyon.org>
Date: Tue, 14 May 2024 17:18:34 +0200
Subject: [PATCH] DiskBackedContent: remove the class in favor of a simpler
 composition approach

Instead of having multiple class and `object_type` value, we just adds
a few lines in the main `model.Content` class to retrieved data on
demand. The `with_data` logic already existed there anyway.

This will avoid having from_disk extending the model from the outside.
---
 swh/model/from_disk.py            | 57 +++++--------------------------
 swh/model/model.py                | 38 ++++++++++++++++-----
 swh/model/tests/test_from_disk.py | 30 ++++++++--------
 3 files changed, 54 insertions(+), 71 deletions(-)

diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
index 015f7c42..9e23860e 100644
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -10,7 +10,6 @@ filesystem, and convert them to in-memory data structures, which can then
 be exported to SWH data model objects, as defined in :mod:`swh.model.model`.
 """
 
-import datetime
 import enum
 import fnmatch
 import glob
@@ -33,7 +32,6 @@ from typing import (
 import warnings
 
 import attr
-from attrs_strict import type_validator
 from typing_extensions import Final
 
 from . import model
@@ -68,47 +66,12 @@ class FromDiskType(enum.Enum):
 
 
 @attr.s(frozen=True, slots=True)
-class DiskBackedContent(model.BaseContent):
-    """Content-like class, which allows lazy-loading data from the disk."""
+class DiskBackedData:
+    path = attr.ib(type=bytes)
 
-    object_type: Final = "content_file"
-
-    sha1 = attr.ib(type=bytes, validator=type_validator())
-    sha1_git = attr.ib(type=model.Sha1Git, validator=type_validator())
-    sha256 = attr.ib(type=bytes, validator=type_validator())
-    blake2s256 = attr.ib(type=bytes, validator=type_validator())
-
-    length = attr.ib(type=int, validator=type_validator())
-
-    status = attr.ib(
-        type=str,
-        validator=attr.validators.in_(["visible", "hidden"]),
-        default="visible",
-    )
-
-    ctime = attr.ib(
-        type=Optional[datetime.datetime],
-        validator=type_validator(),
-        default=None,
-        eq=False,
-    )
-
-    path = attr.ib(type=Optional[bytes], default=None)
-
-    @classmethod
-    def from_dict(cls, d):
-        return cls(**d)
-
-    def __attrs_post_init__(self):
-        if self.path is None:
-            raise TypeError("path must not be None.")
-
-    def with_data(self) -> model.Content:
-        args = self.to_dict()
-        del args["path"]
-        assert self.path is not None
+    def __call__(self) -> bytes:
         with open(self.path, "rb") as fd:
-            return model.Content.from_dict({**args, "data": fd.read()})
+            return fd.read()
 
 
 class DentryPerms(enum.IntEnum):
@@ -265,14 +228,12 @@ class Content(MerkleLeaf):
         """Builds a `model.BaseContent` object based on this leaf."""
         data = self.get_data().copy()
         data.pop("perms", None)
+        path = data.pop("path", None)
         if data["status"] == "absent":
-            data.pop("path", None)
             return model.SkippedContent.from_dict(data)
-        elif "data" in data:
-            data.pop("path", None)
-            return model.Content.from_dict(data)
-        else:
-            return DiskBackedContent.from_dict(data)
+        elif "data" not in data:
+            data["get_data"] = DiskBackedData(path=path)
+        return model.Content.from_dict(data)
 
 
 def accept_all_directories(
@@ -436,7 +397,7 @@ def iter_directory(
                 # FIXME: read the data from disk later (when the
                 # storage buffer is flushed).
                 #
-                c_obj = cast(Union[model.Content, DiskBackedContent], obj)
+                c_obj = cast(model.Content, obj)
                 contents.append(c_obj.with_data())
         else:
             raise TypeError(f"Unexpected object type from disk: {obj}")
diff --git a/swh/model/model.py b/swh/model/model.py
index df25977e..29b5ff32 100644
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -20,7 +20,18 @@ import collections
 import datetime
 from enum import Enum
 import hashlib
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)
 
 import attr
 from attr._make import _AndValidator
@@ -1442,6 +1453,11 @@ class Content(BaseContent):
     )
 
     data = attr.ib(type=Optional[bytes], validator=generic_type_validator, default=None)
+    get_data = attr.ib(
+        type=Optional[Callable[[], bytes]],
+        default=None,
+        cmp=False,
+    )
 
     ctime = attr.ib(
         type=Optional[datetime.datetime],
@@ -1467,11 +1483,10 @@ class Content(BaseContent):
                 raise ValueError("ctime must be a timezone-aware datetime.")
 
     def to_dict(self):
-        content = super().to_dict()
-        if content["data"] is None:
-            del content["data"]
-        if content["ctime"] is None:
-            del content["ctime"]
+        content = super(Content, self.with_data()).to_dict()
+        for k in ("get_data", "data", "ctime"):
+            if content[k] is None:
+                del content[k]
         return content
 
     @classmethod
@@ -1499,9 +1514,14 @@ class Content(BaseContent):
 
         This call is almost a no-op, but subclasses may overload this method
         to lazy-load data (eg. from disk or objstorage)."""
-        if self.data is None:
-            raise MissingData("Content data is None.")
-        return self
+        if self.data is not None:
+            return self
+        new_data = None
+        if self.get_data is not None:
+            new_data = self.get_data()
+        if new_data is None:
+            raise MissingData("Content data and get_data are both None.")
+        return attr.evolve(self, data=new_data, get_data=None)
 
     def unique_key(self) -> KeyType:
         return self.sha1  # TODO: use a dict of hashes
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
index 44991bd6..ee599538 100644
--- a/swh/model/tests/test_from_disk.py
+++ b/swh/model/tests/test_from_disk.py
@@ -17,7 +17,7 @@ from swh.model.from_disk import (
     Content,
     DentryPerms,
     Directory,
-    DiskBackedContent,
+    DiskBackedData,
     FromDiskType,
 )
 from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
@@ -67,10 +67,10 @@ class TestDiskBackedContent(unittest.TestCase):
             blake2s256=b"qux",
         )
         with tempfile.NamedTemporaryFile(mode="w+b") as fd:
-            content = DiskBackedContent(
+            content = model.Content(
                 length=42,
                 status="visible",
-                path=fd.name,
+                get_data=DiskBackedData(path=fd.name),
                 sha1=b"foo",
                 sha1_git=b"bar",
                 sha256=b"baz",
@@ -86,10 +86,10 @@ class TestDiskBackedContent(unittest.TestCase):
         with tempfile.NamedTemporaryFile(mode="w+b") as fd:
             fd.write(b"foo")
             fd.seek(0)
-            content = DiskBackedContent(
+            content = model.Content(
                 length=42,
                 status="visible",
-                path=fd.name,
+                get_data=DiskBackedData(path=fd.name),
                 sha1=b"foo",
                 sha1_git=b"bar",
                 sha256=b"baz",
@@ -105,10 +105,10 @@ class TestDiskBackedContent(unittest.TestCase):
 
     def test_with_data_cannot_read(self):
         with tempfile.NamedTemporaryFile(mode="w+b") as fd:
-            content = DiskBackedContent(
+            content = model.Content(
                 length=42,
                 status="visible",
-                path=fd.name,
+                get_data=DiskBackedData(path=fd.name),
                 sha1=b"foo",
                 sha1_git=b"bar",
                 sha256=b"baz",
@@ -119,8 +119,8 @@ class TestDiskBackedContent(unittest.TestCase):
             content.with_data()
 
     def test_missing_path(self):
-        with pytest.raises(TypeError):
-            DiskBackedContent(
+        with pytest.raises(model.MissingData):
+            c = model.Content(
                 length=42,
                 status="visible",
                 sha1=b"foo",
@@ -128,17 +128,19 @@ class TestDiskBackedContent(unittest.TestCase):
                 sha256=b"baz",
                 blake2s256=b"qux",
             )
+            c.with_data()
 
-        with pytest.raises(TypeError):
-            DiskBackedContent(
+        with pytest.raises(model.MissingData):
+            c = model.Content(
                 length=42,
                 status="visible",
-                path=None,
+                get_data=lambda: None,
                 sha1=b"foo",
                 sha1_git=b"bar",
                 sha256=b"baz",
                 blake2s256=b"qux",
             )
+            c.with_data()
 
 
 class DataMixin:
@@ -632,9 +634,9 @@ class FileToContent(DataMixin, unittest.TestCase):
                 right.pop(key, None)
             assert model_content.with_data() == model.Content.from_dict(right)
 
-            right["path"] = path
+            right["get_data"] = DiskBackedData(path=path)
             del right["data"]
-            assert model_content == DiskBackedContent.from_dict(right)
+            assert model_content == model.Content.from_dict(right)
 
     def test_special_to_content_model(self):
         for filename in self.specials:
-- 
GitLab