From d65a844ac33f6e61de4bde07b08ce62ab642b64b Mon Sep 17 00:00:00 2001 From: Pierre-Yves David <pierre-yves.david@ens-lyon.org> Date: Tue, 14 May 2024 17:18:34 +0200 Subject: [PATCH] DiskBackedContent: remove the class in favor of a simpler composition approach Instead of having multiple class and `object_type` value, we just adds a few lines in the main `model.Content` class to retrieved data on demand. The `with_data` logic already existed there anyway. This will avoid having from_disk extending the model from the outside. --- swh/model/from_disk.py | 57 +++++-------------------------- swh/model/model.py | 38 ++++++++++++++++----- swh/model/tests/test_from_disk.py | 30 ++++++++-------- 3 files changed, 54 insertions(+), 71 deletions(-) diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py index 015f7c42..9e23860e 100644 --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -10,7 +10,6 @@ filesystem, and convert them to in-memory data structures, which can then be exported to SWH data model objects, as defined in :mod:`swh.model.model`. """ -import datetime import enum import fnmatch import glob @@ -33,7 +32,6 @@ from typing import ( import warnings import attr -from attrs_strict import type_validator from typing_extensions import Final from . import model @@ -68,47 +66,12 @@ class FromDiskType(enum.Enum): @attr.s(frozen=True, slots=True) -class DiskBackedContent(model.BaseContent): - """Content-like class, which allows lazy-loading data from the disk.""" +class DiskBackedData: + path = attr.ib(type=bytes) - object_type: Final = "content_file" - - sha1 = attr.ib(type=bytes, validator=type_validator()) - sha1_git = attr.ib(type=model.Sha1Git, validator=type_validator()) - sha256 = attr.ib(type=bytes, validator=type_validator()) - blake2s256 = attr.ib(type=bytes, validator=type_validator()) - - length = attr.ib(type=int, validator=type_validator()) - - status = attr.ib( - type=str, - validator=attr.validators.in_(["visible", "hidden"]), - default="visible", - ) - - ctime = attr.ib( - type=Optional[datetime.datetime], - validator=type_validator(), - default=None, - eq=False, - ) - - path = attr.ib(type=Optional[bytes], default=None) - - @classmethod - def from_dict(cls, d): - return cls(**d) - - def __attrs_post_init__(self): - if self.path is None: - raise TypeError("path must not be None.") - - def with_data(self) -> model.Content: - args = self.to_dict() - del args["path"] - assert self.path is not None + def __call__(self) -> bytes: with open(self.path, "rb") as fd: - return model.Content.from_dict({**args, "data": fd.read()}) + return fd.read() class DentryPerms(enum.IntEnum): @@ -265,14 +228,12 @@ class Content(MerkleLeaf): """Builds a `model.BaseContent` object based on this leaf.""" data = self.get_data().copy() data.pop("perms", None) + path = data.pop("path", None) if data["status"] == "absent": - data.pop("path", None) return model.SkippedContent.from_dict(data) - elif "data" in data: - data.pop("path", None) - return model.Content.from_dict(data) - else: - return DiskBackedContent.from_dict(data) + elif "data" not in data: + data["get_data"] = DiskBackedData(path=path) + return model.Content.from_dict(data) def accept_all_directories( @@ -436,7 +397,7 @@ def iter_directory( # FIXME: read the data from disk later (when the # storage buffer is flushed). # - c_obj = cast(Union[model.Content, DiskBackedContent], obj) + c_obj = cast(model.Content, obj) contents.append(c_obj.with_data()) else: raise TypeError(f"Unexpected object type from disk: {obj}") diff --git a/swh/model/model.py b/swh/model/model.py index df25977e..29b5ff32 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -20,7 +20,18 @@ import collections import datetime from enum import Enum import hashlib -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, +) import attr from attr._make import _AndValidator @@ -1442,6 +1453,11 @@ class Content(BaseContent): ) data = attr.ib(type=Optional[bytes], validator=generic_type_validator, default=None) + get_data = attr.ib( + type=Optional[Callable[[], bytes]], + default=None, + cmp=False, + ) ctime = attr.ib( type=Optional[datetime.datetime], @@ -1467,11 +1483,10 @@ class Content(BaseContent): raise ValueError("ctime must be a timezone-aware datetime.") def to_dict(self): - content = super().to_dict() - if content["data"] is None: - del content["data"] - if content["ctime"] is None: - del content["ctime"] + content = super(Content, self.with_data()).to_dict() + for k in ("get_data", "data", "ctime"): + if content[k] is None: + del content[k] return content @classmethod @@ -1499,9 +1514,14 @@ class Content(BaseContent): This call is almost a no-op, but subclasses may overload this method to lazy-load data (eg. from disk or objstorage).""" - if self.data is None: - raise MissingData("Content data is None.") - return self + if self.data is not None: + return self + new_data = None + if self.get_data is not None: + new_data = self.get_data() + if new_data is None: + raise MissingData("Content data and get_data are both None.") + return attr.evolve(self, data=new_data, get_data=None) def unique_key(self) -> KeyType: return self.sha1 # TODO: use a dict of hashes diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py index 44991bd6..ee599538 100644 --- a/swh/model/tests/test_from_disk.py +++ b/swh/model/tests/test_from_disk.py @@ -17,7 +17,7 @@ from swh.model.from_disk import ( Content, DentryPerms, Directory, - DiskBackedContent, + DiskBackedData, FromDiskType, ) from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex @@ -67,10 +67,10 @@ class TestDiskBackedContent(unittest.TestCase): blake2s256=b"qux", ) with tempfile.NamedTemporaryFile(mode="w+b") as fd: - content = DiskBackedContent( + content = model.Content( length=42, status="visible", - path=fd.name, + get_data=DiskBackedData(path=fd.name), sha1=b"foo", sha1_git=b"bar", sha256=b"baz", @@ -86,10 +86,10 @@ class TestDiskBackedContent(unittest.TestCase): with tempfile.NamedTemporaryFile(mode="w+b") as fd: fd.write(b"foo") fd.seek(0) - content = DiskBackedContent( + content = model.Content( length=42, status="visible", - path=fd.name, + get_data=DiskBackedData(path=fd.name), sha1=b"foo", sha1_git=b"bar", sha256=b"baz", @@ -105,10 +105,10 @@ class TestDiskBackedContent(unittest.TestCase): def test_with_data_cannot_read(self): with tempfile.NamedTemporaryFile(mode="w+b") as fd: - content = DiskBackedContent( + content = model.Content( length=42, status="visible", - path=fd.name, + get_data=DiskBackedData(path=fd.name), sha1=b"foo", sha1_git=b"bar", sha256=b"baz", @@ -119,8 +119,8 @@ class TestDiskBackedContent(unittest.TestCase): content.with_data() def test_missing_path(self): - with pytest.raises(TypeError): - DiskBackedContent( + with pytest.raises(model.MissingData): + c = model.Content( length=42, status="visible", sha1=b"foo", @@ -128,17 +128,19 @@ class TestDiskBackedContent(unittest.TestCase): sha256=b"baz", blake2s256=b"qux", ) + c.with_data() - with pytest.raises(TypeError): - DiskBackedContent( + with pytest.raises(model.MissingData): + c = model.Content( length=42, status="visible", - path=None, + get_data=lambda: None, sha1=b"foo", sha1_git=b"bar", sha256=b"baz", blake2s256=b"qux", ) + c.with_data() class DataMixin: @@ -632,9 +634,9 @@ class FileToContent(DataMixin, unittest.TestCase): right.pop(key, None) assert model_content.with_data() == model.Content.from_dict(right) - right["path"] = path + right["get_data"] = DiskBackedData(path=path) del right["data"] - assert model_content == DiskBackedContent.from_dict(right) + assert model_content == model.Content.from_dict(right) def test_special_to_content_model(self): for filename in self.specials: -- GitLab