From 6da524cb2bd8b870eaa0be477837694617cfff1b Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Thu, 20 Feb 2020 16:50:23 +0100
Subject: [PATCH] Add to_model() method to from_disk.{Content,Directory}, to
 convert to canonical model objects.

These methods will be used by loaders, so they can deal only with model
objects instead of having to do the same conversion themselves.

This also removes the `data` and `save_path` arguments of `from_file` and
`from_disk`, as data loading is now always deferred. To access the data,
users are expected to either open the data files themselves, or use
`.to_model().with_data()`.
---
 swh/model/from_disk.py            |  74 +++++++-----
 swh/model/model.py                |  27 ++++-
 swh/model/tests/test_from_disk.py | 182 ++++++++++++++++++++++++------
 swh/model/tests/test_model.py     |  17 +++
 4 files changed, 234 insertions(+), 66 deletions(-)

diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
index e1afc6b3..583df11c 100644
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -7,15 +7,36 @@
 import enum
 import os
 import stat
-from typing import List
+import attr
+from typing import List, Optional
 
-from .hashutil import MultiHash, HASH_BLOCK_SIZE
+from .hashutil import MultiHash
 from .merkle import MerkleLeaf, MerkleNode
 from .identifiers import (
     directory_entry_sort_key, directory_identifier,
     identifier_to_bytes as id_to_bytes,
     identifier_to_str as id_to_str,
 )
+from . import model
+
+
+@attr.s
+class DiskBackedContent(model.Content):
+    """Subclass of Content, which allows lazy-loading data from the disk."""
+    path = attr.ib(type=Optional[bytes], default=None)
+
+    def __attrs_post_init__(self):
+        if self.path is None:
+            raise TypeError('path must not be None.')
+
+    def with_data(self) -> model.Content:
+        args = self.to_dict()
+        del args['path']
+        assert self.path is not None
+        with open(self.path, 'rb') as fd:
+            return model.Content.from_dict({
+                **args,
+                'data': fd.read()})
 
 
 class DentryPerms(enum.IntEnum):
@@ -94,8 +115,7 @@ class Content(MerkleLeaf):
 
     @classmethod
     def from_file(
-            cls, *, path, data=False, save_path=False,
-            max_content_length=None):
+            cls, *, path, max_content_length=None):
         """Compute the Software Heritage content entry corresponding to an
         on-disk file.
 
@@ -104,9 +124,6 @@
         - using the content as a directory entry in a directory
 
         Args:
-            path (bytes): path to the file for which we're computing the
-                content entry
-            data (bool): add the file data to the entry
             save_path (bool): add the file path to the entry
             max_content_length (Optional[int]): if given, all contents larger
                 than this will be skipped.
@@ -137,36 +154,23 @@ class Content(MerkleLeaf):
 
         if too_large:
             skip_reason = 'Content too large'
-        elif not data:
-            skip_reason = 'Skipping file content'
         else:
             skip_reason = None
 
+        hashes = MultiHash.from_path(path).digest()
         if skip_reason:
             ret = {
-                **MultiHash.from_path(path).digest(),
+                **hashes,
                 'status': 'absent',
                 'reason': skip_reason,
             }
         else:
-            h = MultiHash(length=length)
-            chunks = []
-            with open(path, 'rb') as fobj:
-                while True:
-                    chunk = fobj.read(HASH_BLOCK_SIZE)
-                    if not chunk:
-                        break
-                    h.update(chunk)
-                    chunks.append(chunk)
-
             ret = {
-                **h.digest(),
+                **hashes,
                 'status': 'visible',
-                'data': b''.join(chunks),
             }
 
-        if save_path:
-            ret['path'] = path
+        ret['path'] = path
         ret['perms'] = mode_to_perms(mode)
         ret['length'] = length
 
@@ -179,6 +183,18 @@ class Content(MerkleLeaf):
     def compute_hash(self):
         return self.data['sha1_git']
 
+    def to_model(self) -> model.BaseContent:
+        """Builds a `model.BaseContent` object based on this leaf."""
+        data = self.get_data().copy()
+        data.pop('perms', None)
+        if data['status'] == 'absent':
+            data.pop('path', None)
+            return model.SkippedContent.from_dict(data)
+        elif 'data' in data:
+            return model.Content.from_dict(data)
+        else:
+            return DiskBackedContent.from_dict(data)
+
 
 def accept_all_directories(dirname, entries):
     """Default filter for :func:`Directory.from_disk` accepting all
@@ -250,7 +266,7 @@ class Directory(MerkleNode):
     type = 'directory'
 
     @classmethod
-    def from_disk(cls, *, path, data=False, save_path=False,
+    def from_disk(cls, *, path,
                   dir_filter=accept_all_directories, max_content_length=None):
         """Compute the Software Heritage objects for a given directory tree
 
@@ -278,8 +294,7 @@ class Directory(MerkleNode):
                 path = os.path.join(root, name)
                 if not os.path.isdir(path) or os.path.islink(path):
                     content = Content.from_file(
-                        path=path, data=data, save_path=save_path,
-                        max_content_length=max_content_length)
+                        path=path, max_content_length=max_content_length)
                     entries[name] = content
                 else:
                     if dir_filter(name, dirs[path].entries):
@@ -338,6 +353,11 @@ class Directory(MerkleNode):
     def compute_hash(self):
         return id_to_bytes(directory_identifier({'entries': self.entries}))
 
+    def to_model(self) -> model.Directory:
+        """Builds a `model.Directory` object based on this node;
+        ignoring its children."""
+        return model.Directory.from_dict(self.get_data())
+
     def __getitem__(self, key):
         if not isinstance(key, bytes):
             raise ValueError('Can only get a bytes from Directory')
diff --git a/swh/model/model.py b/swh/model/model.py
index 26accb65..d3c9c7d5 100644
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -18,6 +18,13 @@ from .identifiers import (
 )
 from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes
 
+
+class MissingData(Exception):
+    """Raised by `Content.with_data` when it has no way of fetching the
+    data (but not when fetching the data fails)."""
+    pass
+
+
 SHA1_SIZE = 20
 
 # TODO: Limit this to 20 bytes
@@ -362,6 +369,10 @@ class Directory(BaseModel, HashableObject):
 
 @attr.s(frozen=True)
 class BaseContent(BaseModel):
+    status = attr.ib(
+        type=str,
+        validator=attr.validators.in_(['visible', 'hidden', 'absent']))
+
     def to_dict(self):
         content = super().to_dict()
         if content['ctime'] is None:
@@ -402,8 +413,8 @@ class Content(BaseContent):
         type=str,
         default='visible',
         validator=attr.validators.in_(['visible', 'hidden']))
-    data = attr.ib(type=Optional[bytes],
-                   default=None)
+
+    data = attr.ib(type=Optional[bytes], default=None)
 
     ctime = attr.ib(type=Optional[datetime.datetime],
                     default=None)
@@ -424,6 +435,16 @@ class Content(BaseContent):
     def from_dict(cls, d):
         return super().from_dict(d, use_subclass=False)
 
+    def with_data(self) -> 'Content':
+        """Loads the `data` attribute; meaning that it is guaranteed not to
+        be None after this call.
+
+        This call is almost a no-op, but subclasses may overload this method
+        to lazy-load data (eg. from disk or objstorage)."""
+        if self.data is None:
+            raise MissingData('Content data is None.')
+        return self
+
 
 @attr.s(frozen=True)
 class SkippedContent(BaseContent):
@@ -432,7 +453,7 @@ class SkippedContent(BaseContent):
     sha256 = attr.ib(type=Optional[bytes])
     blake2s256 = attr.ib(type=Optional[bytes])
 
-    length = attr.ib(type=int)
+    length = attr.ib(type=Optional[int])
 
     status = attr.ib(
         type=str,
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
index 137ac148..d9881a15 100644
--- a/swh/model/tests/test_from_disk.py
+++ b/swh/model/tests/test_from_disk.py
@@ -12,8 +12,11 @@ import unittest
 from typing import ClassVar, Optional
 
 from swh.model import from_disk
-from swh.model.from_disk import Content, DentryPerms, Directory
+from swh.model.from_disk import (
+    Content, DentryPerms, Directory, DiskBackedContent
+)
 from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
+from swh.model import model
 
 TEST_DATA = os.path.join(os.path.dirname(__file__), 'data')
@@ -48,6 +51,57 @@ class ModeToPerms(unittest.TestCase):
             self.assertEqual(perm, from_disk.mode_to_perms(fmode))
 
 
+class TestDiskBackedContent(unittest.TestCase):
+    def test_with_data(self):
+        expected_content = model.Content(
+            length=42, status='visible', data=b'foo bar',
+            sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+        with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+            content = DiskBackedContent(
+                length=42, status='visible', path=fd.name,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+            fd.write(b'foo bar')
+            fd.seek(0)
+            content_with_data = content.with_data()
+
+        assert expected_content == content_with_data
+
+    def test_lazy_data(self):
+        with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+            fd.write(b'foo')
+            fd.seek(0)
+            content = DiskBackedContent(
+                length=42, status='visible', path=fd.name,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+            fd.write(b'bar')
+            fd.seek(0)
+            content_with_data = content.with_data()
+            fd.write(b'baz')
+            fd.seek(0)
+
+        assert content_with_data.data == b'bar'
+
+    def test_with_data_cannot_read(self):
+        with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+            content = DiskBackedContent(
+                length=42, status='visible', path=fd.name,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+        with pytest.raises(OSError):
+            content.with_data()
+
+    def test_missing_path(self):
+        with pytest.raises(TypeError):
+            DiskBackedContent(
+                length=42, status='visible',
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+        with pytest.raises(TypeError):
+            DiskBackedContent(
+                length=42, status='visible', path=None,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+
 class DataMixin:
     maxDiff = None  # type: ClassVar[Optional[int]]
 
@@ -401,19 +455,19 @@ class DataMixin:
     def tearDown(self):
         self.tmpdir.cleanup()
 
-    def assertContentEqual(self, left, right, *, check_data=False,  # noqa
+    def assertContentEqual(self, left, right, *,  # noqa
                            check_path=False):
         if not isinstance(left, Content):
             raise ValueError('%s is not a Content' % left)
         if isinstance(right, Content):
             right = right.get_data()
 
+        # Compare dictionaries
+
         keys = DEFAULT_ALGORITHMS | {
             'length',
             'perms',
         }
-        if check_data:
-            keys |= {'data'}
         if check_path:
             keys |= {'path'}
 
@@ -449,6 +503,9 @@ class DataMixin:
             right = right.get_data()
 
         assert left.entries == right['entries']
+        assert left.hash == right['id']
+
+        assert left.to_model() == model.Directory.from_dict(right)
 
     def make_contents(self, directory):
         for filename, content in self.contents.items():
@@ -498,6 +555,19 @@ class SymlinkToContent(DataMixin, unittest.TestCase):
             conv_content = Content.from_symlink(path=path, mode=perms)
             self.assertContentEqual(conv_content, symlink)
 
+    def test_symlink_to_base_model(self):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            perms = 0o120000
+            model_content = \
+                Content.from_symlink(path=path, mode=perms).to_model()
+
+            right = symlink.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
+
 
 class FileToContent(DataMixin, unittest.TestCase):
     def setUp(self):
@@ -507,45 +577,86 @@ class FileToContent(DataMixin, unittest.TestCase):
         self.make_specials(self.tmpdir_name)
 
     def test_symlink_to_content(self):
-        # Check whether loading the data works
-        for data in [True, False]:
-            for filename, symlink in self.symlinks.items():
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, symlink, check_data=data)
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, symlink)
 
     def test_file_to_content(self):
-        for data in [True, False]:
-            for filename, content in self.contents.items():
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, content, check_data=data)
+        for filename, content in self.contents.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, content)
 
     def test_special_to_content(self):
-        for data in [True, False]:
-            for filename in self.specials:
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, self.empty_content)
+        for filename in self.specials:
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, self.empty_content)
 
-            for path in ['/dev/null', '/dev/zero']:
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, self.empty_content)
+        for path in ['/dev/null', '/dev/zero']:
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, self.empty_content)
+
+    def test_symlink_to_content_model(self):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            model_content = Content.from_file(path=path).to_model()
+
+            right = symlink.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
+
+    def test_file_to_content_model(self):
+        for filename, content in self.contents.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            model_content = Content.from_file(path=path).to_model()
+
+            right = content.copy()
+            for key in ('perms', 'mode'):
+                right.pop(key, None)
+            assert model_content.with_data() == model.Content.from_dict(right)
+
+            right['path'] = path
+            del right['data']
+            assert model_content == DiskBackedContent.from_dict(right)
+
+    def test_special_to_content_model(self):
+        for filename in self.specials:
+            path = os.path.join(self.tmpdir_name, filename)
+            model_content = Content.from_file(path=path).to_model()
+
+            right = self.empty_content.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
+
+        for path in ['/dev/null', '/dev/zero']:
+            model_content = Content.from_file(path=path).to_model()
+
+            right = self.empty_content.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
 
     def test_symlink_max_length(self):
         for max_content_length in [4, 10]:
             for filename, symlink in self.symlinks.items():
                 path = os.path.join(self.tmpdir_name, filename)
-                content = Content.from_file(path=path, data=True)
+                content = Content.from_file(path=path)
                 if content.data['length'] > max_content_length:
                     with pytest.raises(Exception, match='too large'):
                         Content.from_file(
-                            path=path, data=True,
+                            path=path,
                             max_content_length=max_content_length)
                 else:
                     limited_content = Content.from_file(
-                        path=path, data=True,
+                        path=path,
                         max_content_length=max_content_length)
                     assert content == limited_content
 
@@ -553,9 +664,9 @@ class FileToContent(DataMixin, unittest.TestCase):
         for max_content_length in [2, 4]:
             for filename, content in self.contents.items():
                 path = os.path.join(self.tmpdir_name, filename)
-                content = Content.from_file(path=path, data=True)
+                content = Content.from_file(path=path)
                 limited_content = Content.from_file(
-                    path=path, data=True,
+                    path=path,
                     max_content_length=max_content_length)
                 assert content.data['length'] == limited_content.data['length']
                 assert content.data['status'] == 'visible'
@@ -565,15 +676,14 @@ class FileToContent(DataMixin, unittest.TestCase):
                         == 'Content too large'
                 else:
                     assert limited_content.data['status'] == 'visible'
-                    assert limited_content.data['data'] == content.data['data']
 
     def test_special_file_max_length(self):
         for max_content_length in [None, 0, 1]:
             for filename in self.specials:
                 path = os.path.join(self.tmpdir_name, filename)
-                content = Content.from_file(path=path, data=True)
+                content = Content.from_file(path=path)
                 limited_content = Content.from_file(
-                    path=path, data=True,
+                    path=path,
                     max_content_length=max_content_length)
                 assert limited_content == content
 
@@ -582,7 +692,7 @@ class FileToContent(DataMixin, unittest.TestCase):
             content_w_path = content.copy()
             path = os.path.join(self.tmpdir_name, filename)
            content_w_path['path'] = path
-            conv_content = Content.from_file(path=path, save_path=True)
+            conv_content = Content.from_file(path=path)
             self.assertContentEqual(conv_content, content_w_path,
                                     check_path=True)
 
@@ -762,12 +872,12 @@ class TarballTest(DataMixin, unittest.TestCase):
             path=os.path.join(self.tmpdir_name, b'sample-folder')
         )
 
-        for name, data in self.tarball_contents.items():
+        for name, expected in self.tarball_contents.items():
             obj = directory[name]
             if isinstance(obj, Content):
-                self.assertContentEqual(obj, data)
+                self.assertContentEqual(obj, expected)
             elif isinstance(obj, Directory):
-                self.assertDirectoryEqual(obj, data)
+                self.assertDirectoryEqual(obj, expected)
             else:
                 raise self.failureException('Unknown type for %s' % obj)
 
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
index e905725d..82f3dc96 100644
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -6,8 +6,10 @@ import copy
 
 from hypothesis import given
+import pytest
 
 from swh.model.model import Content, Directory, Revision, Release, Snapshot
+from swh.model.model import MissingData
 from swh.model.hashutil import hash_to_bytes
 from swh.model.hypothesis_strategies import objects, origins, origin_visits
 from swh.model.identifiers import (
@@ -69,6 +71,21 @@ def test_content_hashes():
     assert c.hashes() == hashes
 
 
+def test_content_data():
+    c = Content(
+        length=42, status='visible', data=b'foo',
+        sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+    assert c.with_data() == c
+
+
+def test_content_data_missing():
+    c = Content(
+        length=42, status='visible',
+        sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+    with pytest.raises(MissingData):
+        c.with_data()
+
+
 def test_directory_model_id_computation():
     dir_dict = dict(directory_example)
     del dir_dict['id']
-- 
GitLab
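
For illustration, here is a minimal usage sketch (not part of the patch) of how a
loader could consume the new API; the checkout paths below are hypothetical and
error handling is omitted:

    from swh.model.from_disk import Content, Directory

    # Build a from_disk.Content for one file, then convert it to a canonical
    # model object.  With this patch, to_model() defers loading the file:
    # it returns a DiskBackedContent that remembers the on-disk path, and
    # with_data() only reads the bytes when it is called.
    content = Content.from_file(path=b'/srv/checkout/README')  # hypothetical path
    model_content = content.to_model()          # no file I/O yet
    model_content = model_content.with_data()   # bytes are read here

    # Directory nodes convert the same way; to_model() only uses this node's
    # entries, children have to be converted separately.
    directory = Directory.from_disk(path=b'/srv/checkout')     # hypothetical path
    model_directory = directory.to_model()

Note that for files skipped because of max_content_length, to_model() returns a
model.SkippedContent instead, which carries no data to load.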