diff --git a/PKG-INFO b/PKG-INFO index f0a6d7d6146ff5af350c267bb615f22127d353bf..8171a982ef1699f1e87a9a39da03c74196ad94b8 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.0.56 +Version: 0.0.57 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/requirements-cli.txt b/requirements-cli.txt index f58c0c8bf39fc7ae9208e241cecc451265b321e2..7365d1f9c6169a1f81b7a2272b8fe543e524297a 100644 --- a/requirements-cli.txt +++ b/requirements-cli.txt @@ -1,2 +1,3 @@ +swh.core Click dulwich diff --git a/requirements.txt b/requirements.txt index 98825fa3ef8c1821335b73ab07e581747024d346..1577daa98bd6212c48392847cb70ab54c9cb523b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ vcversioner attrs hypothesis python-dateutil +iso8601 diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index f0a6d7d6146ff5af350c267bb615f22127d353bf..8171a982ef1699f1e87a9a39da03c74196ad94b8 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.0.56 +Version: 0.0.57 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/requires.txt b/swh.model.egg-info/requires.txt index 718cd8a1eee84fa9efce95ccfd41e2aff20ec815..7bf73112692c2e56620cf1d65df95a5ca1354e02 100644 --- a/swh.model.egg-info/requires.txt +++ b/swh.model.egg-info/requires.txt @@ -2,11 +2,13 @@ vcversioner attrs hypothesis python-dateutil +iso8601 [:python_version < "3.6"] pyblake2 [cli] +swh.core Click dulwich diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py index 64a6ef7a824f8da162a7152708caddfaad793861..583df11cf3cccb4a4dfe4fc4356d67034bb8d637 100644 --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -7,15 +7,36 @@ import enum import os import stat -from typing import List +import attr +from typing import List, Optional -from .hashutil import MultiHash, HASH_BLOCK_SIZE +from .hashutil import MultiHash from .merkle import MerkleLeaf, MerkleNode from .identifiers import ( - directory_identifier, + directory_entry_sort_key, directory_identifier, identifier_to_bytes as id_to_bytes, identifier_to_str as id_to_str, ) +from . import model + + +@attr.s +class DiskBackedContent(model.Content): + """Subclass of Content, which allows lazy-loading data from the disk.""" + path = attr.ib(type=Optional[bytes], default=None) + + def __attrs_post_init__(self): + if self.path is None: + raise TypeError('path must not be None.') + + def with_data(self) -> model.Content: + args = self.to_dict() + del args['path'] + assert self.path is not None + with open(self.path, 'rb') as fd: + return model.Content.from_dict({ + **args, + 'data': fd.read()}) class DentryPerms(enum.IntEnum): @@ -83,6 +104,7 @@ class Content(MerkleLeaf): ret['length'] = len(data) ret['perms'] = mode_to_perms(mode) ret['data'] = data + ret['status'] = 'visible' return cls(ret) @@ -92,7 +114,8 @@ class Content(MerkleLeaf): return cls.from_bytes(mode=mode, data=os.readlink(path)) @classmethod - def from_file(cls, *, path, data=False, save_path=False): + def from_file( + cls, *, path, max_content_length=None): """Compute the Software Heritage content entry corresponding to an on-disk file. @@ -101,42 +124,53 @@ class Content(MerkleLeaf): - using the content as a directory entry in a directory Args: - path (bytes): path to the file for which we're computing the - content entry - data (bool): add the file data to the entry save_path (bool): add the file path to the entry + max_content_length (Optional[int]): if given, all contents larger + than this will be skipped. """ file_stat = os.lstat(path) mode = file_stat.st_mode + length = file_stat.st_size + too_large = max_content_length is not None \ + and length > max_content_length if stat.S_ISLNK(mode): # Symbolic link: return a file whose contents are the link target + + if too_large: + # Unlike large contents, we can't stream symlinks to + # MultiHash, and we don't want to fit them in memory if + # they exceed max_content_length either. + # Thankfully, this should not happen for reasonable values of + # max_content_length because of OS/filesystem limitations, + # so let's just raise an error. + raise Exception(f'Symlink too large ({length} bytes)') + return cls.from_symlink(path=path, mode=mode) elif not stat.S_ISREG(mode): # not a regular file: return the empty file instead return cls.from_bytes(mode=mode, data=b'') - length = file_stat.st_size - - if not data: - ret = MultiHash.from_path(path).digest() + if too_large: + skip_reason = 'Content too large' + else: + skip_reason = None + + hashes = MultiHash.from_path(path).digest() + if skip_reason: + ret = { + **hashes, + 'status': 'absent', + 'reason': skip_reason, + } else: - h = MultiHash(length=length) - chunks = [] - with open(path, 'rb') as fobj: - while True: - chunk = fobj.read(HASH_BLOCK_SIZE) - if not chunk: - break - h.update(chunk) - chunks.append(chunk) - - ret = h.digest() - ret['data'] = b''.join(chunks) - - if save_path: - ret['path'] = path + ret = { + **hashes, + 'status': 'visible', + } + + ret['path'] = path ret['perms'] = mode_to_perms(mode) ret['length'] = length @@ -149,6 +183,18 @@ class Content(MerkleLeaf): def compute_hash(self): return self.data['sha1_git'] + def to_model(self) -> model.BaseContent: + """Builds a `model.BaseContent` object based on this leaf.""" + data = self.get_data().copy() + data.pop('perms', None) + if data['status'] == 'absent': + data.pop('path', None) + return model.SkippedContent.from_dict(data) + elif 'data' in data: + return model.Content.from_dict(data) + else: + return DiskBackedContent.from_dict(data) + def accept_all_directories(dirname, entries): """Default filter for :func:`Directory.from_disk` accepting all @@ -220,8 +266,9 @@ class Directory(MerkleNode): type = 'directory' @classmethod - def from_disk(cls, *, path, data=False, save_path=False, - dir_filter=accept_all_directories): + def from_disk(cls, *, path, + dir_filter=accept_all_directories, + max_content_length=None): """Compute the Software Heritage objects for a given directory tree Args: @@ -232,6 +279,8 @@ class Directory(MerkleNode): name or contents. Takes two arguments: dirname and entries, and returns True if the directory should be added, False if the directory should be ignored. + max_content_length (Optional[int]): if given, all contents larger + than this will be skipped. """ top_path = path @@ -244,8 +293,8 @@ class Directory(MerkleNode): for name in fentries + dentries: path = os.path.join(root, name) if not os.path.isdir(path) or os.path.islink(path): - content = Content.from_file(path=path, data=data, - save_path=save_path) + content = Content.from_file( + path=path, max_content_length=max_content_length) entries[name] = content else: if dir_filter(name, dirs[path].entries): @@ -291,17 +340,24 @@ class Directory(MerkleNode): @property def entries(self): + """Child nodes, sorted by name in the same way `directory_identifier` + does.""" if self.__entries is None: - self.__entries = [ + self.__entries = sorted(( self.child_to_directory_entry(name, child) for name, child in self.items() - ] + ), key=directory_entry_sort_key) return self.__entries def compute_hash(self): return id_to_bytes(directory_identifier({'entries': self.entries})) + def to_model(self) -> model.Directory: + """Builds a `model.Directory` object based on this node; + ignoring its children.""" + return model.Directory.from_dict(self.get_data()) + def __getitem__(self, key): if not isinstance(key, bytes): raise ValueError('Can only get a bytes from Directory') diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index 9257de142f168d62f7bbf868c4ed6811d6e57896..85fc76c4585ccba88b8f7e69eca8ae2f9ff34ea1 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -114,7 +114,7 @@ def content_identifier(content): return MultiHash.from_data(content['data']).digest() -def _sort_key(entry): +def directory_entry_sort_key(entry): """The sorting key for tree entries""" if entry['type'] == 'dir': return entry['name'] + b'/' @@ -182,7 +182,7 @@ def directory_identifier(directory): components = [] - for entry in sorted(directory['entries'], key=_sort_key): + for entry in sorted(directory['entries'], key=directory_entry_sort_key): components.extend([ _perms_to_bytes(entry['perms']), b'\x20', diff --git a/swh/model/merkle.py b/swh/model/merkle.py index 02c6f2b29d17e5f6d9dc5336fe760bfc68d1617e..9d97efdc55b1c0bf23c5abfdea8f995988197ea9 100644 --- a/swh/model/merkle.py +++ b/swh/model/merkle.py @@ -8,7 +8,7 @@ import abc import collections -from typing import List, Optional +from typing import Iterator, List, Optional, Set def deep_update(left, right): @@ -120,6 +120,13 @@ class MerkleNode(dict, metaclass=abc.ABCMeta): self.__hash = None self.collected = False + def __eq__(self, other): + return isinstance(other, MerkleNode) \ + and super().__eq__(other) and self.data == other.data + + def __ne__(self, other): + return not self.__eq__(other) + def invalidate_hash(self): """Invalidate the cached hash of the current node.""" if not self.__hash: @@ -266,6 +273,20 @@ class MerkleNode(dict, metaclass=abc.ABCMeta): for child in self.values(): child.reset_collect() + def iter_tree(self) -> Iterator['MerkleNode']: + """Yields all children nodes, recursively. Common nodes are + deduplicated. + """ + yield from self._iter_tree(set()) + + def _iter_tree( + self, seen: Set[bytes]) -> Iterator['MerkleNode']: + if self.hash not in seen: + seen.add(self.hash) + yield self + for child in self.values(): + yield from child._iter_tree(seen=seen) + class MerkleLeaf(MerkleNode): """A leaf to a Merkle tree. diff --git a/swh/model/model.py b/swh/model/model.py index 512824d60a1ab05254dbb7649d6f0ba2ffc54bd6..aff5a7d64a4d840710e66b0fe323395fab0b5c8d 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -7,10 +7,11 @@ import datetime from abc import ABCMeta, abstractmethod from enum import Enum -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Union import attr import dateutil.parser +import iso8601 from .identifiers import ( normalize_timestamp, directory_identifier, revision_identifier, @@ -18,6 +19,13 @@ from .identifiers import ( ) from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes + +class MissingData(Exception): + """Raised by `Content.with_data` when it has no way of fetching the + data (but not when fetching the data fails).""" + pass + + SHA1_SIZE = 20 # TODO: Limit this to 20 bytes @@ -76,9 +84,9 @@ class HashableObject(metaclass=ABCMeta): @attr.s(frozen=True) class Person(BaseModel): """Represents the author/committer of a revision or release.""" - name = attr.ib(type=bytes) - email = attr.ib(type=bytes) fullname = attr.ib(type=bytes) + name = attr.ib(type=Optional[bytes]) + email = attr.ib(type=Optional[bytes]) @attr.s(frozen=True) @@ -117,15 +125,31 @@ class TimestampWithTimezone(BaseModel): raise ValueError('offset too large: %d minutes' % value) @classmethod - def from_dict(cls, d): + def from_dict(cls, obj: Union[Dict, datetime.datetime, int]): """Builds a TimestampWithTimezone from any of the formats accepted by :func:`swh.model.normalize_timestamp`.""" - d = normalize_timestamp(d) + # TODO: this accept way more types than just dicts; find a better + # name + d = normalize_timestamp(obj) return cls( timestamp=Timestamp.from_dict(d['timestamp']), offset=d['offset'], negative_utc=d['negative_utc']) + @classmethod + def from_datetime(cls, dt: datetime.datetime): + return cls.from_dict(dt) + + @classmethod + def from_iso8601(cls, s): + """Builds a TimestampWithTimezone from an ISO8601-formatted string. + """ + dt = iso8601.parse_date(s) + tstz = cls.from_datetime(dt) + if dt.tzname() == '-00:00': + tstz = attr.evolve(tstz, negative_utc=True) + return tstz + @attr.s(frozen=True) class Origin(BaseModel): @@ -362,6 +386,10 @@ class Directory(BaseModel, HashableObject): @attr.s(frozen=True) class BaseContent(BaseModel): + status = attr.ib( + type=str, + validator=attr.validators.in_(['visible', 'hidden', 'absent'])) + def to_dict(self): content = super().to_dict() if content['ctime'] is None: @@ -384,6 +412,10 @@ class BaseContent(BaseModel): raise ValueError('{} is not a valid hash name.'.format(hash_name)) return getattr(self, hash_name) + def hashes(self) -> Dict[str, bytes]: + """Returns a dictionary {hash_name: hash_value}""" + return {algo: getattr(self, algo) for algo in DEFAULT_ALGORITHMS} + @attr.s(frozen=True) class Content(BaseContent): @@ -398,8 +430,8 @@ class Content(BaseContent): type=str, default='visible', validator=attr.validators.in_(['visible', 'hidden'])) - data = attr.ib(type=Optional[bytes], - default=None) + + data = attr.ib(type=Optional[bytes], default=None) ctime = attr.ib(type=Optional[datetime.datetime], default=None) @@ -420,6 +452,16 @@ class Content(BaseContent): def from_dict(cls, d): return super().from_dict(d, use_subclass=False) + def with_data(self) -> 'Content': + """Loads the `data` attribute; meaning that it is guaranteed not to + be None after this call. + + This call is almost a no-op, but subclasses may overload this method + to lazy-load data (eg. from disk or objstorage).""" + if self.data is None: + raise MissingData('Content data is None.') + return self + @attr.s(frozen=True) class SkippedContent(BaseContent): @@ -428,7 +470,7 @@ class SkippedContent(BaseContent): sha256 = attr.ib(type=Optional[bytes]) blake2s256 = attr.ib(type=Optional[bytes]) - length = attr.ib(type=int) + length = attr.ib(type=Optional[int]) status = attr.ib( type=str, diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py index 7b21d20e3af2ccb31aca3607e635845f50592c7d..d9881a1529e46c71f5484607dab5cb68b0803251 100644 --- a/swh/model/tests/test_from_disk.py +++ b/swh/model/tests/test_from_disk.py @@ -12,8 +12,11 @@ import unittest from typing import ClassVar, Optional from swh.model import from_disk -from swh.model.from_disk import Content, DentryPerms, Directory +from swh.model.from_disk import ( + Content, DentryPerms, Directory, DiskBackedContent +) from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex +from swh.model import model TEST_DATA = os.path.join(os.path.dirname(__file__), 'data') @@ -48,6 +51,57 @@ class ModeToPerms(unittest.TestCase): self.assertEqual(perm, from_disk.mode_to_perms(fmode)) +class TestDiskBackedContent(unittest.TestCase): + def test_with_data(self): + expected_content = model.Content( + length=42, status='visible', data=b'foo bar', + sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') + with tempfile.NamedTemporaryFile(mode='w+b') as fd: + content = DiskBackedContent( + length=42, status='visible', path=fd.name, + sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') + fd.write(b'foo bar') + fd.seek(0) + content_with_data = content.with_data() + + assert expected_content == content_with_data + + def test_lazy_data(self): + with tempfile.NamedTemporaryFile(mode='w+b') as fd: + fd.write(b'foo') + fd.seek(0) + content = DiskBackedContent( + length=42, status='visible', path=fd.name, + sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') + fd.write(b'bar') + fd.seek(0) + content_with_data = content.with_data() + fd.write(b'baz') + fd.seek(0) + + assert content_with_data.data == b'bar' + + def test_with_data_cannot_read(self): + with tempfile.NamedTemporaryFile(mode='w+b') as fd: + content = DiskBackedContent( + length=42, status='visible', path=fd.name, + sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') + + with pytest.raises(OSError): + content.with_data() + + def test_missing_path(self): + with pytest.raises(TypeError): + DiskBackedContent( + length=42, status='visible', + sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') + + with pytest.raises(TypeError): + DiskBackedContent( + length=42, status='visible', path=None, + sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') + + class DataMixin: maxDiff = None # type: ClassVar[Optional[int]] @@ -102,7 +156,6 @@ class DataMixin: self.specials = { b'fifo': os.mkfifo, - b'devnull': lambda path: os.mknod(path, device=os.makedev(1, 3)), } self.empty_content = { @@ -402,19 +455,19 @@ class DataMixin: def tearDown(self): self.tmpdir.cleanup() - def assertContentEqual(self, left, right, *, check_data=False, # noqa + def assertContentEqual(self, left, right, *, # noqa check_path=False): if not isinstance(left, Content): raise ValueError('%s is not a Content' % left) if isinstance(right, Content): right = right.get_data() + # Compare dictionaries + keys = DEFAULT_ALGORITHMS | { 'length', 'perms', } - if check_data: - keys |= {'data'} if check_path: keys |= {'path'} @@ -449,7 +502,10 @@ class DataMixin: if isinstance(right, Directory): right = right.get_data() - return self.assertCountEqual(left.entries, right['entries']) + assert left.entries == right['entries'] + assert left.hash == right['id'] + + assert left.to_model() == model.Directory.from_dict(right) def make_contents(self, directory): for filename, content in self.contents.items(): @@ -499,6 +555,19 @@ class SymlinkToContent(DataMixin, unittest.TestCase): conv_content = Content.from_symlink(path=path, mode=perms) self.assertContentEqual(conv_content, symlink) + def test_symlink_to_base_model(self): + for filename, symlink in self.symlinks.items(): + path = os.path.join(self.tmpdir_name, filename) + perms = 0o120000 + model_content = \ + Content.from_symlink(path=path, mode=perms).to_model() + + right = symlink.copy() + for key in ('perms', 'path', 'mode'): + right.pop(key, None) + right['status'] = 'visible' + assert model_content == model.Content.from_dict(right) + class FileToContent(DataMixin, unittest.TestCase): def setUp(self): @@ -507,34 +576,128 @@ class FileToContent(DataMixin, unittest.TestCase): self.make_symlinks(self.tmpdir_name) self.make_specials(self.tmpdir_name) + def test_symlink_to_content(self): + for filename, symlink in self.symlinks.items(): + path = os.path.join(self.tmpdir_name, filename) + conv_content = Content.from_file(path=path) + self.assertContentEqual(conv_content, symlink) + def test_file_to_content(self): - # Check whether loading the data works - for data in [True, False]: + for filename, content in self.contents.items(): + path = os.path.join(self.tmpdir_name, filename) + conv_content = Content.from_file(path=path) + self.assertContentEqual(conv_content, content) + + def test_special_to_content(self): + for filename in self.specials: + path = os.path.join(self.tmpdir_name, filename) + conv_content = Content.from_file(path=path) + self.assertContentEqual(conv_content, self.empty_content) + + for path in ['/dev/null', '/dev/zero']: + path = os.path.join(self.tmpdir_name, filename) + conv_content = Content.from_file(path=path) + self.assertContentEqual(conv_content, self.empty_content) + + def test_symlink_to_content_model(self): + for filename, symlink in self.symlinks.items(): + path = os.path.join(self.tmpdir_name, filename) + model_content = Content.from_file(path=path).to_model() + + right = symlink.copy() + for key in ('perms', 'path', 'mode'): + right.pop(key, None) + right['status'] = 'visible' + assert model_content == model.Content.from_dict(right) + + def test_file_to_content_model(self): + for filename, content in self.contents.items(): + path = os.path.join(self.tmpdir_name, filename) + model_content = Content.from_file(path=path).to_model() + + right = content.copy() + for key in ('perms', 'mode'): + right.pop(key, None) + assert model_content.with_data() == model.Content.from_dict(right) + + right['path'] = path + del right['data'] + assert model_content == DiskBackedContent.from_dict(right) + + def test_special_to_content_model(self): + for filename in self.specials: + path = os.path.join(self.tmpdir_name, filename) + model_content = Content.from_file(path=path).to_model() + + right = self.empty_content.copy() + for key in ('perms', 'path', 'mode'): + right.pop(key, None) + right['status'] = 'visible' + assert model_content == model.Content.from_dict(right) + + for path in ['/dev/null', '/dev/zero']: + model_content = Content.from_file(path=path).to_model() + + right = self.empty_content.copy() + for key in ('perms', 'path', 'mode'): + right.pop(key, None) + right['status'] = 'visible' + assert model_content == model.Content.from_dict(right) + + def test_symlink_max_length(self): + for max_content_length in [4, 10]: for filename, symlink in self.symlinks.items(): path = os.path.join(self.tmpdir_name, filename) - conv_content = Content.from_file(path=path, data=data) - self.assertContentEqual(conv_content, symlink, check_data=data) + content = Content.from_file(path=path) + if content.data['length'] > max_content_length: + with pytest.raises(Exception, match='too large'): + Content.from_file( + path=path, + max_content_length=max_content_length) + else: + limited_content = Content.from_file( + path=path, + max_content_length=max_content_length) + assert content == limited_content + def test_file_max_length(self): + for max_content_length in [2, 4]: for filename, content in self.contents.items(): path = os.path.join(self.tmpdir_name, filename) - conv_content = Content.from_file(path=path, data=data) - self.assertContentEqual(conv_content, content, check_data=data) + content = Content.from_file(path=path) + limited_content = Content.from_file( + path=path, + max_content_length=max_content_length) + assert content.data['length'] == limited_content.data['length'] + assert content.data['status'] == 'visible' + if content.data['length'] > max_content_length: + assert limited_content.data['status'] == 'absent' + assert limited_content.data['reason'] \ + == 'Content too large' + else: + assert limited_content.data['status'] == 'visible' + def test_special_file_max_length(self): + for max_content_length in [None, 0, 1]: for filename in self.specials: path = os.path.join(self.tmpdir_name, filename) - conv_content = Content.from_file(path=path, data=data) - self.assertContentEqual(conv_content, self.empty_content) + content = Content.from_file(path=path) + limited_content = Content.from_file( + path=path, + max_content_length=max_content_length) + assert limited_content == content def test_file_to_content_with_path(self): for filename, content in self.contents.items(): content_w_path = content.copy() path = os.path.join(self.tmpdir_name, filename) content_w_path['path'] = path - conv_content = Content.from_file(path=path, save_path=True) + conv_content = Content.from_file(path=path) self.assertContentEqual(conv_content, content_w_path, check_path=True) +@pytest.mark.fs class DirectoryToObjects(DataMixin, unittest.TestCase): def setUp(self): super().setUp() @@ -685,6 +848,18 @@ class DirectoryToObjects(DataMixin, unittest.TestCase): len(self.contents) + 1) + def test_directory_entry_order(self): + with tempfile.TemporaryDirectory() as dirname: + dirname = os.fsencode(dirname) + open(os.path.join(dirname, b'foo.'), 'a') + open(os.path.join(dirname, b'foo0'), 'a') + os.mkdir(os.path.join(dirname, b'foo')) + + directory = Directory.from_disk(path=dirname) + + assert [entry['name'] for entry in directory.entries] \ + == [b'foo.', b'foo', b'foo0'] + @pytest.mark.fs class TarballTest(DataMixin, unittest.TestCase): @@ -697,12 +872,12 @@ class TarballTest(DataMixin, unittest.TestCase): path=os.path.join(self.tmpdir_name, b'sample-folder') ) - for name, data in self.tarball_contents.items(): + for name, expected in self.tarball_contents.items(): obj = directory[name] if isinstance(obj, Content): - self.assertContentEqual(obj, data) + self.assertContentEqual(obj, expected) elif isinstance(obj, Directory): - self.assertDirectoryEqual(obj, data) + self.assertDirectoryEqual(obj, expected) else: raise self.failureException('Unknown type for %s' % obj) diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index bddf0bca9fcec83e7b6b8643aec00dfc62997109..d5f0f1d46ea103c875c7f3b61f587a53359e3d2d 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -113,7 +113,7 @@ class ContentIdentifier(unittest.TestCase): directory_example = { - 'id': 'c2e41aae41ac17bd4a650770d6ee77f62e52235b', + 'id': 'd7ed3d2c31d608823be58b1cbe57605310615231', 'entries': [ { 'type': 'file', @@ -198,7 +198,28 @@ directory_example = { 'perms': 57344, 'name': b'will_paginate', 'target': '3d531e169db92a16a9a8974f0ae6edf52e52659e' - } + }, + + # in git order, the dir named "order" should be between the files + # named "order." and "order0" + { + 'type': 'dir', + 'perms': 16384, + 'name': b'order', + 'target': '62cdb7020ff920e5aa642c3d4066950dd1f01f4d' + }, + { + 'type': 'file', + 'perms': 16384, + 'name': b'order.', + 'target': '0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33' + }, + { + 'type': 'file', + 'perms': 16384, + 'name': b'order0', + 'target': 'bbe960a25ea311d21d40669e93df2003ba9b90a2' + }, ], } @@ -217,6 +238,13 @@ class DirectoryIdentifier(unittest.TestCase): identifiers.directory_identifier(self.directory), self.directory['id']) + def test_dir_identifier_entry_order(self): + # Reverse order of entries, check the id is still the same. + directory = {'entries': reversed(self.directory['entries'])} + self.assertEqual( + identifiers.directory_identifier(directory), + self.directory['id']) + def test_dir_identifier_empty_directory(self): self.assertEqual( identifiers.directory_identifier(self.empty_directory), diff --git a/swh/model/tests/test_merkle.py b/swh/model/tests/test_merkle.py index 8b1180a4094c19005b19ea52d8879fac9ac405fb..734f7c036143163a24b7e9c9be3be9103d6070fa 100644 --- a/swh/model/tests/test_merkle.py +++ b/swh/model/tests/test_merkle.py @@ -46,6 +46,14 @@ class TestMerkleLeaf(unittest.TestCase): self.data = {'value': b'value'} self.instance = MerkleTestLeaf(self.data) + def test_equality(self): + leaf1 = MerkleTestLeaf(self.data) + leaf2 = MerkleTestLeaf(self.data) + leaf3 = MerkleTestLeaf({}) + + self.assertEqual(leaf1, leaf2) + self.assertNotEqual(leaf1, leaf3) + def test_hash(self): self.assertEqual(self.instance.compute_hash_called, 0) instance_hash = self.instance.hash @@ -114,6 +122,20 @@ class TestMerkleNode(unittest.TestCase): node2[j] = node3 self.nodes[value3] = node3 + def test_equality(self): + node1 = merkle.MerkleNode({'foo': b'bar'}) + node2 = merkle.MerkleNode({'foo': b'bar'}) + node3 = merkle.MerkleNode({}) + + self.assertEqual(node1, node2) + self.assertNotEqual(node1, node3, node1 == node3) + + node1['foo'] = node3 + self.assertNotEqual(node1, node2) + + node2['foo'] = node3 + self.assertEqual(node1, node2) + def test_hash(self): for node in self.nodes.values(): self.assertEqual(node.compute_hash_called, 0) @@ -162,6 +184,10 @@ class TestMerkleNode(unittest.TestCase): collected2 = self.root.collect() self.assertEqual(collected2, {}) + def test_iter_tree(self): + nodes = list(self.root.iter_tree()) + self.assertCountEqual(nodes, self.nodes.values()) + def test_get(self): for key in (b'a', b'b', b'c'): self.assertEqual(self.root[key], self.nodes[b'root/' + key]) diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index 5560127ed87e1f7e961c25e00e9d8c39db492d07..a97c3926b7c3d500b3201ff63fee6c312b755790 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -4,10 +4,16 @@ # See top-level LICENSE file for more information import copy +import datetime from hypothesis import given +import pytest -from swh.model.model import Content, Directory, Revision, Release, Snapshot +from swh.model.model import ( + Content, Directory, Revision, Release, Snapshot, + Timestamp, TimestampWithTimezone, + MissingData, +) from swh.model.hashutil import hash_to_bytes from swh.model.hypothesis_strategies import objects, origins, origin_visits from swh.model.identifiers import ( @@ -54,6 +60,53 @@ def test_todict_origin_visits(origin_visit): assert origin_visit == type(origin_visit).from_dict(obj) +def test_timestampwithtimezone_from_datetime(): + tz = datetime.timezone(datetime.timedelta(minutes=+60)) + date = datetime.datetime( + 2020, 2, 27, 14, 39, 19, tzinfo=tz) + + tstz = TimestampWithTimezone.from_datetime(date) + + assert tstz == TimestampWithTimezone( + timestamp=Timestamp( + seconds=1582810759, + microseconds=0, + ), + offset=60, + negative_utc=False, + ) + + +def test_timestampwithtimezone_from_iso8601(): + date = '2020-02-27 14:39:19.123456+0100' + + tstz = TimestampWithTimezone.from_iso8601(date) + + assert tstz == TimestampWithTimezone( + timestamp=Timestamp( + seconds=1582810759, + microseconds=123456, + ), + offset=60, + negative_utc=False, + ) + + +def test_timestampwithtimezone_from_iso8601_negative_utc(): + date = '2020-02-27 13:39:19-0000' + + tstz = TimestampWithTimezone.from_iso8601(date) + + assert tstz == TimestampWithTimezone( + timestamp=Timestamp( + seconds=1582810759, + microseconds=0, + ), + offset=0, + negative_utc=True, + ) + + def test_content_get_hash(): hashes = dict( sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') @@ -62,6 +115,28 @@ def test_content_get_hash(): assert c.get_hash(hash_name) == hash_ +def test_content_hashes(): + hashes = dict( + sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') + c = Content(length=42, status='visible', **hashes) + assert c.hashes() == hashes + + +def test_content_data(): + c = Content( + length=42, status='visible', data=b'foo', + sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') + assert c.with_data() == c + + +def test_content_data_missing(): + c = Content( + length=42, status='visible', + sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') + with pytest.raises(MissingData): + c.with_data() + + def test_directory_model_id_computation(): dir_dict = dict(directory_example) del dir_dict['id'] diff --git a/version.txt b/version.txt index fc9120ade4f5e100bf204c973fc37610cb316167..1638fac856587a23fc6b1556404dc62814899986 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.56-0-gfcfbd4d \ No newline at end of file +v0.0.57-0-gf7f18a3 \ No newline at end of file