diff --git a/PKG-INFO b/PKG-INFO index 8b07a8646a12e0deea6602ffbc3799cffea8eba5..b5797c6a895baa9c38fcb8b8f3c171553b5c2124 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,16 +1,14 @@ Metadata-Version: 2.1 Name: swh.model -Version: 6.2.0 +Version: 6.3.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-model Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-model/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) @@ -42,5 +40,3 @@ This module defines the notion of SoftWare Heritage persistent IDentifiers $ swh-identify --no-filename /usr/src/linux/kernel/ swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab ``` - - diff --git a/pytest.ini b/pytest.ini index c5186e582bb1a58fec0cb9bb6b7f30ee4210727f..10242f24f0323cac877764b3d4db78b11aa182f7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,3 +4,5 @@ norecursedirs = build docs .* markers = fs: tests that involve filesystem ios requires_optional_deps: tests in test_cli.py that should not run if optional dependencies are not installed + +asyncio_mode = strict diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 8b07a8646a12e0deea6602ffbc3799cffea8eba5..b5797c6a895baa9c38fcb8b8f3c171553b5c2124 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,16 +1,14 @@ Metadata-Version: 2.1 Name: swh.model -Version: 6.2.0 +Version: 6.3.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-model Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-model/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) @@ -42,5 +40,3 @@ This module defines the notion of SoftWare Heritage persistent IDentifiers $ swh-identify --no-filename /usr/src/linux/kernel/ swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab ``` - - diff --git a/swh/model/git_objects.py b/swh/model/git_objects.py index 566aaa364b1823cbce01163d0f6dae9d7d891410..41be6f2ea77ed6304e9435ed72b47a7281f8d9df 100644 --- a/swh/model/git_objects.py +++ b/swh/model/git_objects.py @@ -240,10 +240,7 @@ def format_git_object_from_headers( if message is not None: entries.extend((b"\n", message)) - concatenated_entries = b"".join(entries) - - header = git_object_header(git_type, len(concatenated_entries)) - return header + concatenated_entries + return format_git_object_from_parts(git_type, entries) def format_git_object_from_parts(git_type: str, parts: Iterable[bytes]) -> bytes: diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index 106e7c0c8e3a342d09cd431c37004b381d2d8926..75d9f8b40d6ea1a5379e96034c68b2f0b227d88b 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -56,7 +56,7 @@ import functools import hashlib from io import BytesIO import os -from typing import Callable, Dict, Optional +from typing import Callable, Dict, Optional, Union ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5"]) """Hashing algorithms supported by this module""" @@ -293,7 +293,7 @@ def hash_git_data(data, git_type, base_algo="sha1"): @functools.lru_cache() -def hash_to_hex(hash): +def hash_to_hex(hash: Union[str, bytes]) -> str: """Converts a hash (in hex or bytes form) to its hexadecimal ascii form Args: @@ -309,7 +309,7 @@ def hash_to_hex(hash): @functools.lru_cache() -def hash_to_bytehex(hash): +def hash_to_bytehex(hash: bytes) -> bytes: """Converts a hash to its hexadecimal bytes representation Args: @@ -322,7 +322,7 @@ def hash_to_bytehex(hash): @functools.lru_cache() -def hash_to_bytes(hash): +def hash_to_bytes(hash: Union[str, bytes]) -> bytes: """Converts a hash (in hex or bytes form) to its raw bytes form Args: @@ -338,7 +338,7 @@ def hash_to_bytes(hash): @functools.lru_cache() -def bytehex_to_hash(hex): +def bytehex_to_hash(hex: bytes) -> bytes: """Converts a hexadecimal bytes representation of a hash to that hash Args: diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py index dabecf962d03c0c7a00895ec215342beb1e31a2a..53c66f00bcfc7789672d4f0f2107b1d8675bcbc9 100644 --- a/swh/model/hypothesis_strategies.py +++ b/swh/model/hypothesis_strategies.py @@ -120,22 +120,22 @@ def persons_d(draw): return dict(fullname=fullname, name=name, email=email) -def persons(): - return persons_d().map(Person.from_dict) +def persons(**kwargs): + return persons_d(**kwargs).map(Person.from_dict) -def timestamps_d(): +def timestamps_d(**kwargs): max_seconds = datetime.datetime.max.replace( tzinfo=datetime.timezone.utc ).timestamp() min_seconds = datetime.datetime.min.replace( tzinfo=datetime.timezone.utc ).timestamp() - return builds( - dict, + defaults = dict( seconds=integers(min_seconds, max_seconds), microseconds=integers(0, 1000000), ) + return builds(dict, **{**defaults, **kwargs}) def timestamps(): @@ -145,6 +145,7 @@ def timestamps(): @composite def timestamps_with_timezone_d( draw, + *, timestamp=timestamps_d(), offset=integers(min_value=-14 * 60, max_value=14 * 60), negative_utc=booleans(), @@ -161,35 +162,34 @@ timestamps_with_timezone = timestamps_with_timezone_d().map( ) -def origins_d(): - return builds(dict, url=iris()) +def origins_d(*, url=iris()): + return builds(dict, url=url) -def origins(): - return origins_d().map(Origin.from_dict) +def origins(**kwargs): + return origins_d(**kwargs).map(Origin.from_dict) -def origin_visits_d(): - return builds( - dict, +def origin_visits_d(**kwargs): + defaults = dict( visit=integers(1, 1000), origin=iris(), date=aware_datetimes(), type=pgsql_text(), ) + return builds(dict, **{**defaults, **kwargs}) -def origin_visits(): - return origin_visits_d().map(OriginVisit.from_dict) +def origin_visits(**kwargs): + return origin_visits_d(**kwargs).map(OriginVisit.from_dict) def metadata_dicts(): return dictionaries(pgsql_text(), pgsql_text()) -def origin_visit_statuses_d(): - return builds( - dict, +def origin_visit_statuses_d(**kwargs): + defaults = dict( visit=integers(1, 1000), origin=iris(), type=optional(sampled_from(["git", "svn", "pypi", "debian"])), @@ -200,60 +200,48 @@ def origin_visit_statuses_d(): snapshot=optional(sha1_git()), metadata=optional(metadata_dicts()), ) + return builds(dict, **{**defaults, **kwargs}) -def origin_visit_statuses(): - return origin_visit_statuses_d().map(OriginVisitStatus.from_dict) +def origin_visit_statuses(**kwargs): + return origin_visit_statuses_d(**kwargs).map(OriginVisitStatus.from_dict) @composite -def releases_d(draw): - target_type = sampled_from([x.value for x in ObjectType]) - name = binary() - message = optional(binary()) - synthetic = booleans() - target = sha1_git() - metadata = optional(revision_metadata()) +def releases_d(draw, **kwargs): + defaults = dict( + target_type=sampled_from([x.value for x in ObjectType]), + name=binary(), + message=optional(binary()), + synthetic=booleans(), + target=sha1_git(), + metadata=optional(revision_metadata()), + raw_manifest=optional(binary()), + ) d = draw( one_of( # None author/date: - builds( - dict, - name=name, - message=message, - synthetic=synthetic, - author=none(), - date=none(), - target=target, - target_type=target_type, - metadata=metadata, - ), + builds(dict, author=none(), date=none(), **{**defaults, **kwargs}), # non-None author/date: builds( dict, - name=name, - message=message, - synthetic=synthetic, date=timestamps_with_timezone_d(), author=persons_d(), - target=target, - target_type=target_type, - metadata=metadata, + **{**defaults, **kwargs}, ), # it is also possible for date to be None but not author, but let's not # overwhelm hypothesis with this edge case ) ) - raw_manifest = draw(optional(binary())) - if raw_manifest: - d["raw_manifest"] = raw_manifest + if d["raw_manifest"] is None: + del d["raw_manifest"] return d -def releases(): - return releases_d().map(Release.from_dict) +def releases(**kwargs): + return releases_d(**kwargs).map(Release.from_dict) revision_metadata = metadata_dicts @@ -266,38 +254,36 @@ def extra_headers(): @composite -def revisions_d(draw): +def revisions_d(draw, **kwargs): + defaults = dict( + message=optional(binary()), + synthetic=booleans(), + parents=tuples(sha1_git()), + directory=sha1_git(), + type=sampled_from([x.value for x in RevisionType]), + metadata=optional(revision_metadata()), + extra_headers=extra_headers(), + raw_manifest=optional(binary()), + ) d = draw( one_of( # None author/committer/date/committer_date builds( dict, - message=optional(binary()), - synthetic=booleans(), author=none(), committer=none(), date=none(), committer_date=none(), - parents=tuples(sha1_git()), - directory=sha1_git(), - type=sampled_from([x.value for x in RevisionType]), - metadata=optional(revision_metadata()), - extra_headers=extra_headers(), + **{**defaults, **kwargs}, ), # non-None author/committer/date/committer_date builds( dict, - message=optional(binary()), - synthetic=booleans(), author=persons_d(), committer=persons_d(), date=timestamps_with_timezone_d(), committer_date=timestamps_with_timezone_d(), - parents=tuples(sha1_git()), - directory=sha1_git(), - type=sampled_from([x.value for x in RevisionType]), - metadata=optional(revision_metadata()), - extra_headers=extra_headers(), + **{**defaults, **kwargs}, ), # There are many other combinations, but let's not overwhelm hypothesis # with these edge cases @@ -305,67 +291,67 @@ def revisions_d(draw): ) # TODO: metadata['extra_headers'] can have binary keys and values - raw_manifest = draw(optional(binary())) - if raw_manifest: - d["raw_manifest"] = raw_manifest + if d["raw_manifest"] is None: + del d["raw_manifest"] return d -def revisions(): - return revisions_d().map(Revision.from_dict) +def revisions(**kwargs): + return revisions_d(**kwargs).map(Revision.from_dict) -def directory_entries_d(): +def directory_entries_d(**kwargs): + defaults = dict( + name=binaries_without_bytes(b"/"), + target=sha1_git(), + ) return one_of( builds( dict, - name=binaries_without_bytes(b"/"), - target=sha1_git(), type=just("file"), perms=one_of( integers(min_value=0o100000, max_value=0o100777), # regular file integers(min_value=0o120000, max_value=0o120777), # symlink ), + **{**defaults, **kwargs}, ), builds( dict, - name=binaries_without_bytes(b"/"), - target=sha1_git(), type=just("dir"), perms=integers( min_value=DentryPerms.directory, max_value=DentryPerms.directory + 0o777, ), + **{**defaults, **kwargs}, ), builds( dict, - name=binaries_without_bytes(b"/"), - target=sha1_git(), type=just("rev"), perms=integers( min_value=DentryPerms.revision, max_value=DentryPerms.revision + 0o777, ), + **{**defaults, **kwargs}, ), ) -def directory_entries(): - return directory_entries_d().map(DirectoryEntry) +def directory_entries(**kwargs): + return directory_entries_d(**kwargs).map(DirectoryEntry) @composite -def directories_d(draw): +def directories_d(draw, raw_manifest=optional(binary())): d = draw(builds(dict, entries=tuples(directory_entries_d()))) - raw_manifest = draw(optional(binary())) - if raw_manifest: - d["raw_manifest"] = raw_manifest + d["raw_manifest"] = draw(raw_manifest) + if d["raw_manifest"] is None: + del d["raw_manifest"] return d -def directories(): - return directories_d().map(Directory.from_dict) +def directories(**kwargs): + return directories_d(**kwargs).map(Directory.from_dict) def contents_d(): @@ -376,21 +362,23 @@ def contents(): return one_of(present_contents(), skipped_contents()) -def present_contents_d(): - return builds( - dict, +def present_contents_d(**kwargs): + defaults = dict( data=binary(max_size=4096), ctime=optional(aware_datetimes()), status=one_of(just("visible"), just("hidden")), ) + return builds(dict, **{**defaults, **kwargs}) -def present_contents(): +def present_contents(**kwargs): return present_contents_d().map(lambda d: Content.from_data(**d)) @composite -def skipped_contents_d(draw): +def skipped_contents_d( + draw, reason=pgsql_text(), status=just("absent"), ctime=optional(aware_datetimes()) +): result = BaseContent._hash_data(draw(binary(max_size=4096))) result.pop("data") nullify_attrs = draw( @@ -398,13 +386,13 @@ def skipped_contents_d(draw): ) for k in nullify_attrs: result[k] = None - result["reason"] = draw(pgsql_text()) - result["status"] = "absent" - result["ctime"] = draw(optional(aware_datetimes())) + result["reason"] = draw(reason) + result["status"] = draw(status) + result["ctime"] = draw(ctime) return result -def skipped_contents(): +def skipped_contents(**kwargs): return skipped_contents_d().map(SkippedContent.from_dict) @@ -492,35 +480,38 @@ def snapshots(*, min_size=0, max_size=100, only_objects=False): ).map(Snapshot.from_dict) -def metadata_authorities(): - return builds(MetadataAuthority, url=iris(), metadata=just(None)) +def metadata_authorities(url=iris()): + return builds(MetadataAuthority, url=url, metadata=just(None)) -def metadata_fetchers(): - return builds( - MetadataFetcher, +def metadata_fetchers(**kwargs): + defaults = dict( name=text(min_size=1, alphabet=string.printable), version=text( min_size=1, alphabet=string.ascii_letters + string.digits + string.punctuation, ), + ) + return builds( + MetadataFetcher, metadata=just(None), + **{**defaults, **kwargs}, ) -def raw_extrinsic_metadata(): - return builds( - RawExtrinsicMetadata, +def raw_extrinsic_metadata(**kwargs): + defaults = dict( target=extended_swhids(), discovery_date=aware_datetimes(), authority=metadata_authorities(), fetcher=metadata_fetchers(), format=text(min_size=1, alphabet=string.printable), ) + return builds(RawExtrinsicMetadata, **{**defaults, **kwargs}) -def raw_extrinsic_metadata_d(): - return raw_extrinsic_metadata().map(RawExtrinsicMetadata.to_dict) +def raw_extrinsic_metadata_d(**kwargs): + return raw_extrinsic_metadata(**kwargs).map(RawExtrinsicMetadata.to_dict) def objects(blacklist_types=("origin_visit_status",), split_content=False): diff --git a/swh/model/model.py b/swh/model/model.py index 508d41cb46f489671c37cd338c2b9afcb442350f..1073cc61cf4623ba65428b164466b82d4ad35b6c 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -16,10 +16,11 @@ method to convert between them and msgpack-serializable objects. """ from abc import ABCMeta, abstractmethod +import collections import datetime from enum import Enum import hashlib -from typing import Any, Dict, Iterable, Optional, Tuple, Type, TypeVar, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union import attr from attrs_strict import AttributeTypeError @@ -29,7 +30,7 @@ from typing_extensions import Final from . import git_objects from .collections import ImmutableDict -from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_hex +from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex, hash_to_hex from .swhids import CoreSWHID from .swhids import ExtendedObjectType as SwhidExtendedObjectType from .swhids import ExtendedSWHID @@ -266,7 +267,7 @@ class HashableObjectWithManifest(HashableObject): attribute is set to an empty value. """ if self.raw_manifest is None: - return super().compute_hash() + return super().compute_hash() # calls self._compute_hash_from_attributes() else: return _compute_hash_from_manifest(self.raw_manifest) @@ -943,12 +944,15 @@ class Revision(HashableObjectWithManifest, BaseModel): ) +_DIR_ENTRY_TYPES = ["file", "dir", "rev"] + + @attr.s(frozen=True, slots=True) class DirectoryEntry(BaseModel): object_type: Final = "directory_entry" name = attr.ib(type=bytes, validator=type_validator()) - type = attr.ib(type=str, validator=attr.validators.in_(["file", "dir", "rev"])) + type = attr.ib(type=str, validator=attr.validators.in_(_DIR_ENTRY_TYPES)) target = attr.ib(type=Sha1Git, validator=type_validator(), repr=hash_repr) perms = attr.ib(type=int, validator=type_validator(), converter=int, repr=oct) """Usually one of the values of `swh.model.from_disk.DentryPerms`.""" @@ -996,6 +1000,87 @@ class Directory(HashableObjectWithManifest, BaseModel): """Returns a SWHID representing this object.""" return CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=self.id) + @classmethod + def from_possibly_duplicated_entries( + cls, + *, + entries: Tuple[DirectoryEntry, ...], + id: Sha1Git = b"", + raw_manifest: Optional[bytes] = None, + ) -> Tuple[bool, "Directory"]: + """Constructs a ``Directory`` object from a list of entries that may contain + duplicated names. + + This is required to represent legacy objects, that were ingested in the + storage database before this check was added. + + As it is impossible for a ``Directory`` instances to have more than one entry + with a given names, this function computes a ``raw_manifest`` and renames one of + the entries before constructing the ``Directory``. + + Returns: + ``(is_corrupt, directory)`` where ``is_corrupt`` is True iff some + entry names were indeed duplicated + """ + # First, try building a Directory object normally without any extra computation, + # which works the overwhelming majority of the time: + try: + return (False, Directory(entries=entries, id=id, raw_manifest=raw_manifest)) + except ValueError: + pass + + # If it fails: + # 1. compute a raw_manifest if there isn't already one: + if raw_manifest is None: + # invalid_directory behaves like a Directory object, but without the + # duplicated entry check; which allows computing its raw_manifest + invalid_directory = type("", (), {})() + invalid_directory.entries = entries + raw_manifest = git_objects.directory_git_object(invalid_directory) + + # 2. look for duplicated entries: + entries_by_name: Dict[ + bytes, Dict[str, List[DirectoryEntry]] + ] = collections.defaultdict(lambda: collections.defaultdict(list)) + for entry in entries: + entries_by_name[entry.name][entry.type].append(entry) + + # 3. strip duplicates + deduplicated_entries = [] + for entry_lists in entries_by_name.values(): + # We could pick one entry at random to keep the original name; but we try to + # "minimize" the impact, by preserving entries of type "rev" first + # (because renaming them would likely break git submodules entirely + # when this directory is written to disk), + # then entries of type "dir" (because renaming them affects the path + # of every file in the dir, instead of just one "cnt"). + dir_entry_types = ("rev", "dir", "file") + assert set(dir_entry_types) == set(_DIR_ENTRY_TYPES) + picked_winner = False # when True, all future entries must be renamed + for type_ in dir_entry_types: + for entry in entry_lists[type_]: + if not picked_winner: + # this is the "most important" entry according to this + # heuristic; it gets to keep its name. + deduplicated_entries.append(entry) + picked_winner = True + else: + # the heuristic already found an entry more important than + # this one; so this one must be renamed to something. + # we pick the beginning of its hash, it should be good enough + # to avoid any conflict. + new_name = ( + entry.name + b"_" + hash_to_bytehex(entry.target)[0:10] + ) + renamed_entry = attr.evolve(entry, name=new_name) + deduplicated_entries.append(renamed_entry) + + # Finally, return the "fixed" the directory + dir_ = Directory( + entries=tuple(deduplicated_entries), id=id, raw_manifest=raw_manifest + ) + return (True, dir_) + @attr.s(frozen=True, slots=True) class BaseContent(BaseModel): diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index 590e4b4a2e59ec0c51f3766e10d64aabbdaff943..4540c433db2b00630a8a0366e440a5892c74600d 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -13,7 +13,7 @@ import attr from attrs_strict import AttributeTypeError import dateutil from hypothesis import given -from hypothesis.strategies import binary +from hypothesis.strategies import binary, none import pytest from swh.model.collections import ImmutableDict @@ -841,7 +841,7 @@ def test_content_naive_datetime(): ) -@given(strategies.present_contents().filter(lambda cnt: cnt.data is not None)) +@given(strategies.present_contents()) def test_content_git_roundtrip(content): assert content.data is not None raw = swh.model.git_objects.content_git_object(content) @@ -886,7 +886,7 @@ def test_skipped_content_naive_datetime(): # Directory -@given(strategies.directories().filter(lambda d: d.raw_manifest is None)) +@given(strategies.directories(raw_manifest=none())) def test_directory_check(directory): directory.check() @@ -903,7 +903,7 @@ def test_directory_check(directory): directory2.check() -@given(strategies.directories().filter(lambda d: d.raw_manifest is None)) +@given(strategies.directories(raw_manifest=none())) def test_directory_raw_manifest(directory): assert "raw_manifest" not in directory.to_dict() @@ -943,10 +943,147 @@ def test_directory_duplicate_entry_name(): Directory(entries=entries) +@given(strategies.directories()) +def test_directory_from_possibly_duplicated_entries__no_duplicates(directory): + """ + Directory.from_possibly_duplicated_entries should return the directory + unchanged if it has no duplicated entry name. + """ + assert (False, directory) == Directory.from_possibly_duplicated_entries( + id=directory.id, entries=directory.entries, raw_manifest=directory.raw_manifest + ) + assert (False, directory) == Directory.from_possibly_duplicated_entries( + entries=directory.entries, raw_manifest=directory.raw_manifest + ) + + +@pytest.mark.parametrize("rev_first", [True, False]) +def test_directory_from_possibly_duplicated_entries__rev_and_dir(rev_first): + entries = ( + DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1), + DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0), + ) + if rev_first: + entries = tuple(reversed(entries)) + (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries) + assert is_corrupt + assert dir_.entries == ( + DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0), + DirectoryEntry( + name=b"foo_0101010101", type="dir", target=b"\x01" * 20, perms=1 + ), + ) + + # order is independent of 'rev_first' because it is always sorted in git order + assert dir_.raw_manifest == ( + # fmt: off + b"tree 52\x00" + + b"0 foo\x00" + b"\x00" * 20 + + b"1 foo\x00" + b"\x01" * 20 + # fmt: on + ) + + +@pytest.mark.parametrize("file_first", [True, False]) +def test_directory_from_possibly_duplicated_entries__file_and_dir(file_first): + entries = ( + DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1), + DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0), + ) + if file_first: + entries = tuple(reversed(entries)) + (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries) + assert is_corrupt + assert dir_.entries == ( + DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1), + DirectoryEntry( + name=b"foo_0000000000", type="file", target=b"\x00" * 20, perms=0 + ), + ) + + # order is independent of 'file_first' because it is always sorted in git order + assert dir_.raw_manifest == ( + # fmt: off + b"tree 52\x00" + + b"0 foo\x00" + b"\x00" * 20 + + b"1 foo\x00" + b"\x01" * 20 + # fmt: on + ) + + +def test_directory_from_possibly_duplicated_entries__two_files1(): + entries = ( + DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1), + DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0), + ) + (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries) + assert is_corrupt + + assert dir_.entries == ( + DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1), + DirectoryEntry( + name=b"foo_0000000000", type="file", target=b"\x00" * 20, perms=0 + ), + ) + assert dir_.raw_manifest == ( + # fmt: off + b"tree 52\x00" + + b"1 foo\x00" + b"\x01" * 20 + + b"0 foo\x00" + b"\x00" * 20 + # fmt: on + ) + + +def test_directory_from_possibly_duplicated_entries__two_files2(): + """ + Same as above, but entries are in a different order (and order matters + to break the tie) + """ + entries = ( + DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0), + DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1), + ) + (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries) + assert is_corrupt + + assert dir_.entries == ( + DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0), + DirectoryEntry( + name=b"foo_0101010101", type="file", target=b"\x01" * 20, perms=1 + ), + ) + assert dir_.raw_manifest == ( + # fmt: off + b"tree 52\x00" + + b"0 foo\x00" + b"\x00" * 20 + + b"1 foo\x00" + b"\x01" * 20 + # fmt: on + ) + + +def test_directory_from_possibly_duplicated_entries__preserve_manifest(): + entries = ( + DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1), + DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0), + ) + (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries( + entries=entries, raw_manifest=b"blah" + ) + assert is_corrupt + assert dir_.entries == ( + DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0), + DirectoryEntry( + name=b"foo_0101010101", type="dir", target=b"\x01" * 20, perms=1 + ), + ) + + assert dir_.raw_manifest == b"blah" + + # Release -@given(strategies.releases().filter(lambda rel: rel.raw_manifest is None)) +@given(strategies.releases(raw_manifest=none())) def test_release_check(release): release.check() @@ -963,7 +1100,7 @@ def test_release_check(release): release2.check() -@given(strategies.releases().filter(lambda rev: rev.raw_manifest is None)) +@given(strategies.releases(raw_manifest=none())) def test_release_raw_manifest(release): raw_manifest = b"foo" id_ = hashlib.new("sha1", raw_manifest).digest() @@ -983,7 +1120,7 @@ def test_release_raw_manifest(release): # Revision -@given(strategies.revisions().filter(lambda rev: rev.raw_manifest is None)) +@given(strategies.revisions(raw_manifest=none())) def test_revision_check(revision): revision.check() @@ -1000,7 +1137,7 @@ def test_revision_check(revision): revision2.check() -@given(strategies.revisions().filter(lambda rev: rev.raw_manifest is None)) +@given(strategies.revisions(raw_manifest=none())) def test_revision_raw_manifest(revision): raw_manifest = b"foo"