diff --git a/PKG-INFO b/PKG-INFO index 6357512042a6ec0f50f1399f8afa75d7010f9849..34a102a59d671d9c74154523bcdf137fb2fa1727 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 3.2.0 +Version: 4.0.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/docs/data-model.rst b/docs/data-model.rst index 2e311619f52d1d50e9bc0fbdeec89e637db8442d..ff81babb30482db3d987dcccda6b2c8251bbf780 100644 --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -74,8 +74,7 @@ synonyms. **directories** a list of named directory entries, each of which pointing to other artifacts, usually file contents or sub-directories. Directory entries are also - associated to arbitrary metadata, which vary with technologies, but usually - includes permission bits, modification timestamps, etc. + associated to some metadata stored as permission bits. **revisions** (AKA "commits") software development within a specific project is essentially a time-indexed @@ -92,8 +91,8 @@ synonyms. some revisions are more equals than others and get selected by developers as denoting important project milestones known as "releases". Each release points to the last commit in project history corresponding to the release and - might carry arbitrary metadata—e.g., release name and version, release - message, cryptographic signatures, etc. + carries metadata: release name and version, release message, cryptographic + signatures, etc. Additionally, the following crawling-related information are stored as @@ -260,3 +259,32 @@ making emergent structures such as code reuse across different projects or software origins, readily available. Further reinforcing the Software Heritage use cases, this object could become a veritable "map of the stars" of our entire software commons. 
+ + +Extended data model +------------------- + +In addition to the artifacts detailed above used to represent original software +artifacts, the Software Heritage archive stores information about these +artifacts. + +**extid** + a relationship between an original identifier of an artifact, in its + native/upstream environment, and a `core SWHID <persistent-identifiers>`, + which is specific to Software Heritage. As such, it is a triple made of: + + * the external identifier, stored as bytes whose format is opaque to the + data model + * a type (a simple name and a version), to identify the type of relationship + * the "target", which is a core SWHID + +**raw extrinsic metadata** + an opaque bytestring, along with its format (a simple name), an identifier + of the object the metadata is about and in which context (similar to a + `qualified SWHID <persistent-identifiers>`), and provenance information + (the authority who provided it, the fetcher tool used to get it, and the + date it was discovered at). + + It provides both a way to store information about an artifact contributed by + external entities, after the artifact was created, and an escape hatch to + store metadata that would not otherwise fit in the data model. 
diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 6357512042a6ec0f50f1399f8afa75d7010f9849..34a102a59d671d9c74154523bcdf137fb2fa1727 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 3.2.0 +Version: 4.0.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh/__init__.py b/swh/__init__.py index 8d9f1510154ada0018aa0293468d2666ed4f3a8b..b36383a61027f0875a3cb103edc8f2a4528a3289 100644 --- a/swh/__init__.py +++ b/swh/__init__.py @@ -1,4 +1,3 @@ from pkgutil import extend_path -from typing import List -__path__: List[str] = extend_path(__path__, __name__) +__path__ = extend_path(__path__, __name__) diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py index 513dc93989a16f48c9e5d13f6fcac7c7d0a89f3e..f44e5de5115a20c502663cb9ebf29f2249dee994 100644 --- a/swh/model/hypothesis_strategies.py +++ b/swh/model/hypothesis_strategies.py @@ -215,7 +215,7 @@ def releases_d(draw): target = sha1_git() metadata = optional(revision_metadata()) - return draw( + d = draw( one_of( builds( dict, @@ -242,6 +242,11 @@ def releases_d(draw): ) ) + raw_manifest = draw(optional(binary())) + if raw_manifest: + d["raw_manifest"] = raw_manifest + return d + def releases(): return releases_d().map(Release.from_dict) @@ -256,35 +261,67 @@ def extra_headers(): ).map(tuple) -def revisions_d(): - return builds( - dict, - message=optional(binary()), - synthetic=booleans(), - author=persons_d(), - committer=persons_d(), - date=timestamps_with_timezone_d(), - committer_date=timestamps_with_timezone_d(), - parents=tuples(sha1_git()), - directory=sha1_git(), - type=sampled_from([x.value for x in RevisionType]), - metadata=optional(revision_metadata()), - extra_headers=extra_headers(), +@composite +def revisions_d(draw): + d = draw( + builds( + dict, + 
message=optional(binary()), + synthetic=booleans(), + author=persons_d(), + committer=persons_d(), + date=timestamps_with_timezone_d(), + committer_date=timestamps_with_timezone_d(), + parents=tuples(sha1_git()), + directory=sha1_git(), + type=sampled_from([x.value for x in RevisionType]), + metadata=optional(revision_metadata()), + extra_headers=extra_headers(), + ) ) # TODO: metadata['extra_headers'] can have binary keys and values + raw_manifest = draw(optional(binary())) + if raw_manifest: + d["raw_manifest"] = raw_manifest + return d + def revisions(): return revisions_d().map(Revision.from_dict) def directory_entries_d(): - return builds( - dict, - name=binaries_without_bytes(b"/"), - target=sha1_git(), - type=sampled_from(["file", "dir", "rev"]), - perms=sampled_from([perm.value for perm in DentryPerms]), + return one_of( + builds( + dict, + name=binaries_without_bytes(b"/"), + target=sha1_git(), + type=just("file"), + perms=one_of( + integers(min_value=0o100000, max_value=0o100777), # regular file + integers(min_value=0o120000, max_value=0o120777), # symlink + ), + ), + builds( + dict, + name=binaries_without_bytes(b"/"), + target=sha1_git(), + type=just("dir"), + perms=integers( + min_value=DentryPerms.directory, + max_value=DentryPerms.directory + 0o777, + ), + ), + builds( + dict, + name=binaries_without_bytes(b"/"), + target=sha1_git(), + type=just("rev"), + perms=integers( + min_value=DentryPerms.revision, max_value=DentryPerms.revision + 0o777, + ), + ), ) @@ -292,8 +329,14 @@ def directory_entries(): return directory_entries_d().map(DirectoryEntry) -def directories_d(): - return builds(dict, entries=tuples(directory_entries_d())) +@composite +def directories_d(draw): + d = draw(builds(dict, entries=tuples(directory_entries_d()))) + + raw_manifest = draw(optional(binary())) + if raw_manifest: + d["raw_manifest"] = raw_manifest + return d def directories(): diff --git a/swh/model/model.py b/swh/model/model.py index 
735ce46866e496f51bca7a79c97da67b48bb7c4e..7d7cad8e69f45ba2a794b45fa3e3fa237cd38f11 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -49,6 +49,8 @@ KeyType = Union[Dict[str, str], Dict[str, bytes], bytes] SHA1_SIZE = 20 +_OFFSET_CHARS = frozenset(b"+-0123456789") + # TODO: Limit this to 20 bytes Sha1Git = bytes Sha1 = bytes @@ -94,6 +96,9 @@ def _check_type(type_, value): if type_ is object or type_ is Any: return True + if type_ is None: + return value is None + origin = getattr(type_, "__origin__", None) # Non-generic type, check it directly @@ -189,6 +194,14 @@ class BaseModel: deduplication.""" raise NotImplementedError(f"unique_key for {self}") + def check(self) -> None: + """Performs internal consistency checks, and raises an error if one fails.""" + attr.validate(self) + + +def _compute_hash_from_manifest(manifest: bytes) -> Sha1Git: + return hashlib.new("sha1", manifest).digest() + class HashableObject(metaclass=ABCMeta): """Mixin to automatically compute object identifier hash when @@ -198,7 +211,6 @@ class HashableObject(metaclass=ABCMeta): id: Sha1Git - @abstractmethod def compute_hash(self) -> bytes: """Derived model classes must implement this to compute the object hash. @@ -206,7 +218,11 @@ class HashableObject(metaclass=ABCMeta): This method is called by the object initialization if the `id` attribute is set to an empty value. 
""" - pass + return self._compute_hash_from_attributes() + + @abstractmethod + def _compute_hash_from_attributes(self) -> Sha1Git: + raise NotImplementedError(f"_compute_hash_from_attributes for {self}") def __attrs_post_init__(self): if not self.id: @@ -216,6 +232,53 @@ class HashableObject(metaclass=ABCMeta): def unique_key(self) -> KeyType: return self.id + def check(self) -> None: + super().check() # type: ignore + + if self.id != self.compute_hash(): + raise ValueError("'id' does not match recomputed hash.") + + +class HashableObjectWithManifest(HashableObject): + """Derived class of HashableObject, for objects that may need to store + verbatim git objects as ``raw_manifest`` to preserve original hashes.""" + + raw_manifest: Optional[bytes] = None + """Stores the original content of git objects when they cannot be faithfully + represented using only the other attributes. + + This should only be used as a last resort, and only set in the Git loader, + for objects too corrupt to fit the data model.""" + + def to_dict(self): + d = super().to_dict() + if d["raw_manifest"] is None: + del d["raw_manifest"] + return d + + def compute_hash(self) -> bytes: + """Derived model classes must implement this to compute + the object hash. + + This method is called by the object initialization if the `id` + attribute is set to an empty value. + """ + if self.raw_manifest is None: + return super().compute_hash() + else: + return _compute_hash_from_manifest(self.raw_manifest) + + def check(self) -> None: + super().check() + + if ( + self.raw_manifest is not None + and self.id == self._compute_hash_from_attributes() + ): + raise ValueError( + f"{self} has a non-none raw_manifest attribute, but does not need it." 
+ ) + @attr.s(frozen=True, slots=True) class Person(BaseModel): @@ -325,6 +388,15 @@ class TimestampWithTimezone(BaseModel): offset = attr.ib(type=int, validator=type_validator()) negative_utc = attr.ib(type=bool, validator=type_validator()) + offset_bytes = attr.ib(type=bytes, validator=type_validator()) + """Raw git representation of the timezone, as an offset from UTC. + It should follow this format: ``+HHMM`` or ``-HHMM`` (including ``+0000`` and + ``-0000``). + + However, when created from git objects, it must be the exact bytes used in the + original objects, so it may differ from this format when they do. + """ + @offset.validator def check_offset(self, attribute, value): """Checks the offset is a 16-bits signed integer (in theory, it @@ -334,11 +406,47 @@ class TimestampWithTimezone(BaseModel): # you'll find in the wild... raise ValueError("offset too large: %d minutes" % value) + self._check_offsets_match() + @negative_utc.validator def check_negative_utc(self, attribute, value): if self.offset and value: raise ValueError("negative_utc can only be True is offset=0") + self._check_offsets_match() + + @offset_bytes.default + def _default_offset_bytes(self): + negative = self.offset < 0 or self.negative_utc + (hours, minutes) = divmod(abs(self.offset), 60) + return f"{'-' if negative else '+'}{hours:02}{minutes:02}".encode() + + @offset_bytes.validator + def check_offset_bytes(self, attribute, value): + if not set(value) <= _OFFSET_CHARS: + raise ValueError(f"invalid characters in offset_bytes: {value!r}") + + self._check_offsets_match() + + def _check_offsets_match(self): + offset_str = self.offset_bytes.decode() + assert offset_str[0] in "+-" + sign = int(offset_str[0] + "1") + hours = int(offset_str[1:-2]) + minutes = int(offset_str[-2:]) + offset = sign * (hours * 60 + minutes) + if offset != self.offset: + raise ValueError( + f"offset_bytes ({self.offset_bytes!r}) does not match offset " + f"{divmod(self.offset, 60)}" + ) + + if offset == 0 and 
self.negative_utc != self.offset_bytes.startswith(b"-"): + raise ValueError( + f"offset_bytes ({self.offset_bytes!r}) does not match negative_utc " + f"({self.negative_utc})" + ) + @classmethod def from_dict(cls, time_representation: Union[Dict, datetime.datetime, int]): """Builds a TimestampWithTimezone from any of the formats @@ -422,7 +530,8 @@ class TimestampWithTimezone(BaseModel): dt = iso8601.parse_date(s) tstz = cls.from_datetime(dt) if dt.tzname() == "-00:00": - tstz = attr.evolve(tstz, negative_utc=True) + assert tstz.offset_bytes == b"+0000" + tstz = attr.evolve(tstz, negative_utc=True, offset_bytes=b"-0000") return tstz @@ -439,8 +548,8 @@ class Origin(HashableObject, BaseModel): def unique_key(self) -> KeyType: return {"url": self.url} - def compute_hash(self) -> bytes: - return hashlib.sha1(self.url.encode("utf-8")).digest() + def _compute_hash_from_attributes(self) -> bytes: + return _compute_hash_from_manifest(self.url.encode("utf-8")) def swhid(self) -> ExtendedSWHID: """Returns a SWHID representing this origin.""" @@ -583,9 +692,8 @@ class Snapshot(HashableObject, BaseModel): ) id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"", repr=hash_repr) - def compute_hash(self) -> bytes: - git_object = git_objects.snapshot_git_object(self) - return hashlib.new("sha1", git_object).digest() + def _compute_hash_from_attributes(self) -> bytes: + return _compute_hash_from_manifest(git_objects.snapshot_git_object(self)) @classmethod def from_dict(cls, d): @@ -604,7 +712,7 @@ class Snapshot(HashableObject, BaseModel): @attr.s(frozen=True, slots=True) -class Release(HashableObject, BaseModel): +class Release(HashableObjectWithManifest, BaseModel): object_type: Final = "release" name = attr.ib(type=bytes, validator=type_validator()) @@ -623,10 +731,10 @@ class Release(HashableObject, BaseModel): default=None, ) id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"", repr=hash_repr) + raw_manifest = attr.ib(type=Optional[bytes], 
default=None) - def compute_hash(self) -> bytes: - git_object = git_objects.release_git_object(self) - return hashlib.new("sha1", git_object).digest() + def _compute_hash_from_attributes(self) -> bytes: + return _compute_hash_from_manifest(git_objects.release_git_object(self)) @author.validator def check_author(self, attribute, value): @@ -680,7 +788,7 @@ def tuplify_extra_headers(value: Iterable): @attr.s(frozen=True, slots=True) -class Revision(HashableObject, BaseModel): +class Revision(HashableObjectWithManifest, BaseModel): object_type: Final = "revision" message = attr.ib(type=Optional[bytes], validator=type_validator()) @@ -707,6 +815,7 @@ class Revision(HashableObject, BaseModel): converter=tuplify_extra_headers, default=(), ) + raw_manifest = attr.ib(type=Optional[bytes], default=None) def __attrs_post_init__(self): super().__attrs_post_init__() @@ -722,9 +831,8 @@ class Revision(HashableObject, BaseModel): attr.validate(self) object.__setattr__(self, "metadata", metadata) - def compute_hash(self) -> bytes: - git_object = git_objects.revision_git_object(self) - return hashlib.new("sha1", git_object).digest() + def _compute_hash_from_attributes(self) -> bytes: + return _compute_hash_from_manifest(git_objects.revision_git_object(self)) @classmethod def from_dict(cls, d): @@ -775,19 +883,19 @@ class DirectoryEntry(BaseModel): @name.validator def check_name(self, attribute, value): if b"/" in value: - raise ValueError("{value!r} is not a valid directory entry name.") + raise ValueError(f"{value!r} is not a valid directory entry name.") @attr.s(frozen=True, slots=True) -class Directory(HashableObject, BaseModel): +class Directory(HashableObjectWithManifest, BaseModel): object_type: Final = "directory" entries = attr.ib(type=Tuple[DirectoryEntry, ...], validator=type_validator()) id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"", repr=hash_repr) + raw_manifest = attr.ib(type=Optional[bytes], default=None) - def compute_hash(self) -> bytes: - 
git_object = git_objects.directory_git_object(self) - return hashlib.new("sha1", git_object).digest() + def _compute_hash_from_attributes(self) -> bytes: + return _compute_hash_from_manifest(git_objects.directory_git_object(self)) @entries.validator def check_entries(self, attribute, value): @@ -1132,9 +1240,10 @@ class RawExtrinsicMetadata(HashableObject, BaseModel): id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"", repr=hash_repr) - def compute_hash(self) -> bytes: - git_object = git_objects.raw_extrinsic_metadata_git_object(self) - return hashlib.new("sha1", git_object).digest() + def _compute_hash_from_attributes(self) -> bytes: + return _compute_hash_from_manifest( + git_objects.raw_extrinsic_metadata_git_object(self) + ) @origin.validator def check_origin(self, attribute, value): @@ -1333,6 +1442,5 @@ class ExtID(HashableObject, BaseModel): extid_version=d.get("extid_version", 0), ) - def compute_hash(self) -> bytes: - git_object = git_objects.extid_git_object(self) - return hashlib.new("sha1", git_object).digest() + def _compute_hash_from_attributes(self) -> bytes: + return _compute_hash_from_manifest(git_objects.extid_git_object(self)) diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index 188c584ae97dd9452d6bbef2b703574022356012..2501e342806c4832a522a0628f8fe684046ca86d 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -1046,6 +1046,7 @@ TS_DICTS = [ "timestamp": {"seconds": 12345, "microseconds": 0}, "offset": 0, "negative_utc": False, + "offset_bytes": b"+0000", }, ), ( @@ -1054,6 +1055,7 @@ TS_DICTS = [ "timestamp": {"seconds": 12345, "microseconds": 0}, "offset": 0, "negative_utc": False, + "offset_bytes": b"+0000", }, ), ( @@ -1062,6 +1064,7 @@ TS_DICTS = [ "timestamp": {"seconds": 12345, "microseconds": 0}, "offset": 0, "negative_utc": False, + "offset_bytes": b"+0000", }, ), ( @@ -1070,6 +1073,7 @@ TS_DICTS = [ "timestamp": {"seconds": 12345, 
"microseconds": 0}, "offset": 0, "negative_utc": False, + "offset_bytes": b"+0000", }, ), ( @@ -1078,6 +1082,7 @@ TS_DICTS = [ "timestamp": {"seconds": 12345, "microseconds": 0}, "offset": 0, "negative_utc": False, + "offset_bytes": b"+0000", }, ), ( @@ -1090,6 +1095,7 @@ TS_DICTS = [ "timestamp": {"seconds": 12345, "microseconds": 0}, "offset": 0, "negative_utc": False, + "offset_bytes": b"+0000", }, ), ( @@ -1102,6 +1108,7 @@ TS_DICTS = [ "timestamp": {"seconds": 12345, "microseconds": 100}, "offset": 0, "negative_utc": False, + "offset_bytes": b"+0000", }, ), ( @@ -1110,6 +1117,7 @@ TS_DICTS = [ "timestamp": {"seconds": 12345, "microseconds": 0}, "offset": 0, "negative_utc": True, + "offset_bytes": b"-0000", }, ), ( @@ -1118,6 +1126,7 @@ TS_DICTS = [ "timestamp": {"seconds": 12345, "microseconds": 0}, "offset": 0, "negative_utc": False, + "offset_bytes": b"+0000", }, ), ] @@ -1153,6 +1162,7 @@ TS_TIMEZONES = [ datetime.timezone.max, ] TS_TZ_EXPECTED = [-1439, -60, 0, 60, 1439] +TS_TZ_BYTES_EXPECTED = [b"-2359", b"-0100", b"+0000", b"+0100", b"+2359"] TS_DATETIMES = [ datetime.datetime(2020, 2, 27, 14, 39, 19, tzinfo=UTC), datetime.datetime(2120, 12, 31, 23, 59, 59, tzinfo=UTC), @@ -1162,14 +1172,19 @@ TS_DT_EXPECTED = [1582814359, 4765132799, -11348929020] @pytest.mark.parametrize("date, seconds", zip(TS_DATETIMES, TS_DT_EXPECTED)) -@pytest.mark.parametrize("tz, offset", zip(TS_TIMEZONES, TS_TZ_EXPECTED)) +@pytest.mark.parametrize( + "tz, offset, offset_bytes", zip(TS_TIMEZONES, TS_TZ_EXPECTED, TS_TZ_BYTES_EXPECTED) +) @pytest.mark.parametrize("microsecond", [0, 1, 10, 100, 1000, 999999]) -def test_normalize_timestamp_datetime(date, seconds, tz, offset, microsecond): +def test_normalize_timestamp_datetime( + date, seconds, tz, offset, offset_bytes, microsecond +): date = date.astimezone(tz).replace(microsecond=microsecond) assert TimestampWithTimezone.from_dict(date).to_dict() == { "timestamp": {"seconds": seconds, "microseconds": microsecond}, "offset": offset, 
"negative_utc": False, + "offset_bytes": offset_bytes, } diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index 47f6d3c781de4ec09729509a68b083488e9dda81..68c3daa3347cbcd6973a914ce7594d9acd788f16 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -6,6 +6,7 @@ import collections import copy import datetime +import hashlib from typing import Any, List, Optional, Tuple, Union import attr @@ -17,6 +18,7 @@ import pytest from swh.model.collections import ImmutableDict from swh.model.from_disk import DentryPerms +import swh.model.git_objects from swh.model.hashutil import MultiHash, hash_to_bytes import swh.model.hypothesis_strategies as strategies import swh.model.model @@ -137,6 +139,7 @@ _TYPE_VALIDATOR_PARAMETERS: List[Tuple[Any, List[Any], List[Any]]] = [ [None, bytearray(b"\x12\x34"), "123", 0, 123, (), (1, 2, 3), ImmutableDict()], ), (str, ["", "123"], [None, b"123", b"", 0, (), (1, 2, 3), ImmutableDict()]), + (None, [None], [b"", b"123", "", "foo", 0, 123, ImmutableDict(), float("NaN")]), # unions: ( Optional[int], @@ -195,7 +198,10 @@ _TYPE_VALIDATOR_PARAMETERS: List[Tuple[Any, List[Any], List[Any]]] = [ # standard types: ( datetime.datetime, - [datetime.datetime.now(), datetime.datetime.now(tz=datetime.timezone.utc)], + [ + datetime.datetime(2021, 12, 15, 12, 59, 27), + datetime.datetime(2021, 12, 15, 12, 59, 27, tzinfo=datetime.timezone.utc), + ], [None, 123], ), # ImmutableDict @@ -447,21 +453,27 @@ def test_timestampwithtimezone(): tstz = TimestampWithTimezone(timestamp=ts, offset=0, negative_utc=False) attr.validate(tstz) assert tstz.negative_utc is False + assert tstz.offset_bytes == b"+0000" - attr.validate(TimestampWithTimezone(timestamp=ts, offset=10, negative_utc=False)) + tstz = TimestampWithTimezone(timestamp=ts, offset=10, negative_utc=False) + attr.validate(tstz) + assert tstz.offset_bytes == b"+0010" - attr.validate(TimestampWithTimezone(timestamp=ts, offset=-10, negative_utc=False)) + tstz = 
TimestampWithTimezone(timestamp=ts, offset=-10, negative_utc=False) + attr.validate(tstz) + assert tstz.offset_bytes == b"-0010" tstz = TimestampWithTimezone(timestamp=ts, offset=0, negative_utc=True) attr.validate(tstz) assert tstz.negative_utc is True + assert tstz.offset_bytes == b"-0000" with pytest.raises(AttributeTypeError): TimestampWithTimezone( timestamp=datetime.datetime.now(), offset=0, negative_utc=False ) - with pytest.raises(AttributeTypeError): + with pytest.raises((AttributeTypeError, TypeError)): TimestampWithTimezone(timestamp=ts, offset="0", negative_utc=False) with pytest.raises(AttributeTypeError): @@ -741,6 +753,42 @@ def test_skipped_content_naive_datetime(): # Directory +@given(strategies.directories().filter(lambda d: d.raw_manifest is None)) +def test_directory_check(directory): + directory.check() + + directory2 = attr.evolve(directory, id=b"\x00" * 20) + with pytest.raises(ValueError, match="does not match recomputed hash"): + directory2.check() + + directory2 = attr.evolve( + directory, raw_manifest=swh.model.git_objects.directory_git_object(directory) + ) + with pytest.raises( + ValueError, match="non-none raw_manifest attribute, but does not need it." 
+ ): + directory2.check() + + +@given(strategies.directories().filter(lambda d: d.raw_manifest is None)) +def test_directory_raw_manifest(directory): + assert "raw_manifest" not in directory.to_dict() + + raw_manifest = b"foo" + id_ = hashlib.new("sha1", raw_manifest).digest() + + directory2 = attr.evolve(directory, raw_manifest=raw_manifest) + assert directory2.to_dict()["raw_manifest"] == raw_manifest + with pytest.raises(ValueError, match="does not match recomputed hash"): + directory2.check() + + directory2 = attr.evolve(directory, raw_manifest=raw_manifest, id=id_) + assert directory2.id is not None + assert directory2.id == id_ != directory.id + assert directory2.to_dict()["raw_manifest"] == raw_manifest + directory2.check() + + def test_directory_entry_name_validation(): with pytest.raises(ValueError, match="valid directory entry name."): DirectoryEntry(name=b"foo/", type="dir", target=b"\x00" * 20, perms=0), @@ -762,9 +810,81 @@ def test_directory_duplicate_entry_name(): Directory(entries=entries) +# Release + + +@given(strategies.releases().filter(lambda rel: rel.raw_manifest is None)) +def test_release_check(release): + release.check() + + release2 = attr.evolve(release, id=b"\x00" * 20) + with pytest.raises(ValueError, match="does not match recomputed hash"): + release2.check() + + release2 = attr.evolve( + release, raw_manifest=swh.model.git_objects.release_git_object(release) + ) + with pytest.raises( + ValueError, match="non-none raw_manifest attribute, but does not need it." 
+ ): + release2.check() + + +@given(strategies.releases().filter(lambda rev: rev.raw_manifest is None)) +def test_release_raw_manifest(release): + raw_manifest = b"foo" + id_ = hashlib.new("sha1", raw_manifest).digest() + + release2 = attr.evolve(release, raw_manifest=raw_manifest) + assert release2.to_dict()["raw_manifest"] == raw_manifest + with pytest.raises(ValueError, match="does not match recomputed hash"): + release2.check() + + release2 = attr.evolve(release, raw_manifest=raw_manifest, id=id_) + assert release2.id is not None + assert release2.id == id_ != release.id + assert release2.to_dict()["raw_manifest"] == raw_manifest + release2.check() + + # Revision +@given(strategies.revisions().filter(lambda rev: rev.raw_manifest is None)) +def test_revision_check(revision): + revision.check() + + revision2 = attr.evolve(revision, id=b"\x00" * 20) + with pytest.raises(ValueError, match="does not match recomputed hash"): + revision2.check() + + revision2 = attr.evolve( + revision, raw_manifest=swh.model.git_objects.revision_git_object(revision) + ) + with pytest.raises( + ValueError, match="non-none raw_manifest attribute, but does not need it." 
+ ): + revision2.check() + + +@given(strategies.revisions().filter(lambda rev: rev.raw_manifest is None)) +def test_revision_raw_manifest(revision): + + raw_manifest = b"foo" + id_ = hashlib.new("sha1", raw_manifest).digest() + + revision2 = attr.evolve(revision, raw_manifest=raw_manifest) + assert revision2.to_dict()["raw_manifest"] == raw_manifest + with pytest.raises(ValueError, match="does not match recomputed hash"): + revision2.check() + + revision2 = attr.evolve(revision, raw_manifest=raw_manifest, id=id_) + assert revision2.id is not None + assert revision2.id == id_ != revision.id + assert revision2.to_dict()["raw_manifest"] == raw_manifest + revision2.check() + + def test_revision_extra_headers_no_headers(): rev_dict = revision_example.copy() rev_dict.pop("id") diff --git a/tox.ini b/tox.ini index b57d93905054f5dc6d2bf997b8f5ffa468a84117..5211a7c556c7f5558b8f298aa48bdefed0c95d2d 100644 --- a/tox.ini +++ b/tox.ini @@ -39,7 +39,7 @@ commands = extras = testing deps = - mypy + mypy==0.920 commands = mypy swh