diff --git a/PKG-INFO b/PKG-INFO index e1be14e1e0fbd93865b3bd4c98335e1b6fa3d6be..f9303105f82ade6db51d464ca50906ca6fafb0ea 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 2.9.0 +Version: 3.0.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/docs/persistent-identifiers.rst b/docs/persistent-identifiers.rst index bab3ff368cf0c8cc928bd039f370debc0e02dd72..adad1291414ad5c2d50e04c17bb7d342b1a5d50d 100644 --- a/docs/persistent-identifiers.rst +++ b/docs/persistent-identifiers.rst @@ -45,7 +45,7 @@ properties. Together, these identifiers form a `Merkle structure See the :ref:`Software Heritage data model <data-model>` for an overview of object types and how they are linked together. See -:py:mod:`swh.model.identifiers` for details on how the intrinsic identifiers +:py:mod:`swh.model.git_objects` for details on how the intrinsic identifiers embedded in SWHIDs are computed. 
The optional qualifiers are of two kinds: @@ -141,23 +141,23 @@ The actual object pointed to is identified by the intrinsic identifier ``<object_id>``, which is a hex-encoded (using lowercase ASCII characters) SHA1 computed on the content and metadata of the object itself, as follows: -* for **snapshots**, intrinsic identifiers are computed as per - :py:func:`swh.model.identifiers.snapshot_identifier` +* for **snapshots**, intrinsic identifiers are SHA1 hashes of manifests computed as per + :py:func:`swh.model.git_objects.snapshot_git_object` * for **releases**, as per - :py:func:`swh.model.identifiers.release_identifier` + :py:func:`swh.model.git_objects.release_git_object` that produces the same result as a git release hash * for **revisions**, as per - :py:func:`swh.model.identifiers.revision_identifier` + :py:func:`swh.model.git_objects.revision_git_object` that produces the same result as a git commit hash * for **directories**, per - :py:func:`swh.model.identifiers.directory_identifier` + :py:func:`swh.model.git_objects.directory_git_object` that produces the same result as a git tree hash * for **contents**, the intrinsic identifier is the ``sha1_git`` hash returned by - :py:func:`swh.model.identifiers.content_identifier`, i.e., the SHA1 of a byte + :py:meth:`swh.model.hashutil.MultiHash.digest`, i.e., the SHA1 of a byte sequence obtained by juxtaposing the ASCII string ``"blob"`` (without quotes), a space, the length of the content as decimal digits, a NULL byte, and the actual content of the file. 
diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index e1be14e1e0fbd93865b3bd4c98335e1b6fa3d6be..f9303105f82ade6db51d464ca50906ca6fafb0ea 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 2.9.0 +Version: 3.0.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/SOURCES.txt b/swh.model.egg-info/SOURCES.txt index 753984d84e32b477ac56d2e3cea965093bdab96d..bc820c0bdc2bf82bcca2fa147c3e0ef0cc8192ef 100644 --- a/swh.model.egg-info/SOURCES.txt +++ b/swh.model.egg-info/SOURCES.txt @@ -46,12 +46,14 @@ swh/model/cli.py swh/model/collections.py swh/model/exceptions.py swh/model/from_disk.py +swh/model/git_objects.py swh/model/hashutil.py swh/model/hypothesis_strategies.py swh/model/identifiers.py swh/model/merkle.py swh/model/model.py swh/model/py.typed +swh/model/swhids.py swh/model/toposort.py swh/model/validators.py swh/model/fields/__init__.py @@ -72,6 +74,7 @@ swh/model/tests/test_identifiers.py swh/model/tests/test_merkle.py swh/model/tests/test_model.py swh/model/tests/test_swh_model_data.py +swh/model/tests/test_swhids.py swh/model/tests/test_toposort.py swh/model/tests/test_validators.py swh/model/tests/data/dir-folders/sample-folder.tgz diff --git a/swh/model/cli.py b/swh/model/cli.py index e547aeb6c9afb12895897f404dee8dbd3a8d8cfd..ede67e2269beb471d486927f0b5043951bb4f0bd 100644 --- a/swh/model/cli.py +++ b/swh/model/cli.py @@ -26,7 +26,7 @@ except ImportError: swh_cli_group = click # type: ignore from swh.model.from_disk import Directory -from swh.model.identifiers import CoreSWHID, ObjectType +from swh.model.swhids import CoreSWHID CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @@ -42,7 +42,7 @@ _DULWICH_TYPES = { class CoreSWHIDParamType(click.ParamType): """Click argument that accepts a core SWHID and returns them as - 
:class:`swh.model.identifiers.CoreSWHID` instances """ + :class:`swh.model.swhids.CoreSWHID` instances """ name = "SWHID" @@ -87,17 +87,9 @@ def swhid_of_dir(path: bytes, exclude_patterns: Iterable[bytes] = None) -> CoreS def swhid_of_origin(url): - from swh.model.hashutil import hash_to_bytes - from swh.model.identifiers import ( - ExtendedObjectType, - ExtendedSWHID, - origin_identifier, - ) + from swh.model.model import Origin - return ExtendedSWHID( - object_type=ExtendedObjectType.ORIGIN, - object_id=hash_to_bytes(origin_identifier({"url": url})), - ) + return Origin(url).swhid() def swhid_of_git_repo(path) -> CoreSWHID: @@ -110,7 +102,7 @@ def swhid_of_git_repo(path) -> CoreSWHID: ) from swh.model import hashutil - from swh.model.identifiers import snapshot_identifier + from swh.model.model import Snapshot repo = dulwich.repo.Repo(path) @@ -133,10 +125,7 @@ def swhid_of_git_repo(path) -> CoreSWHID: snapshot = {"branches": branches} - return CoreSWHID( - object_type=ObjectType.SNAPSHOT, - object_id=hashutil.hash_to_bytes(snapshot_identifier(snapshot)), - ) + return Snapshot.from_dict(snapshot).swhid() def identify_object( diff --git a/swh/model/collections.py b/swh/model/collections.py index 5fd8f68221564efab463ce694bf972cc39ca52c6..2a5195746187eaaea38fca78b8784b6596e9309e 100644 --- a/swh/model/collections.py +++ b/swh/model/collections.py @@ -3,6 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +"""Utility data structures.""" + from collections.abc import Mapping from typing import Dict, Generic, Iterable, Optional, Tuple, TypeVar, Union @@ -11,6 +13,11 @@ VT = TypeVar("VT") class ImmutableDict(Mapping, Generic[KT, VT]): + """A frozen dictionary. + + This class behaves like a dictionary, but internally stores objects in a tuple, + so it is both immutable and hashable.""" + data: Tuple[Tuple[KT, VT], ...] 
def __init__( diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py index a0c4b292c22ffc8c81884c28e229d5531e334076..450bb68ac55471d8588c0ad6dcb470a813527a36 100644 --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -3,6 +3,13 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +"""Conversion from filesystem tree to SWH objects. + +This module allows reading a tree of directories and files from a local +filesystem, and convert them to in-memory data structures, which can then +be exported to SWH data model objects, as defined in :mod:`swh.model.model`. +""" + import datetime import enum import fnmatch @@ -18,16 +25,10 @@ from typing_extensions import Final from . import model from .exceptions import InvalidDirectoryPath -from .hashutil import MultiHash -from .identifiers import ( - CoreSWHID, - ObjectType, - directory_entry_sort_key, - directory_identifier, -) -from .identifiers import identifier_to_bytes as id_to_bytes -from .identifiers import identifier_to_str as id_to_str +from .git_objects import directory_entry_sort_key +from .hashutil import MultiHash, hash_to_hex from .merkle import MerkleLeaf, MerkleNode +from .swhids import CoreSWHID, ObjectType @attr.s(frozen=True, slots=True) @@ -218,7 +219,7 @@ class Content(MerkleLeaf): return CoreSWHID(object_type=ObjectType.CONTENT, object_id=self.hash) def __repr__(self): - return "Content(id=%s)" % id_to_str(self.hash) + return "Content(id=%s)" % hash_to_hex(self.hash) def compute_hash(self): return self.data["sha1_git"] @@ -479,8 +480,8 @@ class Directory(MerkleNode): @property def entries(self): - """Child nodes, sorted by name in the same way `directory_identifier` - does.""" + """Child nodes, sorted by name in the same way + :func:`swh.model.git_objects.directory_git_object` does.""" if self.__entries is None: self.__entries = sorted( ( @@ -498,7 +499,7 @@ class Directory(MerkleNode): return 
CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=self.hash) def compute_hash(self): - return id_to_bytes(directory_identifier({"entries": self.entries})) + return model.Directory.from_dict({"entries": self.entries}).id def to_model(self) -> model.Directory: """Builds a `model.Directory` object based on this node; @@ -550,6 +551,6 @@ class Directory(MerkleNode): def __repr__(self): return "Directory(id=%s, entries=[%s])" % ( - id_to_str(self.hash), + hash_to_hex(self.hash), ", ".join(str(entry) for entry in self), ) diff --git a/swh/model/git_objects.py b/swh/model/git_objects.py new file mode 100644 index 0000000000000000000000000000000000000000..ab4b38d7e520d59def1b9ad6d5e4adba7c1715ea --- /dev/null +++ b/swh/model/git_objects.py @@ -0,0 +1,645 @@ +# Copyright (C) 2015-2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +""" +Converts SWH model objects to git(-like) objects + +Most of the functions in this module take as argument an object from +:mod:`swh.model.model`, and format it like a git object. + +They are the inverse functions of those in :mod:`swh.loader.git.converters`, +but with extensions, as SWH's model is a superset of Git's: + +* extensions of existing types (eg. revision/commit and release/tag dates + can be expressed with precision up to microseconds, to support formatting + Mercurial objects) +* new types, for SWH's specific needs (:class:`swh.model.model.RawExtrinsicMetadata` + and :class:`swh.model.model.ExtID`) +* support for somewhat corrupted git objects that we need to reproduce + +This is used for two purposes: + +* Format manifests that can be hashed to produce :ref:`intrinsic identifiers + <persistent-identifiers>` +* Write git objects to reproduce git repositories that were ingested in the archive. 
+""" + + +from __future__ import annotations + +import datetime +from functools import lru_cache +from typing import Dict, Iterable, List, Optional, Tuple, Union, cast +import warnings + +from . import model +from .collections import ImmutableDict +from .hashutil import git_object_header, hash_to_bytehex + + +def directory_entry_sort_key(entry: model.DirectoryEntry): + """The sorting key for tree entries""" + if isinstance(entry, dict): + # For backward compatibility + entry = model.DirectoryEntry.from_dict(entry) + if entry.type == "dir": + return entry.name + b"/" + else: + return entry.name + + +@lru_cache() +def _perms_to_bytes(perms): + """Convert the perms value to its canonical bytes representation""" + oc = oct(perms)[2:] + return oc.encode("ascii") + + +def escape_newlines(snippet): + """Escape the newlines present in snippet according to git rules. + + New lines in git manifests are escaped by indenting the next line by one + space. + + """ + + if b"\n" in snippet: + return b"\n ".join(snippet.split(b"\n")) + else: + return snippet + + +def format_date(date: model.Timestamp) -> bytes: + """Convert a date object into an UTC timestamp encoded as ascii bytes. + + Git stores timestamps as an integer number of seconds since the UNIX epoch. + + However, Software Heritage stores timestamps as an integer number of + microseconds (postgres type "datetime with timezone"). + + Therefore, we print timestamps with no microseconds as integers, and + timestamps with microseconds as floating point values. We elide the + trailing zeroes from microsecond values, to "future-proof" our + representation if we ever need more precision in timestamps. 
+ + """ + if isinstance(date, dict): + # For backward compatibility + date = model.Timestamp.from_dict(date) + + if not date.microseconds: + return str(date.seconds).encode() + else: + float_value = "%d.%06d" % (date.seconds, date.microseconds) + return float_value.rstrip("0").encode() + + +@lru_cache() +def format_offset(offset: int, negative_utc: Optional[bool] = None) -> bytes: + """Convert an integer number of minutes into an offset representation. + + The offset representation is [+-]hhmm where: + + - hh is the number of hours; + - mm is the number of minutes. + + A null offset is represented as +0000. + """ + if offset < 0 or offset == 0 and negative_utc: + sign = "-" + else: + sign = "+" + + hours = abs(offset) // 60 + minutes = abs(offset) % 60 + + t = "%s%02d%02d" % (sign, hours, minutes) + return t.encode() + + +def normalize_timestamp(time_representation): + """Normalize a time representation for processing by Software Heritage + + This function supports a numeric timestamp (representing a number of + seconds since the UNIX epoch, 1970-01-01 at 00:00 UTC), a + :obj:`datetime.datetime` object (with timezone information), or a + normalized Software Heritage time representation (idempotency). + + Args: + time_representation: the representation of a timestamp + + Returns: + dict: a normalized dictionary with three keys: + + - timestamp: a dict with two optional keys: + + - seconds: the integral number of seconds since the UNIX epoch + - microseconds: the integral number of microseconds + + - offset: the timezone offset as a number of minutes relative to + UTC + - negative_utc: a boolean representing whether the offset is -0000 + when offset = 0. + """ + if time_representation is None: + return None + else: + return model.TimestampWithTimezone.from_dict(time_representation).to_dict() + + +def directory_git_object(directory: Union[Dict, model.Directory]) -> bytes: + """Formats a directory as a git tree. 
+ + A directory's identifier is the tree sha1 à la git of a directory listing, + using the following algorithm, which is equivalent to the git algorithm for + trees: + + 1. Entries of the directory are sorted using the name (or the name with '/' + appended for directory entries) as key, in bytes order. + + 2. For each entry of the directory, the following bytes are output: + + - the octal representation of the permissions for the entry (stored in + the 'perms' member), which is a representation of the entry type: + + - b'100644' (int 33188) for files + - b'100755' (int 33261) for executable files + - b'120000' (int 40960) for symbolic links + - b'40000' (int 16384) for directories + - b'160000' (int 57344) for references to revisions + + - an ascii space (b'\x20') + - the entry's name (as raw bytes), stored in the 'name' member + - a null byte (b'\x00') + - the 20 byte long identifier of the object pointed at by the entry, + stored in the 'target' member: + + - for files or executable files: their blob sha1_git + - for symbolic links: the blob sha1_git of a file containing the link + destination + - for directories: their intrinsic identifier + - for revisions: their intrinsic identifier + + (Note that there is no separator between entries) + + """ + if isinstance(directory, dict): + # For backward compatibility + warnings.warn( + "directory_git_object's argument should be a swh.model.model.Directory " + "object.", + DeprecationWarning, + stacklevel=2, + ) + directory = model.Directory.from_dict(directory) + directory = cast(model.Directory, directory) + + components = [] + + for entry in sorted(directory.entries, key=directory_entry_sort_key): + components.extend( + [_perms_to_bytes(entry.perms), b"\x20", entry.name, b"\x00", entry.target,] + ) + + return format_git_object_from_parts("tree", components) + + +def format_git_object_from_headers( + git_type: str, + headers: Iterable[Tuple[bytes, bytes]], + message: Optional[bytes] = None, +) -> bytes: + """Format a 
git_object comprised of a git header and a manifest, + which is itself a sequence of `headers`, and an optional `message`. + + The git_object format, compatible with the git format for tag and commit + objects, is as follows: + + - for each `key`, `value` in `headers`, emit: + + - the `key`, literally + - an ascii space (``\\x20``) + - the `value`, with newlines escaped using :func:`escape_newlines`, + - an ascii newline (``\\x0a``) + + - if the `message` is not None, emit: + + - an ascii newline (``\\x0a``) + - the `message`, literally + + Args: + headers: a sequence of key/value headers stored in the manifest; + message: an optional message used to trail the manifest. + + Returns: + the formatted git_object as bytes + """ + entries: List[bytes] = [] + + for key, value in headers: + entries.extend((key, b" ", escape_newlines(value), b"\n")) + + if message is not None: + entries.extend((b"\n", message)) + + concatenated_entries = b"".join(entries) + + header = git_object_header(git_type, len(concatenated_entries)) + return header + concatenated_entries + + +def format_git_object_from_parts(git_type: str, parts: Iterable[bytes]) -> bytes: + """Similar to :func:`format_git_object_from_headers`, but for manifests made of + a flat list of entries, instead of key-value + message, ie. trees and snapshots.""" + concatenated_parts = b"".join(parts) + + header = git_object_header(git_type, len(concatenated_parts)) + return header + concatenated_parts + + +def format_author_data( + author: model.Person, date_offset: Optional[model.TimestampWithTimezone] +) -> bytes: + """Format authorship data according to git standards. 
+ + Git authorship data has two components: + + - an author specification, usually a name and email, but in practice an + arbitrary bytestring + - optionally, a timestamp with a UTC offset specification + + The authorship data is formatted thus:: + + `name and email`[ `timestamp` `utc_offset`] + + The timestamp is encoded as a (decimal) number of seconds since the UNIX + epoch (1970-01-01 at 00:00 UTC). As an extension to the git format, we + support fractional timestamps, using a dot as the separator for the decimal + part. + + The utc offset is a number of minutes encoded as '[+-]HHMM'. Note that some + tools can pass a negative offset corresponding to the UTC timezone + ('-0000'), which is valid and is encoded as such. + + Returns: + the byte string containing the authorship data + """ + + ret = [author.fullname] + + if date_offset is not None: + date_f = format_date(date_offset.timestamp) + offset_f = format_offset(date_offset.offset, date_offset.negative_utc) + + ret.extend([b" ", date_f, b" ", offset_f]) + + return b"".join(ret) + + +def revision_git_object(revision: Union[Dict, model.Revision]) -> bytes: + """Formats a revision as a git commit. + + The fields used for the revision identifier computation are: + + - directory + - parents + - author + - author_date + - committer + - committer_date + - extra_headers or metadata -> extra_headers + - message + + A revision's identifier is the 'git'-checksum of a commit manifest + constructed as follows (newlines are a single ASCII newline character):: + + tree <directory identifier> + [for each parent in parents] + parent <parent identifier> + [end for each parents] + author <author> <author_date> + committer <committer> <committer_date> + [for each key, value in extra_headers] + <key> <encoded value> + [end for each extra_headers] + + <message> + + The directory identifier is the ascii representation of its hexadecimal + encoding. 
+ + Author and committer are formatted using the :attr:`Person.fullname` attribute only. + Dates are formatted with the :func:`format_offset` function. + + Extra headers are an ordered list of [key, value] pairs. Keys are strings + and get encoded to utf-8 for identifier computation. Values are either byte + strings, unicode strings (that get encoded to utf-8), or integers (that get + encoded to their utf-8 decimal representation). + + Multiline extra header values are escaped by indenting the continuation + lines with one ascii space. + + If the message is None, the manifest ends with the last header. Else, the + message is appended to the headers after an empty line. + + The checksum of the full manifest is computed using the 'commit' git object + type. + + """ + if isinstance(revision, dict): + # For backward compatibility + warnings.warn( + "revision_git_object's argument should be a swh.model.model.Revision " + "object.", + DeprecationWarning, + stacklevel=2, + ) + revision = model.Revision.from_dict(revision) + revision = cast(model.Revision, revision) + + headers = [(b"tree", hash_to_bytehex(revision.directory))] + for parent in revision.parents: + if parent: + headers.append((b"parent", hash_to_bytehex(parent))) + + headers.append((b"author", format_author_data(revision.author, revision.date))) + headers.append( + (b"committer", format_author_data(revision.committer, revision.committer_date),) + ) + + # Handle extra headers + metadata = revision.metadata or ImmutableDict() + extra_headers = revision.extra_headers or () + if not extra_headers and "extra_headers" in metadata: + extra_headers = metadata["extra_headers"] + + headers.extend(extra_headers) + + return format_git_object_from_headers("commit", headers, revision.message) + + +def target_type_to_git(target_type: model.ObjectType) -> bytes: + """Convert a software heritage target type to a git object type""" + return { + model.ObjectType.CONTENT: b"blob", + model.ObjectType.DIRECTORY: b"tree", + 
model.ObjectType.REVISION: b"commit", + model.ObjectType.RELEASE: b"tag", + model.ObjectType.SNAPSHOT: b"refs", + }[target_type] + + +def release_git_object(release: Union[Dict, model.Release]) -> bytes: + if isinstance(release, dict): + # For backward compatibility + warnings.warn( + "release_git_object's argument should be a swh.model.model.Directory " + "object.", + DeprecationWarning, + stacklevel=2, + ) + release = model.Release.from_dict(release) + release = cast(model.Release, release) + + headers = [ + (b"object", hash_to_bytehex(release.target)), + (b"type", target_type_to_git(release.target_type)), + (b"tag", release.name), + ] + + if release.author is not None: + headers.append((b"tagger", format_author_data(release.author, release.date))) + + return format_git_object_from_headers("tag", headers, release.message) + + +def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes: + """Formats a snapshot as a git-like object. + + Snapshots are a set of named branches, which are pointers to objects at any + level of the Software Heritage DAG. + + As well as pointing to other objects in the Software Heritage DAG, branches + can also be *alias*es, in which case their target is the name of another + branch in the same snapshot, or *dangling*, in which case the target is + unknown (and represented by the ``None`` value). + + A snapshot identifier is a salted sha1 (using the git hashing algorithm + with the ``snapshot`` object type) of a manifest following the algorithm: + + 1. Branches are sorted using the name as key, in bytes order. + + 2. 
For each branch, the following bytes are output: + + - the type of the branch target: + + - ``content``, ``directory``, ``revision``, ``release`` or ``snapshot`` + for the corresponding entries in the DAG; + - ``alias`` for branches referencing another branch; + - ``dangling`` for dangling branches + + - an ascii space (``\\x20``) + - the branch name (as raw bytes) + - a null byte (``\\x00``) + - the length of the target identifier, as an ascii-encoded decimal number + (``20`` for current intrinsic identifiers, ``0`` for dangling + branches, the length of the target branch name for branch aliases) + - a colon (``:``) + - the identifier of the target object pointed at by the branch, + stored in the 'target' member: + + - for contents: their *sha1_git* + - for directories, revisions, releases or snapshots: their intrinsic + identifier + - for branch aliases, the name of the target branch (as raw bytes) + - for dangling branches, the empty string + + Note that, akin to directory manifests, there is no separator between + entries. Because of symbolic branches, identifiers are of arbitrary + length but are length-encoded to avoid ambiguity. 
+ """ + if isinstance(snapshot, dict): + # For backward compatibility + warnings.warn( + "snapshot_git_object's argument should be a swh.model.model.Snapshot " + "object.", + DeprecationWarning, + stacklevel=2, + ) + snapshot = model.Snapshot.from_dict(snapshot) + snapshot = cast(model.Snapshot, snapshot) + + unresolved = [] + lines = [] + + for name, target in sorted(snapshot.branches.items()): + if not target: + target_type = b"dangling" + target_id = b"" + elif target.target_type == model.TargetType.ALIAS: + target_type = b"alias" + target_id = target.target + if target_id not in snapshot.branches or target_id == name: + unresolved.append((name, target_id)) + else: + target_type = target.target_type.value.encode() + target_id = target.target + + lines.extend( + [ + target_type, + b"\x20", + name, + b"\x00", + ("%d:" % len(target_id)).encode(), + target_id, + ] + ) + + if unresolved: + raise ValueError( + "Branch aliases unresolved: %s" + % ", ".join("%r -> %r" % x for x in unresolved), + unresolved, + ) + + return format_git_object_from_parts("snapshot", lines) + + +def raw_extrinsic_metadata_git_object( + metadata: Union[Dict, model.RawExtrinsicMetadata] +) -> bytes: + """Formats RawExtrinsicMetadata as a git-like object. + + A raw_extrinsic_metadata identifier is a salted sha1 (using the git + hashing algorithm with the ``raw_extrinsic_metadata`` object type) of + a manifest following the format:: + + target $ExtendedSwhid + discovery_date $Timestamp + authority $StrWithoutSpaces $IRI + fetcher $Str $Version + format $StrWithoutSpaces + origin $IRI <- optional + visit $IntInDecimal <- optional + snapshot $CoreSwhid <- optional + release $CoreSwhid <- optional + revision $CoreSwhid <- optional + path $Bytes <- optional + directory $CoreSwhid <- optional + + $MetadataBytes + + $IRI must be RFC 3987 IRIs (so they may contain newlines, that are escaped as + described below) + + $StrWithoutSpaces and $Version are ASCII strings, and may not contain spaces. 
+ + $Str is a UTF-8 string. + + $CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`. + $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for + origins and 'emd' for raw extrinsic metadata) + + $Timestamp is a decimal representation of the rounded-down integer number of + seconds since the UNIX epoch (1970-01-01 00:00:00 UTC), + with no leading '0' (unless the timestamp value is zero) and no timezone. + It may be negative by prefixing it with a '-', which must not be followed + by a '0'. + + Newlines in $Bytes, $Str, and $IRI are escaped as with other git fields, + ie. by adding a space after them. + """ + if isinstance(metadata, dict): + # For backward compatibility + warnings.warn( + "raw_extrinsic_metadata_git_object's argument should be a " + "swh.model.model.RawExtrinsicMetadata object.", + DeprecationWarning, + stacklevel=2, + ) + metadata = model.RawExtrinsicMetadata.from_dict(metadata) + metadata = cast(model.RawExtrinsicMetadata, metadata) + + # equivalent to using math.floor(dt.timestamp()) to round down, + # as int(dt.timestamp()) rounds toward zero, + # which would map two seconds on the 0 timestamp. + # + # This should never be an issue in practice as Software Heritage didn't + # start collecting metadata before 2015. 
+ timestamp = ( + metadata.discovery_date.astimezone(datetime.timezone.utc) + .replace(microsecond=0) + .timestamp() + ) + assert timestamp.is_integer() + + headers = [ + (b"target", str(metadata.target).encode()), + (b"discovery_date", str(int(timestamp)).encode("ascii")), + ( + b"authority", + f"{metadata.authority.type.value} {metadata.authority.url}".encode(), + ), + (b"fetcher", f"{metadata.fetcher.name} {metadata.fetcher.version}".encode(),), + (b"format", metadata.format.encode()), + ] + + for key in ( + "origin", + "visit", + "snapshot", + "release", + "revision", + "path", + "directory", + ): + if getattr(metadata, key, None) is not None: + value: bytes + if key == "path": + value = getattr(metadata, key) + else: + value = str(getattr(metadata, key)).encode() + + headers.append((key.encode("ascii"), value)) + + return format_git_object_from_headers( + "raw_extrinsic_metadata", headers, metadata.metadata + ) + + +def extid_git_object(extid: model.ExtID) -> bytes: + """Formats an extid as a git-like object. + + An ExtID identifier is a salted sha1 (using the git hashing algorithm with + the ``extid`` object type) of a manifest following the format: + + ``` + extid_type $StrWithoutSpaces + [extid_version $Str] + extid $Bytes + target $CoreSwhid + ``` + + $StrWithoutSpaces is an ASCII string, and may not contain spaces. + + Newlines in $Bytes are escaped as with other git fields, ie. by adding a + space after them. + + The extid_version line is only generated if the version is non-zero. 
+ """ + + headers = [ + (b"extid_type", extid.extid_type.encode("ascii")), + ] + extid_version = extid.extid_version + if extid_version != 0: + headers.append((b"extid_version", str(extid_version).encode("ascii"))) + + headers.extend( + [(b"extid", extid.extid), (b"target", str(extid.target).encode("ascii")),] + ) + + return format_git_object_from_headers("extid", headers) diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py index ac912d4cb9d1e84eaaaade4675b06f07ab54fa8e..c8644a39134d8199f3431d112e6f7644c52bcafa 100644 --- a/swh/model/hypothesis_strategies.py +++ b/swh/model/hypothesis_strategies.py @@ -29,12 +29,6 @@ from hypothesis.strategies import ( ) from .from_disk import DentryPerms -from .identifiers import ( - ExtendedObjectType, - ExtendedSWHID, - identifier_to_bytes, - snapshot_identifier, -) from .model import ( BaseContent, Content, @@ -58,6 +52,7 @@ from .model import ( Timestamp, TimestampWithTimezone, ) +from .swhids import ExtendedObjectType, ExtendedSWHID pgsql_alphabet = characters( blacklist_categories=("Cs",), blacklist_characters=["\u0000"] @@ -400,7 +395,7 @@ def snapshots_d(draw, *, min_size=0, max_size=100, only_objects=False): # Ensure no cycles between aliases while True: try: - id_ = snapshot_identifier( + snapshot = Snapshot.from_dict( { "branches": { name: branch or None for (name, branch) in branches.items() @@ -413,7 +408,7 @@ def snapshots_d(draw, *, min_size=0, max_size=100, only_objects=False): else: break - return dict(id=identifier_to_bytes(id_), branches=branches) + return snapshot.to_dict() def snapshots(*, min_size=0, max_size=100, only_objects=False): diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index 421d2e77cd0111f35b8a787a0f5b3d41443973c2..6fa63666ba22f21f90bfbcb0b8cd6bfbaa607b0f 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -3,60 +3,25 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more 
information -from __future__ import annotations +from typing import Any, Dict +import warnings -import binascii -import datetime -import enum -from functools import lru_cache -import hashlib -import re -from typing import ( - Any, - Dict, - Generic, - Iterable, - List, - Optional, - Tuple, - Type, - TypeVar, - Union, -) -import urllib.parse - -import attr -from attrs_strict import type_validator - -from .exceptions import ValidationError -from .hashutil import MultiHash, git_object_header, hash_to_bytes, hash_to_hex - - -class ObjectType(enum.Enum): - """Possible object types of a QualifiedSWHID or CoreSWHID. - - The values of each variant is what is used in the SWHID's string representation.""" - - SNAPSHOT = "snp" - REVISION = "rev" - RELEASE = "rel" - DIRECTORY = "dir" - CONTENT = "cnt" - - -class ExtendedObjectType(enum.Enum): - """Possible object types of an ExtendedSWHID. +from . import model - The variants are a superset of :class:`ObjectType`'s""" +# Reexport for backward compatibility +from .git_objects import * # noqa +from .hashutil import MultiHash, hash_to_hex - SNAPSHOT = "snp" - REVISION = "rev" - RELEASE = "rel" - DIRECTORY = "dir" - CONTENT = "cnt" - ORIGIN = "ori" - RAW_EXTRINSIC_METADATA = "emd" +# Reexport for backward compatibility +from .swhids import * # noqa +warnings.warn( + "The swh.model.identifiers module is deprecated. 
" + "SWHID-related classes were moved to swh.model.swhids, and identifier " + "computation is now done directly with swh.model.model classes.", + DeprecationWarning, + stacklevel=2, +) # The following are deprecated aliases of the variants defined in ObjectType # while transitioning from SWHID to QualifiedSWHID @@ -69,1208 +34,58 @@ CONTENT = "content" RAW_EXTRINSIC_METADATA = "raw_extrinsic_metadata" -SWHID_NAMESPACE = "swh" -SWHID_VERSION = 1 -SWHID_TYPES = ["snp", "rel", "rev", "dir", "cnt"] -EXTENDED_SWHID_TYPES = SWHID_TYPES + ["ori", "emd"] -SWHID_SEP = ":" -SWHID_CTXT_SEP = ";" -SWHID_QUALIFIERS = {"origin", "anchor", "visit", "path", "lines"} - -SWHID_RE_RAW = ( - f"(?P<namespace>{SWHID_NAMESPACE})" - f"{SWHID_SEP}(?P<scheme_version>{SWHID_VERSION})" - f"{SWHID_SEP}(?P<object_type>{'|'.join(EXTENDED_SWHID_TYPES)})" - f"{SWHID_SEP}(?P<object_id>[0-9a-f]{{40}})" - f"({SWHID_CTXT_SEP}(?P<qualifiers>\\S+))?" -) -SWHID_RE = re.compile(SWHID_RE_RAW) - - -@lru_cache() -def identifier_to_bytes(identifier): - """Convert a text identifier to bytes. - - Args: - identifier: an identifier, either a 40-char hexadecimal string or a - bytes object of length 20 - Returns: - The length 20 bytestring corresponding to the given identifier - - Raises: - ValueError: if the identifier is of an unexpected type or length. - """ - - if isinstance(identifier, bytes): - if len(identifier) != 20: - raise ValueError( - "Wrong length for bytes identifier %s, expected 20" % len(identifier) - ) - return identifier - - if isinstance(identifier, str): - if len(identifier) != 40: - raise ValueError( - "Wrong length for str identifier %s, expected 40" % len(identifier) - ) - return bytes.fromhex(identifier) - - raise ValueError( - "Wrong type for identifier %s, expected bytes or str" - % identifier.__class__.__name__ - ) - - -@lru_cache() -def identifier_to_str(identifier): - """Convert an identifier to an hexadecimal string. 
- - Args: - identifier: an identifier, either a 40-char hexadecimal string or a - bytes object of length 20 - - Returns: - The length 40 string corresponding to the given identifier, hex encoded - - Raises: - ValueError: if the identifier is of an unexpected type or length. - """ - - if isinstance(identifier, str): - if len(identifier) != 40: - raise ValueError( - "Wrong length for str identifier %s, expected 40" % len(identifier) - ) - return identifier - - if isinstance(identifier, bytes): - if len(identifier) != 20: - raise ValueError( - "Wrong length for bytes identifier %s, expected 20" % len(identifier) - ) - return binascii.hexlify(identifier).decode() - - raise ValueError( - "Wrong type for identifier %s, expected bytes or str" - % identifier.__class__.__name__ - ) - - def content_identifier(content: Dict[str, Any]) -> Dict[str, bytes]: - """Return the intrinsic identifier for a content. - - A content's identifier is the sha1, sha1_git and sha256 checksums of its - data. - - Args: - content: a content conforming to the Software Heritage schema - - Returns: - A dictionary with all the hashes for the data - - Raises: - KeyError: if the content doesn't have a data member. - + """Deprecated, use :class:`swh.model.Content` instead: + ``content_identifier(d)`` is equivalent to: + ``Content.from_data(d["data"]).hashes()`` (a dict of raw ``bytes`` digests) """ - return MultiHash.from_data(content["data"]).digest() -def directory_entry_sort_key(entry): - """The sorting key for tree entries""" - if entry["type"] == "dir": - return entry["name"] + b"/" - else: - return entry["name"] - - -@lru_cache() -def _perms_to_bytes(perms): - """Convert the perms value to its bytes representation""" - oc = oct(perms)[2:] - return oc.encode("ascii") - - -def escape_newlines(snippet): - """Escape the newlines present in snippet according to git rules. - - New lines in git manifests are escaped by indenting the next line by one - space. 
- - """ - - if b"\n" in snippet: - return b"\n ".join(snippet.split(b"\n")) - else: - return snippet - - def directory_identifier(directory: Dict[str, Any]) -> str: - """Return the intrinsic identifier for a directory. - - A directory's identifier is the tree sha1 à la git of a directory listing, - using the following algorithm, which is equivalent to the git algorithm for - trees: - - 1. Entries of the directory are sorted using the name (or the name with '/' - appended for directory entries) as key, in bytes order. - - 2. For each entry of the directory, the following bytes are output: - - - the octal representation of the permissions for the entry (stored in - the 'perms' member), which is a representation of the entry type: - - - b'100644' (int 33188) for files - - b'100755' (int 33261) for executable files - - b'120000' (int 40960) for symbolic links - - b'40000' (int 16384) for directories - - b'160000' (int 57344) for references to revisions - - - an ascii space (b'\x20') - - the entry's name (as raw bytes), stored in the 'name' member - - a null byte (b'\x00') - - the 20 byte long identifier of the object pointed at by the entry, - stored in the 'target' member: - - - for files or executable files: their blob sha1_git - - for symbolic links: the blob sha1_git of a file containing the link - destination - - for directories: their intrinsic identifier - - for revisions: their intrinsic identifier + """Deprecated, use :class:`swh.model.Directory` instead: + ``directory_identifier(d)`` is equivalent to: + ``hash_to_hex(Directory.from_dict(d).id)``. 
- (Note that there is no separator between entries) - - """ - git_object = directory_git_object(directory) - return hashlib.new("sha1", git_object).hexdigest() - - -def directory_git_object(directory: Dict[str, Any]) -> bytes: - components = [] - - for entry in sorted(directory["entries"], key=directory_entry_sort_key): - components.extend( - [ - _perms_to_bytes(entry["perms"]), - b"\x20", - entry["name"], - b"\x00", - identifier_to_bytes(entry["target"]), - ] - ) - - return format_git_object_from_parts("tree", components) - - -def format_date(date): - """Convert a date object into an UTC timestamp encoded as ascii bytes. - - Git stores timestamps as an integer number of seconds since the UNIX epoch. - - However, Software Heritage stores timestamps as an integer number of - microseconds (postgres type "datetime with timezone"). - - Therefore, we print timestamps with no microseconds as integers, and - timestamps with microseconds as floating point values. We elide the - trailing zeroes from microsecond values, to "future-proof" our - representation if we ever need more precision in timestamps. - - """ - if not isinstance(date, dict): - raise ValueError("format_date only supports dicts, %r received" % date) - - seconds = date.get("seconds", 0) - microseconds = date.get("microseconds", 0) - if not microseconds: - return str(seconds).encode() - else: - float_value = "%d.%06d" % (seconds, microseconds) - return float_value.rstrip("0").encode() - - -@lru_cache() -def format_offset(offset, negative_utc=None): - """Convert an integer number of minutes into an offset representation. - - The offset representation is [+-]hhmm where: - - - hh is the number of hours; - - mm is the number of minutes. - - A null offset is represented as +0000. 
- """ - if offset < 0 or offset == 0 and negative_utc: - sign = "-" - else: - sign = "+" - - hours = abs(offset) // 60 - minutes = abs(offset) % 60 - - t = "%s%02d%02d" % (sign, hours, minutes) - return t.encode() - - -def normalize_timestamp(time_representation): - """Normalize a time representation for processing by Software Heritage - - This function supports a numeric timestamp (representing a number of - seconds since the UNIX epoch, 1970-01-01 at 00:00 UTC), a - :obj:`datetime.datetime` object (with timezone information), or a - normalized Software Heritage time representation (idempotency). - - Args: - time_representation: the representation of a timestamp - - Returns: - dict: a normalized dictionary with three keys: - - - timestamp: a dict with two optional keys: - - - seconds: the integral number of seconds since the UNIX epoch - - microseconds: the integral number of microseconds - - - offset: the timezone offset as a number of minutes relative to - UTC - - negative_utc: a boolean representing whether the offset is -0000 - when offset = 0. 
- - """ - - if time_representation is None: - return None - - negative_utc = False - - if isinstance(time_representation, dict): - ts = time_representation["timestamp"] - if isinstance(ts, dict): - seconds = ts.get("seconds", 0) - microseconds = ts.get("microseconds", 0) - elif isinstance(ts, int): - seconds = ts - microseconds = 0 - else: - raise ValueError( - "normalize_timestamp received non-integer timestamp member:" " %r" % ts - ) - offset = time_representation["offset"] - if "negative_utc" in time_representation: - negative_utc = time_representation["negative_utc"] - if negative_utc is None: - negative_utc = False - elif isinstance(time_representation, datetime.datetime): - microseconds = time_representation.microsecond - if microseconds: - time_representation = time_representation.replace(microsecond=0) - seconds = int(time_representation.timestamp()) - utcoffset = time_representation.utcoffset() - if utcoffset is None: - raise ValueError( - "normalize_timestamp received datetime without timezone: %s" - % time_representation - ) - - # utcoffset is an integer number of minutes - seconds_offset = utcoffset.total_seconds() - offset = int(seconds_offset) // 60 - elif isinstance(time_representation, int): - seconds = time_representation - microseconds = 0 - offset = 0 - else: - raise ValueError( - "normalize_timestamp received non-integer timestamp:" - " %r" % time_representation - ) - - return { - "timestamp": {"seconds": seconds, "microseconds": microseconds,}, - "offset": offset, - "negative_utc": negative_utc, - } - - -def format_author(author): - """Format the specification of an author. - - An author is either a byte string (passed unchanged), or a dict with three - keys, fullname, name and email. - - If the fullname exists, return it; if it doesn't, we construct a fullname - using the following heuristics: if the name value is None, we return the - email in angle brackets, else, we return the name, a space, and the email - in angle brackets. 
- - """ - if isinstance(author, bytes) or author is None: - return author - - if "fullname" in author: - return author["fullname"] - - ret = [] - if author["name"] is not None: - ret.append(author["name"]) - if author["email"] is not None: - ret.append(b"".join([b"<", author["email"], b">"])) - - return b" ".join(ret) - - -def format_git_object_from_headers( - git_type: str, - headers: Iterable[Tuple[bytes, bytes]], - message: Optional[bytes] = None, -) -> bytes: - """Format a git_object comprised of a git header and a manifest, - which is itself a sequence of `headers`, and an optional `message`. - - The git_object format, compatible with the git format for tag and commit - objects, is as follows: - - - for each `key`, `value` in `headers`, emit: - - - the `key`, literally - - an ascii space (``\\x20``) - - the `value`, with newlines escaped using :func:`escape_newlines`, - - an ascii newline (``\\x0a``) - - - if the `message` is not None, emit: - - - an ascii newline (``\\x0a``) - - the `message`, literally - - Args: - headers: a sequence of key/value headers stored in the manifest; - message: an optional message used to trail the manifest. - - Returns: - the formatted git_object as bytes - """ - entries: List[bytes] = [] - - for key, value in headers: - entries.extend((key, b" ", escape_newlines(value), b"\n")) - - if message is not None: - entries.extend((b"\n", message)) - - concatenated_entries = b"".join(entries) - - header = git_object_header(git_type, len(concatenated_entries)) - return header + concatenated_entries - - -def format_git_object_from_parts(git_type: str, parts: Iterable[bytes]) -> bytes: - """Similar to :func:`format_git_object_from_headers`, but for manifests made of - a flat list of entries, instead of key-value + message, ie. 
trees and snapshots.""" - concatenated_parts = b"".join(parts) - - header = git_object_header(git_type, len(concatenated_parts)) - return header + concatenated_parts - - -def format_author_data(author, date_offset) -> bytes: - """Format authorship data according to git standards. - - Git authorship data has two components: - - - an author specification, usually a name and email, but in practice an - arbitrary bytestring - - optionally, a timestamp with a UTC offset specification - - The authorship data is formatted thus:: - - `name and email`[ `timestamp` `utc_offset`] - - The timestamp is encoded as a (decimal) number of seconds since the UNIX - epoch (1970-01-01 at 00:00 UTC). As an extension to the git format, we - support fractional timestamps, using a dot as the separator for the decimal - part. - - The utc offset is a number of minutes encoded as '[+-]HHMM'. Note that some - tools can pass a negative offset corresponding to the UTC timezone - ('-0000'), which is valid and is encoded as such. - - Args: - author: an author specification (dict with two bytes values: name and - email, or byte value) - date_offset: a normalized date/time representation as returned by - :func:`normalize_timestamp`. - - Returns: - the byte string containing the authorship data - - """ - - ret = [format_author(author)] - - date_offset = normalize_timestamp(date_offset) - - if date_offset is not None: - date_f = format_date(date_offset["timestamp"]) - offset_f = format_offset(date_offset["offset"], date_offset["negative_utc"]) - - ret.extend([b" ", date_f, b" ", offset_f]) - - return b"".join(ret) + See :func:`swh.model.git_objects.directory_git_object` for details of the + format used to generate this identifier.""" + return hash_to_hex(model.Directory.from_dict(directory).id) def revision_identifier(revision: Dict[str, Any]) -> str: - """Return the intrinsic identifier for a revision. 
- - The fields used for the revision identifier computation are: - - - directory - - parents - - author - - author_date - - committer - - committer_date - - extra_headers or metadata -> extra_headers - - message - - A revision's identifier is the 'git'-checksum of a commit manifest - constructed as follows (newlines are a single ASCII newline character):: - - tree <directory identifier> - [for each parent in parents] - parent <parent identifier> - [end for each parents] - author <author> <author_date> - committer <committer> <committer_date> - [for each key, value in extra_headers] - <key> <encoded value> - [end for each extra_headers] - - <message> - - The directory identifier is the ascii representation of its hexadecimal - encoding. - - Author and committer are formatted with the :func:`format_author` function. - Dates are formatted with the :func:`format_offset` function. - - Extra headers are an ordered list of [key, value] pairs. Keys are strings - and get encoded to utf-8 for identifier computation. Values are either byte - strings, unicode strings (that get encoded to utf-8), or integers (that get - encoded to their utf-8 decimal representation). - - Multiline extra header values are escaped by indenting the continuation - lines with one ascii space. + """Deprecated, use :class:`swh.model.Revision` instead: + ``revision_identifier(d)`` is equivalent to: + ``hash_to_hex(Revision.from_dict(d).id)``. - If the message is None, the manifest ends with the last header. Else, the - message is appended to the headers after an empty line. - - The checksum of the full manifest is computed using the 'commit' git object - type. - - """ - git_object = revision_git_object(revision) - return hashlib.new("sha1", git_object).hexdigest() - - -def revision_git_object(revision: Dict[str, Any]) -> bytes: - """Formats the git_object of a revision. 
See :func:`revision_identifier` for details - on the format.""" - headers = [(b"tree", identifier_to_str(revision["directory"]).encode())] - for parent in revision["parents"]: - if parent: - headers.append((b"parent", identifier_to_str(parent).encode())) - - headers.append( - (b"author", format_author_data(revision["author"], revision["date"])) - ) - headers.append( - ( - b"committer", - format_author_data(revision["committer"], revision["committer_date"]), - ) - ) - - # Handle extra headers - metadata = revision.get("metadata") or {} - extra_headers = revision.get("extra_headers", ()) - if not extra_headers and "extra_headers" in metadata: - extra_headers = metadata["extra_headers"] - - headers.extend(extra_headers) - - return format_git_object_from_headers("commit", headers, revision["message"]) - - -def target_type_to_git(target_type: str) -> bytes: - """Convert a software heritage target type to a git object type""" - return { - "content": b"blob", - "directory": b"tree", - "revision": b"commit", - "release": b"tag", - "snapshot": b"refs", - }[target_type] + See :func:`swh.model.git_objects.revision_git_object` for details of the + format used to generate this identifier.""" + return hash_to_hex(model.Revision.from_dict(revision).id) def release_identifier(release: Dict[str, Any]) -> str: - """Return the intrinsic identifier for a release.""" - git_object = release_git_object(release) - return hashlib.new("sha1", git_object).hexdigest() - - -def release_git_object(release: Dict[str, Any]) -> bytes: - headers = [ - (b"object", identifier_to_str(release["target"]).encode()), - (b"type", target_type_to_git(release["target_type"])), - (b"tag", release["name"]), - ] - - if "author" in release and release["author"]: - headers.append( - (b"tagger", format_author_data(release["author"], release["date"])) - ) - - return format_git_object_from_headers("tag", headers, release["message"]) - - -def snapshot_identifier( - snapshot: Dict[str, Any], *, ignore_unresolved: bool 
= False -) -> str: - """Return the intrinsic identifier for a snapshot. - - Snapshots are a set of named branches, which are pointers to objects at any - level of the Software Heritage DAG. + """Deprecated, use :class:`swh.model.Release` instead: + ``release_identifier(d)`` is equivalent to: + ``hash_to_hex(Release.from_dict(d).id)``. - As well as pointing to other objects in the Software Heritage DAG, branches - can also be *alias*es, in which case their target is the name of another - branch in the same snapshot, or *dangling*, in which case the target is - unknown (and represented by the ``None`` value). + See :func:`swh.model.git_objects.release_git_object` for details of the + format used to generate this identifier.""" + return hash_to_hex(model.Release.from_dict(release).id) - A snapshot identifier is a salted sha1 (using the git hashing algorithm - with the ``snapshot`` object type) of a manifest following the algorithm: - 1. Branches are sorted using the name as key, in bytes order. +def snapshot_identifier(snapshot: Dict[str, Any]) -> str: + """Deprecated, use :class:`swh.model.Snapshot` instead: + ``snapshot_identifier(d)`` is equivalent to: + ``hash_to_hex(Snapshot.from_dict(d).id)``. - 2. 
For each branch, the following bytes are output: - - - the type of the branch target: - - - ``content``, ``directory``, ``revision``, ``release`` or ``snapshot`` - for the corresponding entries in the DAG; - - ``alias`` for branches referencing another branch; - - ``dangling`` for dangling branches - - - an ascii space (``\\x20``) - - the branch name (as raw bytes) - - a null byte (``\\x00``) - - the length of the target identifier, as an ascii-encoded decimal number - (``20`` for current intrinsic identifiers, ``0`` for dangling - branches, the length of the target branch name for branch aliases) - - a colon (``:``) - - the identifier of the target object pointed at by the branch, - stored in the 'target' member: - - - for contents: their *sha1_git* - - for directories, revisions, releases or snapshots: their intrinsic - identifier - - for branch aliases, the name of the target branch (as raw bytes) - - for dangling branches, the empty string - - Note that, akin to directory manifests, there is no separator between - entries. Because of symbolic branches, identifiers are of arbitrary - length but are length-encoded to avoid ambiguity. - - Args: - snapshot (dict): the snapshot of which to compute the identifier. A - single entry is needed, ``'branches'``, which is itself a :class:`dict` - mapping each branch to its target - ignore_unresolved (bool): if `True`, ignore unresolved branch aliases. - - Returns: - str: the intrinsic identifier for `snapshot` - - """ - git_object = snapshot_git_object(snapshot, ignore_unresolved=ignore_unresolved) - return hashlib.new("sha1", git_object).hexdigest() - - -def snapshot_git_object( - snapshot: Dict[str, Any], *, ignore_unresolved: bool = False -) -> bytes: - """Formats the git_object of a revision. 
See :func:`snapshot_identifier` for details - on the format.""" - unresolved = [] - lines = [] - - for name, target in sorted(snapshot["branches"].items()): - if not target: - target_type = b"dangling" - target_id = b"" - elif target["target_type"] == "alias": - target_type = b"alias" - target_id = target["target"] - if target_id not in snapshot["branches"] or target_id == name: - unresolved.append((name, target_id)) - else: - target_type = target["target_type"].encode() - target_id = identifier_to_bytes(target["target"]) - - lines.extend( - [ - target_type, - b"\x20", - name, - b"\x00", - ("%d:" % len(target_id)).encode(), - target_id, - ] - ) - - if unresolved and not ignore_unresolved: - raise ValueError( - "Branch aliases unresolved: %s" - % ", ".join("%r -> %r" % x for x in unresolved), - unresolved, - ) - - return format_git_object_from_parts("snapshot", lines) + See :func:`swh.model.git_objects.snapshot_git_object` for details of the + format used to generate this identifier.""" + return hash_to_hex(model.Snapshot.from_dict(snapshot).id) def origin_identifier(origin): - """Return the intrinsic identifier for an origin. - - An origin's identifier is the sha1 checksum of the entire origin URL - - """ - return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest() - - -def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: - """Return the intrinsic identifier for a RawExtrinsicMetadata object. 
- - A raw_extrinsic_metadata identifier is a salted sha1 (using the git - hashing algorithm with the ``raw_extrinsic_metadata`` object type) of - a manifest following the format:: - - target $ExtendedSwhid - discovery_date $Timestamp - authority $StrWithoutSpaces $IRI - fetcher $Str $Version - format $StrWithoutSpaces - origin $IRI <- optional - visit $IntInDecimal <- optional - snapshot $CoreSwhid <- optional - release $CoreSwhid <- optional - revision $CoreSwhid <- optional - path $Bytes <- optional - directory $CoreSwhid <- optional - - $MetadataBytes - - $IRI must be RFC 3987 IRIs (so they may contain newlines, that are escaped as - described below) - - $StrWithoutSpaces and $Version are ASCII strings, and may not contain spaces. - - $Str is an UTF-8 string. - - $CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`. - $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for - origins and 'emd' for raw extrinsic metadata) - - $Timestamp is a decimal representation of the rounded-down integer number of - seconds since the UNIX epoch (1970-01-01 00:00:00 UTC), - with no leading '0' (unless the timestamp value is zero) and no timezone. - It may be negative by prefixing it with a '-', which must not be followed - by a '0'. - - Newlines in $Bytes, $Str, and $Iri are escaped as with other git fields, - ie. by adding a space after them. - - Returns: - str: the intrinsic identifier for ``metadata`` - - """ - git_object = raw_extrinsic_metadata_git_object(metadata) - return hashlib.new("sha1", git_object).hexdigest() - - -def raw_extrinsic_metadata_git_object(metadata: Dict[str, Any]) -> bytes: - """Formats the git_object of a raw_extrinsic_metadata object. - See :func:`raw_extrinsic_metadata_identifier` for details - on the format.""" - # equivalent to using math.floor(dt.timestamp()) to round down, - # as int(dt.timestamp()) rounds toward zero, - # which would map two seconds on the 0 timestamp. 
- # - # This should never be an issue in practice as Software Heritage didn't - # start collecting metadata before 2015. - timestamp = ( - metadata["discovery_date"] - .astimezone(datetime.timezone.utc) - .replace(microsecond=0) - .timestamp() - ) - assert timestamp.is_integer() - - headers = [ - (b"target", str(metadata["target"]).encode()), - (b"discovery_date", str(int(timestamp)).encode("ascii")), - ( - b"authority", - f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(), - ), - ( - b"fetcher", - f"{metadata['fetcher']['name']} {metadata['fetcher']['version']}".encode(), - ), - (b"format", metadata["format"].encode()), - ] - - for key in ( - "origin", - "visit", - "snapshot", - "release", - "revision", - "path", - "directory", - ): - if metadata.get(key) is not None: - value: bytes - if key == "path": - value = metadata[key] - else: - value = str(metadata[key]).encode() - - headers.append((key.encode("ascii"), value)) - - return format_git_object_from_headers( - "raw_extrinsic_metadata", headers, metadata["metadata"] - ) - - -def extid_identifier(extid: Dict[str, Any]) -> str: - """Return the intrinsic identifier for an ExtID object. - - An ExtID identifier is a salted sha1 (using the git hashing algorithm with - the ``extid`` object type) of a manifest following the format: - - ``` - extid_type $StrWithoutSpaces - [extid_version $Str] - extid $Bytes - target $CoreSwhid - ``` - - $StrWithoutSpaces is an ASCII string, and may not contain spaces. - - Newlines in $Bytes are escaped as with other git fields, ie. by adding a - space after them. - - The extid_version line is only generated if the version is non-zero. - - Returns: - str: the intrinsic identifier for `extid` - + """Deprecated, use :class:`swh.model.Origin` instead: + ``origin_identifier(d)`` is equivalent to: + ``hash_to_hex(Origin(url=d["url"]).id)``. 
""" - headers = [ - (b"extid_type", extid["extid_type"].encode("ascii")), - ] - extid_version = extid.get("extid_version", 0) - if extid_version != 0: - headers.append((b"extid_version", str(extid_version).encode("ascii"))) - - headers.extend( - [(b"extid", extid["extid"]), (b"target", str(extid["target"]).encode("ascii")),] - ) - - git_object = format_git_object_from_headers("extid", headers) - return hashlib.new("sha1", git_object).hexdigest() - - -# type of the "object_type" attribute of the SWHID class; either -# ObjectType or ExtendedObjectType -_TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType) - -# the SWHID class itself (this is used so that X.from_string() can return X -# for all X subclass of _BaseSWHID) -_TSWHID = TypeVar("_TSWHID", bound="_BaseSWHID") - - -@attr.s(frozen=True, kw_only=True) -class _BaseSWHID(Generic[_TObjectType]): - """Common base class for CoreSWHID, QualifiedSWHID, and ExtendedSWHID. - - This is an "abstract" class and should not be instantiated directly; - it only exists to deduplicate code between these three SWHID classes.""" - - namespace = attr.ib(type=str, default=SWHID_NAMESPACE) - """the namespace of the identifier, defaults to ``swh``""" - - scheme_version = attr.ib(type=int, default=SWHID_VERSION) - """the scheme version of the identifier, defaults to 1""" - - # overridden by subclasses - object_type: _TObjectType - """the type of object the identifier points to""" - - object_id = attr.ib(type=bytes, validator=type_validator()) - """object's identifier""" - - @namespace.validator - def check_namespace(self, attribute, value): - if value != SWHID_NAMESPACE: - raise ValidationError( - "Invalid SWHID: invalid namespace: %(namespace)s", - params={"namespace": value}, - ) - - @scheme_version.validator - def check_scheme_version(self, attribute, value): - if value != SWHID_VERSION: - raise ValidationError( - "Invalid SWHID: invalid version: %(version)s", params={"version": value} - ) - - @object_id.validator - 
def check_object_id(self, attribute, value): - if len(value) != 20: - raise ValidationError( - "Invalid SWHID: invalid checksum: %(object_id)s", - params={"object_id": hash_to_hex(value)}, - ) - - def __str__(self) -> str: - return SWHID_SEP.join( - [ - self.namespace, - str(self.scheme_version), - self.object_type.value, - hash_to_hex(self.object_id), - ] - ) - - @classmethod - def from_string(cls: Type[_TSWHID], s: str) -> _TSWHID: - parts = _parse_swhid(s) - if parts.pop("qualifiers"): - raise ValidationError(f"{cls.__name__} does not support qualifiers.") - try: - return cls(**parts) - except ValueError as e: - raise ValidationError( - "ValueError: %(args)s", params={"args": e.args} - ) from None - - -@attr.s(frozen=True, kw_only=True) -class CoreSWHID(_BaseSWHID[ObjectType]): - """ - Dataclass holding the relevant info associated to a SoftWare Heritage - persistent IDentifier (SWHID). - - Unlike `QualifiedSWHID`, it is restricted to core SWHIDs, ie. SWHIDs - with no qualifiers. - - Raises: - swh.model.exceptions.ValidationError: In case of invalid object type or id - - To get the raw SWHID string from an instance of this class, - use the :func:`str` function: - - >>> swhid = CoreSWHID( - ... object_type=ObjectType.CONTENT, - ... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'), - ... ) - >>> str(swhid) - 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0' - - And vice-versa with :meth:`CoreSWHID.from_string`: - - >>> swhid == CoreSWHID.from_string( - ... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0" - ... ) - True - """ - - object_type = attr.ib( - type=ObjectType, validator=type_validator(), converter=ObjectType - ) - """the type of object the identifier points to""" - - def to_extended(self) -> ExtendedSWHID: - """Converts this CoreSWHID into an ExtendedSWHID. 
- - As ExtendedSWHID is a superset of CoreSWHID, this is lossless.""" - return ExtendedSWHID( - namespace=self.namespace, - scheme_version=self.scheme_version, - object_type=ExtendedObjectType(self.object_type.value), - object_id=self.object_id, - ) - - -def _parse_core_swhid(swhid: Union[str, CoreSWHID, None]) -> Optional[CoreSWHID]: - if swhid is None or isinstance(swhid, CoreSWHID): - return swhid - else: - return CoreSWHID.from_string(swhid) - - -def _parse_lines_qualifier( - lines: Union[str, Tuple[int, Optional[int]], None] -) -> Optional[Tuple[int, Optional[int]]]: - try: - if lines is None or isinstance(lines, tuple): - return lines - elif "-" in lines: - (from_, to) = lines.split("-", 2) - return (int(from_), int(to)) - else: - return (int(lines), None) - except ValueError: - raise ValidationError( - "Invalid format for the lines qualifier: %(lines)s", params={"lines": lines} - ) - - -def _parse_path_qualifier(path: Union[str, bytes, None]) -> Optional[bytes]: - if path is None or isinstance(path, bytes): - return path - else: - return urllib.parse.unquote_to_bytes(path) - - -@attr.s(frozen=True, kw_only=True) -class QualifiedSWHID(_BaseSWHID[ObjectType]): - """ - Dataclass holding the relevant info associated to a SoftWare Heritage - persistent IDentifier (SWHID) - - Raises: - swh.model.exceptions.ValidationError: In case of invalid object type or id - - To get the raw SWHID string from an instance of this class, - use the :func:`str` function: - - >>> swhid = QualifiedSWHID( - ... object_type=ObjectType.CONTENT, - ... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'), - ... lines=(5, 10), - ... ) - >>> str(swhid) - 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0;lines=5-10' - - And vice-versa with :meth:`QualifiedSWHID.from_string`: - - >>> swhid == QualifiedSWHID.from_string( - ... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0;lines=5-10" - ... 
) - True - """ - - object_type = attr.ib( - type=ObjectType, validator=type_validator(), converter=ObjectType - ) - """the type of object the identifier points to""" - - # qualifiers: - - origin = attr.ib(type=Optional[str], default=None, validator=type_validator()) - """the software origin where an object has been found or observed in the wild, - as an URI""" - - visit = attr.ib(type=Optional[CoreSWHID], default=None, converter=_parse_core_swhid) - """the core identifier of a snapshot corresponding to a specific visit - of a repository containing the designated object""" - - anchor = attr.ib( - type=Optional[CoreSWHID], - default=None, - validator=type_validator(), - converter=_parse_core_swhid, - ) - """a designated node in the Merkle DAG relative to which a path to the object - is specified, as the core identifier of a directory, a revision, a release, - or a snapshot""" - - path = attr.ib( - type=Optional[bytes], - default=None, - validator=type_validator(), - converter=_parse_path_qualifier, - ) - """the absolute file path, from the root directory associated to the anchor node, - to the object; when the anchor denotes a directory or a revision, and almost always - when it’s a release, the root directory is uniquely determined; - when the anchor denotes a snapshot, the root directory is the one pointed to by HEAD - (possibly indirectly), and undefined if such a reference is missing""" - - lines = attr.ib( - type=Optional[Tuple[int, Optional[int]]], - default=None, - validator=type_validator(), - converter=_parse_lines_qualifier, - ) - """lines: line number(s) of interest, usually within a content object""" - - @visit.validator - def check_visit(self, attribute, value): - if value and value.object_type != ObjectType.SNAPSHOT: - raise ValidationError( - "The 'visit' qualifier must be a 'snp' SWHID, not '%(type)s'", - params={"type": value.object_type.value}, - ) - - @anchor.validator - def check_anchor(self, attribute, value): - if value and value.object_type not 
in ( - ObjectType.DIRECTORY, - ObjectType.REVISION, - ObjectType.RELEASE, - ObjectType.SNAPSHOT, - ): - raise ValidationError( - "The 'visit' qualifier must be a 'dir', 'rev', 'rel', or 'snp' SWHID, " - "not '%s(type)s'", - params={"type": value.object_type.value}, - ) - - def qualifiers(self) -> Dict[str, str]: - origin = self.origin - if origin: - unescaped_origin = origin - origin = origin.replace(";", "%3B") - assert urllib.parse.unquote_to_bytes( - origin - ) == urllib.parse.unquote_to_bytes( - unescaped_origin - ), "Escaping ';' in the origin qualifier corrupted the origin URL." - - d: Dict[str, Optional[str]] = { - "origin": origin, - "visit": str(self.visit) if self.visit else None, - "anchor": str(self.anchor) if self.anchor else None, - "path": ( - urllib.parse.quote_from_bytes(self.path) - if self.path is not None - else None - ), - "lines": ( - "-".join(str(line) for line in self.lines if line is not None) - if self.lines - else None - ), - } - return {k: v for (k, v) in d.items() if v is not None} - - def __str__(self) -> str: - swhid = SWHID_SEP.join( - [ - self.namespace, - str(self.scheme_version), - self.object_type.value, - hash_to_hex(self.object_id), - ] - ) - qualifiers = self.qualifiers() - if qualifiers: - for k, v in qualifiers.items(): - swhid += "%s%s=%s" % (SWHID_CTXT_SEP, k, v) - return swhid - - @classmethod - def from_string(cls, s: str) -> QualifiedSWHID: - parts = _parse_swhid(s) - qualifiers = parts.pop("qualifiers") - invalid_qualifiers = set(qualifiers) - SWHID_QUALIFIERS - if invalid_qualifiers: - raise ValidationError( - "Invalid qualifier(s): %(qualifiers)s", - params={"qualifiers": ", ".join(invalid_qualifiers)}, - ) - try: - return QualifiedSWHID(**parts, **qualifiers) - except ValueError as e: - raise ValidationError( - "ValueError: %(args)s", params={"args": e.args} - ) from None - - -@attr.s(frozen=True, kw_only=True) -class ExtendedSWHID(_BaseSWHID[ExtendedObjectType]): - """ - Dataclass holding the relevant info 
associated to a SoftWare Heritage - persistent IDentifier (SWHID). - - It extends `CoreSWHID`, by allowing non-standard object types; and should - only be used internally to Software Heritage. - - Raises: - swh.model.exceptions.ValidationError: In case of invalid object type or id - - To get the raw SWHID string from an instance of this class, - use the :func:`str` function: - - >>> swhid = ExtendedSWHID( - ... object_type=ExtendedObjectType.CONTENT, - ... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'), - ... ) - >>> str(swhid) - 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0' - - And vice-versa with :meth:`CoreSWHID.from_string`: - - >>> swhid == ExtendedSWHID.from_string( - ... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0" - ... ) - True - """ - - object_type = attr.ib( - type=ExtendedObjectType, - validator=type_validator(), - converter=ExtendedObjectType, - ) - """the type of object the identifier points to""" - - -def _parse_swhid(swhid: str) -> Dict[str, Any]: - """Parse a Software Heritage identifier (SWHID) from string (see: - :ref:`persistent-identifiers`.) - - This is for internal use; use :meth:`CoreSWHID.from_string`, - :meth:`QualifiedSWHID.from_string`, or :meth:`ExtendedSWHID.from_string` instead, - as they perform validation and build a dataclass. 
- - Args: - swhid (str): A persistent identifier - - Raises: - swh.model.exceptions.ValidationError: if passed string is not a valid SWHID - - """ - m = SWHID_RE.fullmatch(swhid) - if not m: - raise ValidationError( - "Invalid SWHID: invalid syntax: %(swhid)s", params={"swhid": swhid} - ) - parts: Dict[str, Any] = m.groupdict() - - qualifiers_raw = parts["qualifiers"] - parts["qualifiers"] = {} - if qualifiers_raw: - for qualifier in qualifiers_raw.split(SWHID_CTXT_SEP): - try: - k, v = qualifier.split("=", maxsplit=1) - parts["qualifiers"][k] = v - except ValueError: - raise ValidationError( - "Invalid SWHID: invalid qualifier: %(qualifier)s", - params={"qualifier": qualifier}, - ) - - parts["scheme_version"] = int(parts["scheme_version"]) - parts["object_id"] = hash_to_bytes(parts["object_id"]) - return parts + return hash_to_hex(model.Origin.from_dict(origin).id) diff --git a/swh/model/model.py b/swh/model/model.py index ee1be6897b8ee0dcf7a71cc5086845678ae3a55a..0167eae597af309533cee10c7f06433a97448424 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -3,10 +3,22 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +""" +Implementation of Software Heritage's data model + +See :ref:`data-model` for an overview of the data model. + +The classes defined in this module are immutable +`attrs objects <https://attrs.org/>`__ and enums. + +All classes define a ``from_dict`` class method and a ``to_dict`` +method to convert between them and msgpack-serializable objects. +""" + from abc import ABCMeta, abstractmethod import datetime from enum import Enum -from hashlib import sha256 +import hashlib from typing import Any, Dict, Iterable, Optional, Tuple, TypeVar, Union import attr @@ -15,22 +27,13 @@ import dateutil.parser import iso8601 from typing_extensions import Final +from . 
import git_objects from .collections import ImmutableDict -from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes -from .identifiers import ( - directory_identifier, - extid_identifier, - normalize_timestamp, - origin_identifier, - raw_extrinsic_metadata_identifier, - release_identifier, - revision_identifier, - snapshot_identifier, -) -from .identifiers import CoreSWHID -from .identifiers import ExtendedObjectType as SwhidExtendedObjectType -from .identifiers import ExtendedSWHID -from .identifiers import ObjectType as SwhidObjectType +from .hashutil import DEFAULT_ALGORITHMS, MultiHash +from .swhids import CoreSWHID +from .swhids import ExtendedObjectType as SwhidExtendedObjectType +from .swhids import ExtendedSWHID +from .swhids import ObjectType as SwhidObjectType class MissingData(Exception): @@ -193,7 +196,29 @@ class Person(BaseModel): Anonymization is simply a Person which fullname is the hashed, with unset name or email. """ - return Person(fullname=sha256(self.fullname).digest(), name=None, email=None,) + return Person( + fullname=hashlib.sha256(self.fullname).digest(), name=None, email=None, + ) + + @classmethod + def from_dict(cls, d): + """ + If the fullname is missing, construct a fullname + using the following heuristics: if the name value is None, we return the + email in angle brackets, else, we return the name, a space, and the email + in angle brackets. 
+ """ + if "fullname" not in d: + parts = [] + if d["name"] is not None: + parts.append(d["name"]) + if d["email"] is not None: + parts.append(b"".join([b"<", d["email"], b">"])) + + fullname = b" ".join(parts) + d = {**d, "fullname": fullname} + d = {"name": None, "email": None, **d} + return super().from_dict(d) @attr.s(frozen=True, slots=True) @@ -243,16 +268,60 @@ class TimestampWithTimezone(BaseModel): raise ValueError("negative_utc can only be True is offset=0") @classmethod - def from_dict(cls, obj: Union[Dict, datetime.datetime, int]): + def from_dict(cls, time_representation: Union[Dict, datetime.datetime, int]): """Builds a TimestampWithTimezone from any of the formats accepted by :func:`swh.model.normalize_timestamp`.""" # TODO: this accept way more types than just dicts; find a better # name - d = normalize_timestamp(obj) + negative_utc = False + + if isinstance(time_representation, dict): + ts = time_representation["timestamp"] + if isinstance(ts, dict): + seconds = ts.get("seconds", 0) + microseconds = ts.get("microseconds", 0) + elif isinstance(ts, int): + seconds = ts + microseconds = 0 + else: + raise ValueError( + f"TimestampWithTimezone.from_dict received non-integer timestamp " + f"member {ts!r}" + ) + offset = time_representation["offset"] + if "negative_utc" in time_representation: + negative_utc = time_representation["negative_utc"] + if negative_utc is None: + negative_utc = False + elif isinstance(time_representation, datetime.datetime): + microseconds = time_representation.microsecond + if microseconds: + time_representation = time_representation.replace(microsecond=0) + seconds = int(time_representation.timestamp()) + utcoffset = time_representation.utcoffset() + if utcoffset is None: + raise ValueError( + f"TimestampWithTimezone.from_dict received datetime without " + f"timezone: {time_representation}" + ) + + # utcoffset is an integer number of minutes + seconds_offset = utcoffset.total_seconds() + offset = int(seconds_offset) // 60 + 
elif isinstance(time_representation, int): + seconds = time_representation + microseconds = 0 + offset = 0 + else: + raise ValueError( + f"TimestampWithTimezone.from_dict received non-integer timestamp: " + f"{time_representation!r}" + ) + return cls( - timestamp=Timestamp.from_dict(d["timestamp"]), - offset=d["offset"], - negative_utc=d["negative_utc"], + timestamp=Timestamp(seconds=seconds, microseconds=microseconds), + offset=offset, + negative_utc=negative_utc, ) @classmethod @@ -286,21 +355,25 @@ class TimestampWithTimezone(BaseModel): @attr.s(frozen=True, slots=True) -class Origin(BaseModel): +class Origin(HashableObject, BaseModel): """Represents a software source: a VCS and an URL.""" object_type: Final = "origin" url = attr.ib(type=str, validator=type_validator()) + id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"") + def unique_key(self) -> KeyType: return {"url": self.url} + def compute_hash(self) -> bytes: + return hashlib.sha1(self.url.encode("utf-8")).digest() + def swhid(self) -> ExtendedSWHID: """Returns a SWHID representing this origin.""" return ExtendedSWHID( - object_type=SwhidExtendedObjectType.ORIGIN, - object_id=hash_to_bytes(origin_identifier(self.unique_key())), + object_type=SwhidExtendedObjectType.ORIGIN, object_id=self.id, ) @@ -431,7 +504,8 @@ class Snapshot(HashableObject, BaseModel): id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"") def compute_hash(self) -> bytes: - return hash_to_bytes(snapshot_identifier(self.to_dict())) + git_object = git_objects.snapshot_git_object(self) + return hashlib.new("sha1", git_object).digest() @classmethod def from_dict(cls, d): @@ -471,7 +545,8 @@ class Release(HashableObject, BaseModel): id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"") def compute_hash(self) -> bytes: - return hash_to_bytes(release_identifier(self.to_dict())) + git_object = git_objects.release_git_object(self) + return hashlib.new("sha1", git_object).digest() @author.validator def 
check_author(self, attribute, value): @@ -514,6 +589,7 @@ class RevisionType(Enum): SUBVERSION = "svn" MERCURIAL = "hg" CVS = "cvs" + BAZAAR = "bzr" def tuplify_extra_headers(value: Iterable): @@ -564,7 +640,8 @@ class Revision(HashableObject, BaseModel): object.__setattr__(self, "metadata", metadata) def compute_hash(self) -> bytes: - return hash_to_bytes(revision_identifier(self.to_dict())) + git_object = git_objects.revision_git_object(self) + return hashlib.new("sha1", git_object).digest() @classmethod def from_dict(cls, d): @@ -621,7 +698,8 @@ class Directory(HashableObject, BaseModel): id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"") def compute_hash(self) -> bytes: - return hash_to_bytes(directory_identifier(self.to_dict())) + git_object = git_objects.directory_git_object(self) + return hashlib.new("sha1", git_object).digest() @classmethod def from_dict(cls, d): @@ -950,7 +1028,8 @@ class RawExtrinsicMetadata(HashableObject, BaseModel): id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"") def compute_hash(self) -> bytes: - return hash_to_bytes(raw_extrinsic_metadata_identifier(self.to_dict())) + git_object = git_objects.raw_extrinsic_metadata_git_object(self) + return hashlib.new("sha1", git_object).digest() @origin.validator def check_origin(self, attribute, value): @@ -1150,4 +1229,5 @@ class ExtID(HashableObject, BaseModel): ) def compute_hash(self) -> bytes: - return hash_to_bytes(extid_identifier(self.to_dict())) + git_object = git_objects.extid_git_object(self) + return hashlib.new("sha1", git_object).digest() diff --git a/swh/model/swhids.py b/swh/model/swhids.py new file mode 100644 index 0000000000000000000000000000000000000000..ee1be2004376e318f907e5f8c4623653be5f4716 --- /dev/null +++ b/swh/model/swhids.py @@ -0,0 +1,457 @@ +# Copyright (C) 2015-2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any 
later version +# See top-level LICENSE file for more information + +""" +Classes to represent :ref:`SWH persistend IDentifiers <persistent-identifiers>`. + +:class:`CoreSWHID` represents a SWHID with no qualifier, and :class:`QualifiedSWHID` +represents a SWHID that may have qualifiers. +:class:`ExtendedSWHID` extends the definition of SWHID to other object types, +and is used internally in Software Heritage; it does not support qualifiers. +""" + +from __future__ import annotations + +import enum +import re +from typing import Any, Dict, Generic, Optional, Tuple, Type, TypeVar, Union +import urllib.parse + +import attr +from attrs_strict import type_validator + +from .exceptions import ValidationError +from .hashutil import hash_to_bytes, hash_to_hex + + +class ObjectType(enum.Enum): + """Possible object types of a QualifiedSWHID or CoreSWHID. + + The values of each variant is what is used in the SWHID's string representation.""" + + SNAPSHOT = "snp" + REVISION = "rev" + RELEASE = "rel" + DIRECTORY = "dir" + CONTENT = "cnt" + + +class ExtendedObjectType(enum.Enum): + """Possible object types of an ExtendedSWHID. + + The variants are a superset of :class:`ObjectType`'s""" + + SNAPSHOT = "snp" + REVISION = "rev" + RELEASE = "rel" + DIRECTORY = "dir" + CONTENT = "cnt" + ORIGIN = "ori" + RAW_EXTRINSIC_METADATA = "emd" + + +SWHID_NAMESPACE = "swh" +SWHID_VERSION = 1 +SWHID_TYPES = ["snp", "rel", "rev", "dir", "cnt"] +EXTENDED_SWHID_TYPES = SWHID_TYPES + ["ori", "emd"] +SWHID_SEP = ":" +SWHID_CTXT_SEP = ";" +SWHID_QUALIFIERS = {"origin", "anchor", "visit", "path", "lines"} + +SWHID_RE_RAW = ( + f"(?P<namespace>{SWHID_NAMESPACE})" + f"{SWHID_SEP}(?P<scheme_version>{SWHID_VERSION})" + f"{SWHID_SEP}(?P<object_type>{'|'.join(EXTENDED_SWHID_TYPES)})" + f"{SWHID_SEP}(?P<object_id>[0-9a-f]{{40}})" + f"({SWHID_CTXT_SEP}(?P<qualifiers>\\S+))?" 
+) +SWHID_RE = re.compile(SWHID_RE_RAW) + + +# type of the "object_type" attribute of the SWHID class; either +# ObjectType or ExtendedObjectType +_TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType) + +# the SWHID class itself (this is used so that X.from_string() can return X +# for all X subclass of _BaseSWHID) +_TSWHID = TypeVar("_TSWHID", bound="_BaseSWHID") + + +@attr.s(frozen=True, kw_only=True) +class _BaseSWHID(Generic[_TObjectType]): + """Common base class for CoreSWHID, QualifiedSWHID, and ExtendedSWHID. + + This is an "abstract" class and should not be instantiated directly; + it only exists to deduplicate code between these three SWHID classes.""" + + namespace = attr.ib(type=str, default=SWHID_NAMESPACE) + """the namespace of the identifier, defaults to ``swh``""" + + scheme_version = attr.ib(type=int, default=SWHID_VERSION) + """the scheme version of the identifier, defaults to 1""" + + # overridden by subclasses + object_type: _TObjectType + """the type of object the identifier points to""" + + object_id = attr.ib(type=bytes, validator=type_validator()) + """object's identifier""" + + @namespace.validator + def check_namespace(self, attribute, value): + if value != SWHID_NAMESPACE: + raise ValidationError( + "Invalid SWHID: invalid namespace: %(namespace)s", + params={"namespace": value}, + ) + + @scheme_version.validator + def check_scheme_version(self, attribute, value): + if value != SWHID_VERSION: + raise ValidationError( + "Invalid SWHID: invalid version: %(version)s", params={"version": value} + ) + + @object_id.validator + def check_object_id(self, attribute, value): + if len(value) != 20: + raise ValidationError( + "Invalid SWHID: invalid checksum: %(object_id)s", + params={"object_id": hash_to_hex(value)}, + ) + + def __str__(self) -> str: + return SWHID_SEP.join( + [ + self.namespace, + str(self.scheme_version), + self.object_type.value, + hash_to_hex(self.object_id), + ] + ) + + @classmethod + def from_string(cls: 
Type[_TSWHID], s: str) -> _TSWHID: + parts = _parse_swhid(s) + if parts.pop("qualifiers"): + raise ValidationError(f"{cls.__name__} does not support qualifiers.") + try: + return cls(**parts) + except ValueError as e: + raise ValidationError( + "ValueError: %(args)s", params={"args": e.args} + ) from None + + +@attr.s(frozen=True, kw_only=True) +class CoreSWHID(_BaseSWHID[ObjectType]): + """ + Dataclass holding the relevant info associated to a SoftWare Heritage + persistent IDentifier (SWHID). + + Unlike `QualifiedSWHID`, it is restricted to core SWHIDs, ie. SWHIDs + with no qualifiers. + + Raises: + swh.model.exceptions.ValidationError: In case of invalid object type or id + + To get the raw SWHID string from an instance of this class, + use the :func:`str` function: + + >>> swhid = CoreSWHID( + ... object_type=ObjectType.CONTENT, + ... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'), + ... ) + >>> str(swhid) + 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0' + + And vice-versa with :meth:`CoreSWHID.from_string`: + + >>> swhid == CoreSWHID.from_string( + ... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0" + ... ) + True + """ + + object_type = attr.ib( + type=ObjectType, validator=type_validator(), converter=ObjectType + ) + """the type of object the identifier points to""" + + def to_extended(self) -> ExtendedSWHID: + """Converts this CoreSWHID into an ExtendedSWHID. 
+ + As ExtendedSWHID is a superset of CoreSWHID, this is lossless.""" + return ExtendedSWHID( + namespace=self.namespace, + scheme_version=self.scheme_version, + object_type=ExtendedObjectType(self.object_type.value), + object_id=self.object_id, + ) + + +def _parse_core_swhid(swhid: Union[str, CoreSWHID, None]) -> Optional[CoreSWHID]: + if swhid is None or isinstance(swhid, CoreSWHID): + return swhid + else: + return CoreSWHID.from_string(swhid) + + +def _parse_lines_qualifier( + lines: Union[str, Tuple[int, Optional[int]], None] +) -> Optional[Tuple[int, Optional[int]]]: + try: + if lines is None or isinstance(lines, tuple): + return lines + elif "-" in lines: + (from_, to) = lines.split("-", 2) + return (int(from_), int(to)) + else: + return (int(lines), None) + except ValueError: + raise ValidationError( + "Invalid format for the lines qualifier: %(lines)s", params={"lines": lines} + ) + + +def _parse_path_qualifier(path: Union[str, bytes, None]) -> Optional[bytes]: + if path is None or isinstance(path, bytes): + return path + else: + return urllib.parse.unquote_to_bytes(path) + + +@attr.s(frozen=True, kw_only=True) +class QualifiedSWHID(_BaseSWHID[ObjectType]): + """ + Dataclass holding the relevant info associated to a SoftWare Heritage + persistent IDentifier (SWHID) + + Raises: + swh.model.exceptions.ValidationError: In case of invalid object type or id + + To get the raw SWHID string from an instance of this class, + use the :func:`str` function: + + >>> swhid = QualifiedSWHID( + ... object_type=ObjectType.CONTENT, + ... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'), + ... lines=(5, 10), + ... ) + >>> str(swhid) + 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0;lines=5-10' + + And vice-versa with :meth:`QualifiedSWHID.from_string`: + + >>> swhid == QualifiedSWHID.from_string( + ... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0;lines=5-10" + ... 
) + True + """ + + object_type = attr.ib( + type=ObjectType, validator=type_validator(), converter=ObjectType + ) + """the type of object the identifier points to""" + + # qualifiers: + + origin = attr.ib(type=Optional[str], default=None, validator=type_validator()) + """the software origin where an object has been found or observed in the wild, + as an URI""" + + visit = attr.ib(type=Optional[CoreSWHID], default=None, converter=_parse_core_swhid) + """the core identifier of a snapshot corresponding to a specific visit + of a repository containing the designated object""" + + anchor = attr.ib( + type=Optional[CoreSWHID], + default=None, + validator=type_validator(), + converter=_parse_core_swhid, + ) + """a designated node in the Merkle DAG relative to which a path to the object + is specified, as the core identifier of a directory, a revision, a release, + or a snapshot""" + + path = attr.ib( + type=Optional[bytes], + default=None, + validator=type_validator(), + converter=_parse_path_qualifier, + ) + """the absolute file path, from the root directory associated to the anchor node, + to the object; when the anchor denotes a directory or a revision, and almost always + when it’s a release, the root directory is uniquely determined; + when the anchor denotes a snapshot, the root directory is the one pointed to by HEAD + (possibly indirectly), and undefined if such a reference is missing""" + + lines = attr.ib( + type=Optional[Tuple[int, Optional[int]]], + default=None, + validator=type_validator(), + converter=_parse_lines_qualifier, + ) + """lines: line number(s) of interest, usually within a content object""" + + @visit.validator + def check_visit(self, attribute, value): + if value and value.object_type != ObjectType.SNAPSHOT: + raise ValidationError( + "The 'visit' qualifier must be a 'snp' SWHID, not '%(type)s'", + params={"type": value.object_type.value}, + ) + + @anchor.validator + def check_anchor(self, attribute, value): + if value and value.object_type not 
in ( + ObjectType.DIRECTORY, + ObjectType.REVISION, + ObjectType.RELEASE, + ObjectType.SNAPSHOT, + ): + raise ValidationError( + "The 'visit' qualifier must be a 'dir', 'rev', 'rel', or 'snp' SWHID, " + "not '%s(type)s'", + params={"type": value.object_type.value}, + ) + + def qualifiers(self) -> Dict[str, str]: + origin = self.origin + if origin: + unescaped_origin = origin + origin = origin.replace(";", "%3B") + assert urllib.parse.unquote_to_bytes( + origin + ) == urllib.parse.unquote_to_bytes( + unescaped_origin + ), "Escaping ';' in the origin qualifier corrupted the origin URL." + + d: Dict[str, Optional[str]] = { + "origin": origin, + "visit": str(self.visit) if self.visit else None, + "anchor": str(self.anchor) if self.anchor else None, + "path": ( + urllib.parse.quote_from_bytes(self.path) + if self.path is not None + else None + ), + "lines": ( + "-".join(str(line) for line in self.lines if line is not None) + if self.lines + else None + ), + } + return {k: v for (k, v) in d.items() if v is not None} + + def __str__(self) -> str: + swhid = SWHID_SEP.join( + [ + self.namespace, + str(self.scheme_version), + self.object_type.value, + hash_to_hex(self.object_id), + ] + ) + qualifiers = self.qualifiers() + if qualifiers: + for k, v in qualifiers.items(): + swhid += "%s%s=%s" % (SWHID_CTXT_SEP, k, v) + return swhid + + @classmethod + def from_string(cls, s: str) -> QualifiedSWHID: + parts = _parse_swhid(s) + qualifiers = parts.pop("qualifiers") + invalid_qualifiers = set(qualifiers) - SWHID_QUALIFIERS + if invalid_qualifiers: + raise ValidationError( + "Invalid qualifier(s): %(qualifiers)s", + params={"qualifiers": ", ".join(invalid_qualifiers)}, + ) + try: + return QualifiedSWHID(**parts, **qualifiers) + except ValueError as e: + raise ValidationError( + "ValueError: %(args)s", params={"args": e.args} + ) from None + + +@attr.s(frozen=True, kw_only=True) +class ExtendedSWHID(_BaseSWHID[ExtendedObjectType]): + """ + Dataclass holding the relevant info 
associated to a SoftWare Heritage + persistent IDentifier (SWHID). + + It extends `CoreSWHID`, by allowing non-standard object types; and should + only be used internally to Software Heritage. + + Raises: + swh.model.exceptions.ValidationError: In case of invalid object type or id + + To get the raw SWHID string from an instance of this class, + use the :func:`str` function: + + >>> swhid = ExtendedSWHID( + ... object_type=ExtendedObjectType.CONTENT, + ... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'), + ... ) + >>> str(swhid) + 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0' + + And vice-versa with :meth:`CoreSWHID.from_string`: + + >>> swhid == ExtendedSWHID.from_string( + ... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0" + ... ) + True + """ + + object_type = attr.ib( + type=ExtendedObjectType, + validator=type_validator(), + converter=ExtendedObjectType, + ) + """the type of object the identifier points to""" + + +def _parse_swhid(swhid: str) -> Dict[str, Any]: + """Parse a Software Heritage identifier (SWHID) from string (see: + :ref:`persistent-identifiers`.) + + This is for internal use; use :meth:`CoreSWHID.from_string`, + :meth:`QualifiedSWHID.from_string`, or :meth:`ExtendedSWHID.from_string` instead, + as they perform validation and build a dataclass. 
+ + Args: + swhid (str): A persistent identifier + + Raises: + swh.model.exceptions.ValidationError: if passed string is not a valid SWHID + + """ + m = SWHID_RE.fullmatch(swhid) + if not m: + raise ValidationError( + "Invalid SWHID: invalid syntax: %(swhid)s", params={"swhid": swhid} + ) + parts: Dict[str, Any] = m.groupdict() + + qualifiers_raw = parts["qualifiers"] + parts["qualifiers"] = {} + if qualifiers_raw: + for qualifier in qualifiers_raw.split(SWHID_CTXT_SEP): + try: + k, v = qualifier.split("=", maxsplit=1) + parts["qualifiers"][k] = v + except ValueError: + raise ValidationError( + "Invalid SWHID: invalid qualifier: %(qualifier)s", + params={"qualifier": qualifier}, + ) + + parts["scheme_version"] = int(parts["scheme_version"]) + parts["object_id"] = hash_to_bytes(parts["object_id"]) + return parts diff --git a/swh/model/tests/swh_model_data.py b/swh/model/tests/swh_model_data.py index 1f5dded97c45274e19f48e5d277254ba4c1ab0da..d920c1e779c00e58692c4697849e184696b0e7e6 100644 --- a/swh/model/tests/swh_model_data.py +++ b/swh/model/tests/swh_model_data.py @@ -9,7 +9,6 @@ from typing import Dict, Sequence import attr from swh.model.hashutil import MultiHash, hash_to_bytes -from swh.model.identifiers import ExtendedSWHID from swh.model.model import ( BaseModel, Content, @@ -35,6 +34,7 @@ from swh.model.model import ( Timestamp, TimestampWithTimezone, ) +from swh.model.swhids import ExtendedSWHID UTC = datetime.timezone.utc diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index 01d2b70bf2ef8b04dbefb3873424dff553db3088..188c584ae97dd9452d6bbef2b703574022356012 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -3,27 +3,25 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import binascii import datetime import hashlib -import itertools from typing import Dict import unittest -import attr import pytest -from 
swh.model import hashutil, identifiers -from swh.model.exceptions import ValidationError +from swh.model import git_objects, hashutil from swh.model.hashutil import hash_to_bytes as _x -from swh.model.identifiers import ( - SWHID_QUALIFIERS, - CoreSWHID, - ExtendedObjectType, - ExtendedSWHID, - ObjectType, - QualifiedSWHID, - normalize_timestamp, +from swh.model.model import ( + Content, + Directory, + ExtID, + Origin, + RawExtrinsicMetadata, + Release, + Revision, + Snapshot, + TimestampWithTimezone, ) @@ -35,43 +33,6 @@ def remove_id(d: Dict) -> Dict: return d -class UtilityFunctionsIdentifier(unittest.TestCase): - def setUp(self): - self.str_id = "c2e41aae41ac17bd4a650770d6ee77f62e52235b" - self.bytes_id = binascii.unhexlify(self.str_id) - self.bad_type_id = object() - - def test_identifier_to_bytes(self): - for id in [self.str_id, self.bytes_id]: - self.assertEqual(identifiers.identifier_to_bytes(id), self.bytes_id) - - # wrong length - with self.assertRaises(ValueError) as cm: - identifiers.identifier_to_bytes(id[:-2]) - - self.assertIn("length", str(cm.exception)) - - with self.assertRaises(ValueError) as cm: - identifiers.identifier_to_bytes(self.bad_type_id) - - self.assertIn("type", str(cm.exception)) - - def test_identifier_to_str(self): - for id in [self.str_id, self.bytes_id]: - self.assertEqual(identifiers.identifier_to_str(id), self.str_id) - - # wrong length - with self.assertRaises(ValueError) as cm: - identifiers.identifier_to_str(id[:-2]) - - self.assertIn("length", str(cm.exception)) - - with self.assertRaises(ValueError) as cm: - identifiers.identifier_to_str(self.bad_type_id) - - self.assertIn("type", str(cm.exception)) - - class UtilityFunctionsDateOffset(unittest.TestCase): def setUp(self): self.dates = { @@ -79,9 +40,6 @@ class UtilityFunctionsDateOffset(unittest.TestCase): b"1448210036.002342": {"seconds": 1448210036, "microseconds": 2342,}, b"1448210036.12": {"seconds": 1448210036, "microseconds": 120000,}, } - self.broken_dates = [ - 
1448210036.12, - ] self.offsets = { 0: b"+0000", @@ -91,16 +49,11 @@ class UtilityFunctionsDateOffset(unittest.TestCase): def test_format_date(self): for date_repr, date in self.dates.items(): - self.assertEqual(identifiers.format_date(date), date_repr) - - def test_format_date_fail(self): - for date in self.broken_dates: - with self.assertRaises(ValueError): - identifiers.format_date(date) + self.assertEqual(git_objects.format_date(date), date_repr) def test_format_offset(self): for offset, res in self.offsets.items(): - self.assertEqual(identifiers.format_offset(offset), res) + self.assertEqual(git_objects.format_offset(offset), res) content_example = { @@ -117,12 +70,12 @@ class ContentIdentifier(unittest.TestCase): def test_content_identifier(self): self.assertEqual( - identifiers.content_identifier(content_example), self.content_id + Content.from_data(content_example["data"]).hashes(), self.content_id ) directory_example = { - "id": "d7ed3d2c31d608823be58b1cbe57605310615231", + "id": _x("d7ed3d2c31d608823be58b1cbe57605310615231"), "entries": [ { "type": "file", @@ -231,8 +184,6 @@ directory_example = { ], } -dummy_qualifiers = {"origin": "https://example.com", "lines": "42"} - class DirectoryIdentifier(unittest.TestCase): def setUp(self): @@ -244,32 +195,29 @@ class DirectoryIdentifier(unittest.TestCase): } def test_dir_identifier(self): + self.assertEqual(Directory.from_dict(self.directory).id, self.directory["id"]) self.assertEqual( - identifiers.directory_identifier(self.directory), self.directory["id"] - ) - self.assertEqual( - identifiers.directory_identifier(remove_id(self.directory)), - self.directory["id"], + Directory.from_dict(remove_id(self.directory)).id, self.directory["id"], ) def test_dir_identifier_entry_order(self): # Reverse order of entries, check the id is still the same. 
directory = {"entries": reversed(self.directory["entries"])} self.assertEqual( - identifiers.directory_identifier(remove_id(directory)), self.directory["id"] + Directory.from_dict(remove_id(directory)).id, self.directory["id"], ) def test_dir_identifier_empty_directory(self): self.assertEqual( - identifiers.directory_identifier(remove_id(self.empty_directory)), - self.empty_directory["id"], + Directory.from_dict(remove_id(self.empty_directory)).id, + _x(self.empty_directory["id"]), ) linus_tz = datetime.timezone(datetime.timedelta(minutes=-420)) revision_example = { - "id": "bc0195aad0daa2ad5b0d76cce22b167bc3435590", + "id": _x("bc0195aad0daa2ad5b0d76cce22b167bc3435590"), "directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"), "parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")], "author": { @@ -314,7 +262,7 @@ dg1KdHOa34shrKDaOVzW self.revision = revision_example self.revision_none_metadata = { - "id": "bc0195aad0daa2ad5b0d76cce22b167bc3435590", + "id": _x("bc0195aad0daa2ad5b0d76cce22b167bc3435590"), "directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"), "parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")], "author": { @@ -330,12 +278,13 @@ dg1KdHOa34shrKDaOVzW 2015, 7, 12, 15, 10, 30, tzinfo=linus_tz ), "message": b"Linux 4.2-rc2\n", + "type": "git", + "synthetic": False, "metadata": None, } self.synthetic_revision = { - "id": b"\xb2\xa7\xe1&\x04\x92\xe3D\xfa\xb3\xcb\xf9\x1b\xc1<\x91" - b"\xe0T&\xfd", + "id": _x("b2a7e1260492e344fab3cbf91bc13c91e05426fd"), "author": { "name": b"Software Heritage", "email": b"robot@softwareheritage.org", @@ -352,10 +301,9 @@ dg1KdHOa34shrKDaOVzW }, "committer_date": 1437047495, "synthetic": True, - "parents": [None], + "parents": [], "message": b"synthetic revision message\n", - "directory": b"\xd1\x1f\x00\xa6\xa0\xfe\xa6\x05SA\xd2U\x84\xb5\xa9" - b"e\x16\xc0\xd2\xb8", + "directory": _x("d11f00a6a0fea6055341d25584b5a96516c0d2b8"), "metadata": { "original_artifact": [ { @@ -372,7 +320,7 @@ 
dg1KdHOa34shrKDaOVzW # cat commit.txt | git hash-object -t commit --stdin self.revision_with_extra_headers = { - "id": "010d34f384fa99d047cdd5e2f41e56e5c2feee45", + "id": _x("010d34f384fa99d047cdd5e2f41e56e5c2feee45"), "directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"), "parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")], "author": { @@ -390,6 +338,8 @@ dg1KdHOa34shrKDaOVzW 2015, 7, 12, 15, 10, 30, tzinfo=linus_tz ), "message": b"Linux 4.2-rc2\n", + "type": "git", + "synthetic": False, "extra_headers": ( (b"svn-repo-uuid", b"046f1af7-66c2-d61b-5410-ce57b7db7bff"), (b"svn-revision", b"10"), @@ -397,7 +347,7 @@ dg1KdHOa34shrKDaOVzW } self.revision_with_gpgsig = { - "id": "44cc742a8ca17b9c279be4cc195a93a6ef7a320e", + "id": _x("44cc742a8ca17b9c279be4cc195a93a6ef7a320e"), "directory": _x("b134f9b7dc434f593c0bab696345548b37de0558"), "parents": [ _x("689664ae944b4692724f13b709a4e4de28b54e57"), @@ -417,10 +367,12 @@ dg1KdHOa34shrKDaOVzW * 'master' of git://github.com/alexhenrie/git-po: l10n: ca.po: update translation """, + "type": "git", + "synthetic": False, } self.revision_no_message = { - "id": "4cfc623c9238fa92c832beed000ce2d003fd8333", + "id": _x("4cfc623c9238fa92c832beed000ce2d003fd8333"), "directory": _x("b134f9b7dc434f593c0bab696345548b37de0558"), "parents": [ _x("689664ae944b4692724f13b709a4e4de28b54e57"), @@ -435,10 +387,12 @@ dg1KdHOa34shrKDaOVzW "committer": {"name": b"Jiang Xin", "email": b"worldhello.net@gmail.com",}, "committer_date": {"timestamp": 1428538899, "offset": 480,}, "message": None, + "type": "git", + "synthetic": False, } self.revision_empty_message = { - "id": "7442cd78bd3b4966921d6a7f7447417b7acb15eb", + "id": _x("7442cd78bd3b4966921d6a7f7447417b7acb15eb"), "directory": _x("b134f9b7dc434f593c0bab696345548b37de0558"), "parents": [ _x("689664ae944b4692724f13b709a4e4de28b54e57"), @@ -453,10 +407,12 @@ dg1KdHOa34shrKDaOVzW "committer": {"name": b"Jiang Xin", "email": b"worldhello.net@gmail.com",}, "committer_date": {"timestamp": 
1428538899, "offset": 480,}, "message": b"", + "type": "git", + "synthetic": False, } self.revision_only_fullname = { - "id": "010d34f384fa99d047cdd5e2f41e56e5c2feee45", + "id": _x("010d34f384fa99d047cdd5e2f41e56e5c2feee45"), "directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"), "parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")], "author": {"fullname": b"Linus Torvalds <torvalds@linux-foundation.org>",}, @@ -468,6 +424,8 @@ dg1KdHOa34shrKDaOVzW 2015, 7, 12, 15, 10, 30, tzinfo=linus_tz ), "message": b"Linux 4.2-rc2\n", + "type": "git", + "synthetic": False, "extra_headers": ( (b"svn-repo-uuid", b"046f1af7-66c2-d61b-5410-ce57b7db7bff"), (b"svn-revision", b"10"), @@ -476,62 +434,58 @@ dg1KdHOa34shrKDaOVzW def test_revision_identifier(self): self.assertEqual( - identifiers.revision_identifier(self.revision), - identifiers.identifier_to_str(self.revision["id"]), + Revision.from_dict(self.revision).id, self.revision["id"], ) self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision)), - identifiers.identifier_to_str(self.revision["id"]), + Revision.from_dict(remove_id(self.revision)).id, self.revision["id"], ) def test_revision_identifier_none_metadata(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_none_metadata)), - identifiers.identifier_to_str(self.revision_none_metadata["id"]), + Revision.from_dict(remove_id(self.revision_none_metadata)).id, + self.revision_none_metadata["id"], ) def test_revision_identifier_synthetic(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.synthetic_revision)), - identifiers.identifier_to_str(self.synthetic_revision["id"]), + Revision.from_dict(remove_id(self.synthetic_revision)).id, + self.synthetic_revision["id"], ) def test_revision_identifier_with_extra_headers(self): self.assertEqual( - identifiers.revision_identifier( - remove_id(self.revision_with_extra_headers) - ), - identifiers.identifier_to_str(self.revision_with_extra_headers["id"]), + 
Revision.from_dict(remove_id(self.revision_with_extra_headers)).id, + self.revision_with_extra_headers["id"], ) def test_revision_identifier_with_gpgsig(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_with_gpgsig)), - identifiers.identifier_to_str(self.revision_with_gpgsig["id"]), + Revision.from_dict(remove_id(self.revision_with_gpgsig)).id, + self.revision_with_gpgsig["id"], ) def test_revision_identifier_no_message(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_no_message)), - identifiers.identifier_to_str(self.revision_no_message["id"]), + Revision.from_dict(remove_id(self.revision_no_message)).id, + self.revision_no_message["id"], ) def test_revision_identifier_empty_message(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_empty_message)), - identifiers.identifier_to_str(self.revision_empty_message["id"]), + Revision.from_dict(remove_id(self.revision_empty_message)).id, + self.revision_empty_message["id"], ) def test_revision_identifier_only_fullname(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_only_fullname)), - identifiers.identifier_to_str(self.revision_only_fullname["id"]), + Revision.from_dict(remove_id(self.revision_only_fullname)).id, + self.revision_only_fullname["id"], ) release_example = { - "id": "2b10839e32c4c476e9d94492756bb1a3e1ec4aa8", - "target": b't\x1b"R\xa5\xe1Ml`\xa9\x13\xc7z`\x99\xab\xe7:\x85J', + "id": _x("2b10839e32c4c476e9d94492756bb1a3e1ec4aa8"), + "target": _x("741b2252a5e14d6c60a913c77a6099abe73a854a"), "target_type": "revision", "name": b"v2.6.14", "author": { @@ -561,8 +515,8 @@ class ReleaseIdentifier(unittest.TestCase): self.release = release_example self.release_no_author = { - "id": b"&y\x1a\x8b\xcf\x0em3\xf4:\xefv\x82\xbd\xb5U#mV\xde", - "target": "9ee1c939d1cb936b1f98e8d81aeffab57bae46ab", + "id": _x("26791a8bcf0e6d33f43aef7682bdb555236d56de"), + "target": 
_x("9ee1c939d1cb936b1f98e8d81aeffab57bae46ab"), "target_type": "revision", "name": b"v2.6.12", "message": b"""\ @@ -579,40 +533,38 @@ o6X/3T+vm8K3bf3driRr34c= } self.release_no_message = { - "id": "b6f4f446715f7d9543ef54e41b62982f0db40045", - "target": "9ee1c939d1cb936b1f98e8d81aeffab57bae46ab", + "id": _x("b6f4f446715f7d9543ef54e41b62982f0db40045"), + "target": _x("9ee1c939d1cb936b1f98e8d81aeffab57bae46ab"), "target_type": "revision", "name": b"v2.6.12", "author": {"name": b"Linus Torvalds", "email": b"torvalds@g5.osdl.org",}, "date": datetime.datetime(2005, 10, 27, 17, 2, 33, tzinfo=linus_tz), "message": None, + "synthetic": False, } self.release_empty_message = { - "id": "71a0aea72444d396575dc25ac37fec87ee3c6492", - "target": "9ee1c939d1cb936b1f98e8d81aeffab57bae46ab", + "id": _x("71a0aea72444d396575dc25ac37fec87ee3c6492"), + "target": _x("9ee1c939d1cb936b1f98e8d81aeffab57bae46ab"), "target_type": "revision", "name": b"v2.6.12", "author": {"name": b"Linus Torvalds", "email": b"torvalds@g5.osdl.org",}, "date": datetime.datetime(2005, 10, 27, 17, 2, 33, tzinfo=linus_tz), "message": b"", + "synthetic": False, } self.release_negative_utc = { - "id": "97c8d2573a001f88e72d75f596cf86b12b82fd01", + "id": _x("97c8d2573a001f88e72d75f596cf86b12b82fd01"), "name": b"20081029", - "target": "54e9abca4c77421e2921f5f156c9fe4a9f7441c7", + "target": _x("54e9abca4c77421e2921f5f156c9fe4a9f7441c7"), "target_type": "revision", "date": { "timestamp": {"seconds": 1225281976}, "offset": 0, "negative_utc": True, }, - "author": { - "name": b"Otavio Salvador", - "email": b"otavio@debian.org", - "id": 17640, - }, + "author": {"name": b"Otavio Salvador", "email": b"otavio@debian.org",}, "synthetic": False, "message": b"tagging version 20081029\n\nr56558\n", } @@ -628,62 +580,62 @@ o6X/3T+vm8K3bf3driRr34c= "offset": 600, "timestamp": {"microseconds": 0, "seconds": 1377480558,}, }, - "id": b"\\\x98\xf5Y\xd04\x16-\xe2->\xbe\xb9T3\xe6\xf8\x88R1", + "id": 
_x("5c98f559d034162de22d3ebeb95433e6f8885231"), "message": b"Release of v0.3.2.", "name": b"0.3.2", "synthetic": False, - "target": (b"\xc0j\xa3\xd9;x\xa2\x86\\I5\x17" b"\x000\xf8\xc2\xd79o\xd3"), + "target": _x("c06aa3d93b78a2865c4935170030f8c2d7396fd3"), "target_type": "revision", } self.release_snapshot_target = dict(self.release) self.release_snapshot_target["target_type"] = "snapshot" - self.release_snapshot_target["id"] = "c29c3ddcc6769a04e54dd69d63a6fdcbc566f850" + self.release_snapshot_target["id"] = _x( + "c29c3ddcc6769a04e54dd69d63a6fdcbc566f850" + ) def test_release_identifier(self): self.assertEqual( - identifiers.release_identifier(self.release), - identifiers.identifier_to_str(self.release["id"]), + Release.from_dict(self.release).id, self.release["id"], ) self.assertEqual( - identifiers.release_identifier(remove_id(self.release)), - identifiers.identifier_to_str(self.release["id"]), + Release.from_dict(remove_id(self.release)).id, self.release["id"], ) def test_release_identifier_no_author(self): self.assertEqual( - identifiers.release_identifier(remove_id(self.release_no_author)), - identifiers.identifier_to_str(self.release_no_author["id"]), + Release.from_dict(remove_id(self.release_no_author)).id, + self.release_no_author["id"], ) def test_release_identifier_no_message(self): self.assertEqual( - identifiers.release_identifier(remove_id(self.release_no_message)), - identifiers.identifier_to_str(self.release_no_message["id"]), + Release.from_dict(remove_id(self.release_no_message)).id, + self.release_no_message["id"], ) def test_release_identifier_empty_message(self): self.assertEqual( - identifiers.release_identifier(remove_id(self.release_empty_message)), - identifiers.identifier_to_str(self.release_empty_message["id"]), + Release.from_dict(remove_id(self.release_empty_message)).id, + self.release_empty_message["id"], ) def test_release_identifier_negative_utc(self): self.assertEqual( - 
identifiers.release_identifier(remove_id(self.release_negative_utc)), - identifiers.identifier_to_str(self.release_negative_utc["id"]), + Release.from_dict(remove_id(self.release_negative_utc)).id, + self.release_negative_utc["id"], ) def test_release_identifier_newline_in_author(self): self.assertEqual( - identifiers.release_identifier(remove_id(self.release_newline_in_author)), - identifiers.identifier_to_str(self.release_newline_in_author["id"]), + Release.from_dict(remove_id(self.release_newline_in_author)).id, + self.release_newline_in_author["id"], ) def test_release_identifier_snapshot_target(self): self.assertEqual( - identifiers.release_identifier(self.release_snapshot_target), - identifiers.identifier_to_str(self.release_snapshot_target["id"]), + Release.from_dict(self.release_snapshot_target).id, + self.release_snapshot_target["id"], ) @@ -721,17 +673,17 @@ class SnapshotIdentifier(unittest.TestCase): super().setUp() self.empty = { - "id": "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e", + "id": _x("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), "branches": {}, } self.dangling_branch = { - "id": "c84502e821eb21ed84e9fd3ec40973abc8b32353", + "id": _x("c84502e821eb21ed84e9fd3ec40973abc8b32353"), "branches": {b"HEAD": None,}, } self.unresolved = { - "id": "84b4548ea486e4b0a7933fa541ff1503a0afe1e0", + "id": _x("84b4548ea486e4b0a7933fa541ff1503a0afe1e0"), "branches": {b"foo": {"target": b"bar", "target_type": "alias",},}, } @@ -739,32 +691,22 @@ class SnapshotIdentifier(unittest.TestCase): def test_empty_snapshot(self): self.assertEqual( - identifiers.snapshot_identifier(remove_id(self.empty)), - identifiers.identifier_to_str(self.empty["id"]), + Snapshot.from_dict(remove_id(self.empty)).id, self.empty["id"], ) def test_dangling_branch(self): self.assertEqual( - identifiers.snapshot_identifier(remove_id(self.dangling_branch)), - identifiers.identifier_to_str(self.dangling_branch["id"]), + Snapshot.from_dict(remove_id(self.dangling_branch)).id, + 
self.dangling_branch["id"], ) def test_unresolved(self): with self.assertRaisesRegex(ValueError, "b'foo' -> b'bar'"): - identifiers.snapshot_identifier(remove_id(self.unresolved)) - - def test_unresolved_force(self): - self.assertEqual( - identifiers.snapshot_identifier( - remove_id(self.unresolved), ignore_unresolved=True, - ), - identifiers.identifier_to_str(self.unresolved["id"]), - ) + Snapshot.from_dict(remove_id(self.unresolved)) def test_all_types(self): self.assertEqual( - identifiers.snapshot_identifier(remove_id(self.all_types)), - identifiers.identifier_to_str(self.all_types["id"]), + Snapshot.from_dict(remove_id(self.all_types)).id, self.all_types["id"], ) @@ -797,11 +739,11 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): **self.minimal, "origin": "https://forge.softwareheritage.org/source/swh-model/", "visit": 42, - "snapshot": CoreSWHID.from_string("swh:1:snp:" + "00" * 20), - "release": CoreSWHID.from_string("swh:1:rel:" + "01" * 20), - "revision": CoreSWHID.from_string("swh:1:rev:" + "02" * 20), + "snapshot": "swh:1:snp:" + "00" * 20, + "release": "swh:1:rel:" + "01" * 20, + "revision": "swh:1:rev:" + "02" * 20, "path": b"/abc/def", - "directory": CoreSWHID.from_string("swh:1:dir:" + "03" * 20), + "directory": "swh:1:dir:" + "03" * 20, } def test_minimal(self): @@ -817,15 +759,18 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.minimal), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.minimal) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(self.minimal).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + RawExtrinsicMetadata.from_dict(self.minimal).id, + 
_x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"), ) def test_maximal(self): @@ -848,15 +793,18 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.maximal), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.maximal) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.maximal), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(self.maximal).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.maximal), - "f96966e1093d15236a31fde07e47d5b1c9428049", + RawExtrinsicMetadata.from_dict(self.maximal).id, + _x("f96966e1093d15236a31fde07e47d5b1c9428049"), ) def test_nonascii_path(self): @@ -878,15 +826,18 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(metadata).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "7cc83fd1912176510c083f5df43f01b09af4b333", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("7cc83fd1912176510c083f5df43f01b09af4b333"), ) def test_timezone_insensitive(self): @@ -901,16 +852,20 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): } self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.minimal), - identifiers.raw_extrinsic_metadata_git_object(metadata), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.minimal) + ), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), 
) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - identifiers.raw_extrinsic_metadata_identifier(metadata), + RawExtrinsicMetadata.from_dict(self.minimal).id, + RawExtrinsicMetadata.from_dict(metadata).id, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"), ) def test_microsecond_insensitive(self): @@ -924,16 +879,20 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): } self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.minimal), - identifiers.raw_extrinsic_metadata_git_object(metadata), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.minimal) + ), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - identifiers.raw_extrinsic_metadata_identifier(metadata), + RawExtrinsicMetadata.from_dict(self.minimal).id, + RawExtrinsicMetadata.from_dict(metadata).id, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"), ) def test_noninteger_timezone(self): @@ -948,16 +907,20 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): } self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.minimal), - identifiers.raw_extrinsic_metadata_git_object(metadata), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.minimal) + ), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - identifiers.raw_extrinsic_metadata_identifier(metadata), + 
RawExtrinsicMetadata.from_dict(self.minimal).id, + RawExtrinsicMetadata.from_dict(metadata).id, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"), ) def test_negative_timestamp(self): @@ -980,15 +943,18 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(metadata).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "895d0821a2991dd376ddc303424aceb7c68280f9", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("895d0821a2991dd376ddc303424aceb7c68280f9"), ) def test_epoch(self): @@ -1011,15 +977,18 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(metadata).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "27a53df54ace35ebd910493cdc70b334d6b7cb88", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("27a53df54ace35ebd910493cdc70b334d6b7cb88"), ) def test_negative_epoch(self): @@ -1042,15 +1011,18 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + 
git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(metadata).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "be7154a8fd49d87f81547ea634d1e2152907d089", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("be7154a8fd49d87f81547ea634d1e2152907d089"), ) @@ -1062,8 +1034,8 @@ origin_example = { class OriginIdentifier(unittest.TestCase): def test_content_identifier(self): self.assertEqual( - identifiers.origin_identifier(origin_example), - "b63a575fe3faab7692c9f38fb09d4bb45651bb0f", + Origin.from_dict(origin_example).id, + _x("b63a575fe3faab7692c9f38fb09d4bb45651bb0f"), ) @@ -1153,7 +1125,7 @@ TS_DICTS = [ @pytest.mark.parametrize("dict_input,expected", TS_DICTS) def test_normalize_timestamp_dict(dict_input, expected): - assert normalize_timestamp(dict_input) == expected + assert TimestampWithTimezone.from_dict(dict_input).to_dict() == expected TS_DICTS_INVALID_TIMESTAMP = [ @@ -1169,7 +1141,7 @@ TS_DICTS_INVALID_TIMESTAMP = [ @pytest.mark.parametrize("dict_input", TS_DICTS_INVALID_TIMESTAMP) def test_normalize_timestamp_dict_invalid_timestamp(dict_input): with pytest.raises(ValueError, match="non-integer timestamp"): - normalize_timestamp(dict_input) + TimestampWithTimezone.from_dict(dict_input) UTC = datetime.timezone.utc @@ -1194,647 +1166,30 @@ TS_DT_EXPECTED = [1582814359, 4765132799, -11348929020] @pytest.mark.parametrize("microsecond", [0, 1, 10, 100, 1000, 999999]) def test_normalize_timestamp_datetime(date, seconds, tz, offset, microsecond): date = date.astimezone(tz).replace(microsecond=microsecond) - assert normalize_timestamp(date) == { + assert TimestampWithTimezone.from_dict(date).to_dict() == { "timestamp": {"seconds": seconds, "microseconds": microsecond}, "offset": offset, 
"negative_utc": False, } -# SWHIDs that are outright invalid, no matter the context -INVALID_SWHIDS = [ - "swh:1:cnt", - "swh:1:", - "swh:", - "swh:1:cnt:", - "foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505", - "swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505", - "swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505", - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;malformed", - "swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d", - "swh:1:snp:foo", - # wrong qualifier: ori should be origin - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa - # wrong qualifier: anc should be anchor - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anc=1;visit=1;path=/", # noqa - # wrong qualifier: vis should be visit - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=1;vis=1;path=/", # noqa - # wrong qualifier: pa should be path - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=1;visit=1;pa=/", # noqa - # wrong qualifier: line should be lines - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;line=10;origin=something;anchor=1;visit=1;path=/", # noqa - # wrong qualifier value: it contains space before of after - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin= https://some-url", # noqa - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor ", # noqa - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor ;visit=1", # noqa - # invalid swhid: whitespaces - "swh :1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa - "swh: 1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa - "swh: 1: dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa - "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d", - "swh:1: dir: 
0b6959356d30f1a4e9b7f6bca59b9a336464c03d; origin=blah", - "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", - # other whitespaces - "swh\t:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", - "swh:1\n:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", - "swh:1:\rdir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d\f;lines=12", - "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12\v", -] - -SWHID_CLASSES = [CoreSWHID, QualifiedSWHID, ExtendedSWHID] - - -@pytest.mark.parametrize( - "invalid_swhid,swhid_class", itertools.product(INVALID_SWHIDS, SWHID_CLASSES) -) -def test_swhid_parsing_error(invalid_swhid, swhid_class): - """Tests SWHID strings that are invalid for all SWHID classes do raise - a ValidationError""" - with pytest.raises(ValidationError): - swhid_class.from_string(invalid_swhid) - - -# string SWHIDs, and how they should be parsed by each of the classes, -# or None if the class does not support it -HASH = "94a9ed024d3859793618152ea559a168bbcbb5e2" -VALID_SWHIDS = [ - ( - f"swh:1:cnt:{HASH}", - CoreSWHID(object_type=ObjectType.CONTENT, object_id=_x(HASH),), - QualifiedSWHID(object_type=ObjectType.CONTENT, object_id=_x(HASH),), - ExtendedSWHID(object_type=ExtendedObjectType.CONTENT, object_id=_x(HASH),), - ), - ( - f"swh:1:dir:{HASH}", - CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH),), - QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH),), - ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=_x(HASH),), - ), - ( - f"swh:1:rev:{HASH}", - CoreSWHID(object_type=ObjectType.REVISION, object_id=_x(HASH),), - QualifiedSWHID(object_type=ObjectType.REVISION, object_id=_x(HASH),), - ExtendedSWHID(object_type=ExtendedObjectType.REVISION, object_id=_x(HASH),), - ), - ( - f"swh:1:rel:{HASH}", - CoreSWHID(object_type=ObjectType.RELEASE, object_id=_x(HASH),), - QualifiedSWHID(object_type=ObjectType.RELEASE, object_id=_x(HASH),), - 
ExtendedSWHID(object_type=ExtendedObjectType.RELEASE, object_id=_x(HASH),), - ), - ( - f"swh:1:snp:{HASH}", - CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH),), - QualifiedSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH),), - ExtendedSWHID(object_type=ExtendedObjectType.SNAPSHOT, object_id=_x(HASH),), - ), - ( - f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython;lines=1-18", - None, # CoreSWHID does not allow qualifiers - QualifiedSWHID( - object_type=ObjectType.CONTENT, - object_id=_x(HASH), - origin="https://github.com/python/cpython", - lines=(1, 18), - ), - None, # Neither does ExtendedSWHID - ), - ( - f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython;lines=1-18/", - None, # likewise - None, - None, # likewise - ), - ( - f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython;lines=18", - None, # likewise - QualifiedSWHID( - object_type=ObjectType.CONTENT, - object_id=_x(HASH), - origin="https://github.com/python/cpython", - lines=(18, None), - ), - None, # likewise - ), - ( - f"swh:1:dir:{HASH};origin=deb://Debian/packages/linuxdoc-tools", - None, # likewise - QualifiedSWHID( - object_type=ObjectType.DIRECTORY, - object_id=_x(HASH), - origin="deb://Debian/packages/linuxdoc-tools", - ), - None, # likewise - ), - ( - f"swh:1:ori:{HASH}", - None, # CoreSWHID does not allow origin pseudo-SWHIDs - None, # Neither does QualifiedSWHID - ExtendedSWHID(object_type=ExtendedObjectType.ORIGIN, object_id=_x(HASH),), - ), - ( - f"swh:1:emd:{HASH}", - None, # likewise for metadata pseudo-SWHIDs - None, # Neither does QualifiedSWHID - ExtendedSWHID( - object_type=ExtendedObjectType.RAW_EXTRINSIC_METADATA, object_id=_x(HASH), - ), - ), - ( - f"swh:1:emd:{HASH};origin=https://github.com/python/cpython", - None, # CoreSWHID does not allow metadata pseudo-SWHIDs or qualifiers - None, # QualifiedSWHID does not allow metadata pseudo-SWHIDs - None, # ExtendedSWHID does not allow qualifiers - ), -] - - -@pytest.mark.parametrize( - 
"string,core,qualified,extended", - [ - pytest.param(string, core, qualified, extended, id=string) - for (string, core, qualified, extended) in VALID_SWHIDS - ], -) -def test_parse_unparse_swhids(string, core, qualified, extended): - """Tests parsing and serializing valid SWHIDs with the various SWHID classes.""" - classes = [CoreSWHID, QualifiedSWHID, ExtendedSWHID] - for (cls, parsed_swhid) in zip(classes, [core, qualified, extended]): - if parsed_swhid is None: - # This class should not accept this SWHID - with pytest.raises(ValidationError) as excinfo: - cls.from_string(string) - # Check string serialization for exception - assert str(excinfo.value) is not None - else: - # This class should - assert cls.from_string(string) == parsed_swhid - - # Also check serialization - assert string == str(parsed_swhid) - - -@pytest.mark.parametrize( - "core,extended", - [ - pytest.param(core, extended, id=string) - for (string, core, qualified, extended) in VALID_SWHIDS - if core is not None - ], -) -def test_core_to_extended(core, extended): - assert core.to_extended() == extended - - -@pytest.mark.parametrize( - "ns,version,type,id,qualifiers", - [ - ("foo", 1, ObjectType.CONTENT, "abc8bc9d7a6bcf6db04f476d29314f157507d505", {}), - ("swh", 2, ObjectType.CONTENT, "def8bc9d7a6bcf6db04f476d29314f157507d505", {}), - ("swh", 1, ObjectType.DIRECTORY, "aaaa", {}), - ], -) -def test_QualifiedSWHID_validation_error(ns, version, type, id, qualifiers): - with pytest.raises(ValidationError): - QualifiedSWHID( - namespace=ns, - scheme_version=version, - object_type=type, - object_id=_x(id), - **qualifiers, - ) - - -@pytest.mark.parametrize( - "object_type,qualifiers,expected", - [ - # No qualifier: - (ObjectType.CONTENT, {}, f"swh:1:cnt:{HASH}"), - # origin: - (ObjectType.CONTENT, {"origin": None}, f"swh:1:cnt:{HASH}"), - (ObjectType.CONTENT, {"origin": 42}, ValueError), - # visit: - ( - ObjectType.CONTENT, - {"visit": f"swh:1:snp:{HASH}"}, - f"swh:1:cnt:{HASH};visit=swh:1:snp:{HASH}", 
- ), - ( - ObjectType.CONTENT, - {"visit": CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH))}, - f"swh:1:cnt:{HASH};visit=swh:1:snp:{HASH}", - ), - (ObjectType.CONTENT, {"visit": 42}, TypeError), - (ObjectType.CONTENT, {"visit": f"swh:1:rel:{HASH}"}, ValidationError,), - ( - ObjectType.CONTENT, - {"visit": CoreSWHID(object_type=ObjectType.RELEASE, object_id=_x(HASH))}, - ValidationError, - ), - # anchor: - ( - ObjectType.CONTENT, - {"anchor": f"swh:1:snp:{HASH}"}, - f"swh:1:cnt:{HASH};anchor=swh:1:snp:{HASH}", - ), - ( - ObjectType.CONTENT, - {"anchor": CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH))}, - f"swh:1:cnt:{HASH};anchor=swh:1:snp:{HASH}", - ), - ( - ObjectType.CONTENT, - {"anchor": f"swh:1:dir:{HASH}"}, - f"swh:1:cnt:{HASH};anchor=swh:1:dir:{HASH}", - ), - ( - ObjectType.CONTENT, - {"anchor": CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH))}, - f"swh:1:cnt:{HASH};anchor=swh:1:dir:{HASH}", - ), - (ObjectType.CONTENT, {"anchor": 42}, TypeError), - (ObjectType.CONTENT, {"anchor": f"swh:1:cnt:{HASH}"}, ValidationError,), - ( - ObjectType.CONTENT, - {"anchor": CoreSWHID(object_type=ObjectType.CONTENT, object_id=_x(HASH))}, - ValidationError, - ), - # path: - (ObjectType.CONTENT, {"path": b"/foo"}, f"swh:1:cnt:{HASH};path=/foo",), - ( - ObjectType.CONTENT, - {"path": b"/foo;bar"}, - f"swh:1:cnt:{HASH};path=/foo%3Bbar", - ), - (ObjectType.CONTENT, {"path": "/foo"}, f"swh:1:cnt:{HASH};path=/foo",), - ( - ObjectType.CONTENT, - {"path": "/foo;bar"}, - f"swh:1:cnt:{HASH};path=/foo%3Bbar", - ), - (ObjectType.CONTENT, {"path": 42}, Exception), - # lines: - (ObjectType.CONTENT, {"lines": (42, None)}, f"swh:1:cnt:{HASH};lines=42",), - (ObjectType.CONTENT, {"lines": (21, 42)}, f"swh:1:cnt:{HASH};lines=21-42",), - (ObjectType.CONTENT, {"lines": 42}, TypeError,), - (ObjectType.CONTENT, {"lines": (None, 42)}, ValueError,), - (ObjectType.CONTENT, {"lines": ("42", None)}, ValueError,), - ], -) -def 
test_QualifiedSWHID_init(object_type, qualifiers, expected): - """Tests validation and converters of qualifiers""" - if isinstance(expected, type): - assert issubclass(expected, Exception) - with pytest.raises(expected): - QualifiedSWHID(object_type=object_type, object_id=_x(HASH), **qualifiers) - else: - assert isinstance(expected, str) - swhid = QualifiedSWHID( - object_type=object_type, object_id=_x(HASH), **qualifiers - ) - - # Check the build object has the right serialization - assert expected == str(swhid) - - # Check the internal state of the object is the same as if parsed from a string - assert QualifiedSWHID.from_string(expected) == swhid - - -def test_QualifiedSWHID_hash(): - object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") - - assert hash( - QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id) - ) == hash(QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id)) - - assert hash( - QualifiedSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, - ) - ) == hash( - QualifiedSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, - ) - ) - - # Different order of the dictionary, so the underlying order of the tuple in - # ImmutableDict is different. 
- assert hash( - QualifiedSWHID( - object_type=ObjectType.DIRECTORY, - object_id=object_id, - origin="https://example.com", - lines=(42, None), - ) - ) == hash( - QualifiedSWHID( - object_type=ObjectType.DIRECTORY, - object_id=object_id, - lines=(42, None), - origin="https://example.com", - ) - ) - - -def test_QualifiedSWHID_eq(): - object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") - - assert QualifiedSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id - ) == QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id) - - assert QualifiedSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, - ) == QualifiedSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, - ) - - assert QualifiedSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, - ) == QualifiedSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, - ) - - -QUALIFIED_SWHIDS = [ - # origin: - ( - f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython", - QualifiedSWHID( - object_type=ObjectType.CONTENT, - object_id=_x(HASH), - origin="https://github.com/python/cpython", - ), - ), - ( - f"swh:1:cnt:{HASH};origin=https://example.org/foo%3Bbar%25baz", - QualifiedSWHID( - object_type=ObjectType.CONTENT, - object_id=_x(HASH), - origin="https://example.org/foo%3Bbar%25baz", - ), - ), - ( - f"swh:1:cnt:{HASH};origin=https://example.org?project=test", - QualifiedSWHID( - object_type=ObjectType.CONTENT, - object_id=_x(HASH), - origin="https://example.org?project=test", - ), - ), - # visit: - ( - f"swh:1:cnt:{HASH};visit=swh:1:snp:{HASH}", - QualifiedSWHID( - object_type=ObjectType.CONTENT, - object_id=_x(HASH), - visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH)), - ), - ), - (f"swh:1:cnt:{HASH};visit=swh:1:rel:{HASH}", None,), - # anchor: - ( - f"swh:1:cnt:{HASH};anchor=swh:1:dir:{HASH}", - QualifiedSWHID( - object_type=ObjectType.CONTENT, - 
object_id=_x(HASH), - anchor=CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH)), - ), - ), - ( - f"swh:1:cnt:{HASH};anchor=swh:1:rev:{HASH}", - QualifiedSWHID( - object_type=ObjectType.CONTENT, - object_id=_x(HASH), - anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=_x(HASH)), - ), - ), - ( - f"swh:1:cnt:{HASH};anchor=swh:1:cnt:{HASH}", - None, # 'cnt' is not valid in anchor - ), - ( - f"swh:1:cnt:{HASH};anchor=swh:1:ori:{HASH}", - None, # 'ori' is not valid in a CoreSWHID - ), - # path: - ( - f"swh:1:cnt:{HASH};path=/foo", - QualifiedSWHID( - object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo" - ), - ), - ( - f"swh:1:cnt:{HASH};path=/foo%3Bbar", - QualifiedSWHID( - object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo;bar" - ), - ), - ( - f"swh:1:cnt:{HASH};path=/foo%25bar", - QualifiedSWHID( - object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo%bar" - ), - ), - ( - f"swh:1:cnt:{HASH};path=/foo/bar%3Dbaz", - QualifiedSWHID( - object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo/bar=baz" - ), - ), - # lines - ( - f"swh:1:cnt:{HASH};lines=1-18", - QualifiedSWHID( - object_type=ObjectType.CONTENT, object_id=_x(HASH), lines=(1, 18), - ), - ), - ( - f"swh:1:cnt:{HASH};lines=18", - QualifiedSWHID( - object_type=ObjectType.CONTENT, object_id=_x(HASH), lines=(18, None), - ), - ), - (f"swh:1:cnt:{HASH};lines=", None,), - (f"swh:1:cnt:{HASH};lines=aa", None,), - (f"swh:1:cnt:{HASH};lines=18-aa", None,), -] - - -@pytest.mark.parametrize("string,parsed", QUALIFIED_SWHIDS) -def test_QualifiedSWHID_parse_serialize_qualifiers(string, parsed): - """Tests parsing and serializing valid SWHIDs with the various SWHID classes.""" - if parsed is None: - with pytest.raises(ValidationError): - print(repr(QualifiedSWHID.from_string(string))) - else: - assert QualifiedSWHID.from_string(string) == parsed - assert str(parsed) == string - - -def test_QualifiedSWHID_serialize_origin(): - """Checks that semicolon in origins 
are escaped.""" - string = f"swh:1:cnt:{HASH};origin=https://example.org/foo%3Bbar%25baz" - swhid = QualifiedSWHID( - object_type=ObjectType.CONTENT, - object_id=_x(HASH), - origin="https://example.org/foo;bar%25baz", - ) - assert str(swhid) == string - - -def test_QualifiedSWHID_attributes(): - """Checks the set of QualifiedSWHID attributes match the SWHID_QUALIFIERS - constant.""" - - assert set(attr.fields_dict(QualifiedSWHID)) == { - "namespace", - "scheme_version", - "object_type", - "object_id", - *SWHID_QUALIFIERS, - } - - -@pytest.mark.parametrize( - "ns,version,type,id", - [ - ("foo", 1, ObjectType.CONTENT, "abc8bc9d7a6bcf6db04f476d29314f157507d505"), - ("swh", 2, ObjectType.CONTENT, "def8bc9d7a6bcf6db04f476d29314f157507d505"), - ("swh", 1, ObjectType.DIRECTORY, "aaaa"), - ], -) -def test_CoreSWHID_validation_error(ns, version, type, id): - with pytest.raises(ValidationError): - CoreSWHID( - namespace=ns, scheme_version=version, object_type=type, object_id=_x(id), - ) - - -def test_CoreSWHID_hash(): - object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") - - assert hash( - CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id) - ) == hash(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id)) - - assert hash( - CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,) - ) == hash(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,)) - - # Different order of the dictionary, so the underlying order of the tuple in - # ImmutableDict is different. 
- assert hash( - CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,) - ) == hash(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,)) - - -def test_CoreSWHID_eq(): - object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") - - assert CoreSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id - ) == CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id) - - assert CoreSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id, - ) == CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,) - - assert CoreSWHID( - object_type=ObjectType.DIRECTORY, object_id=object_id, - ) == CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,) - - -@pytest.mark.parametrize( - "ns,version,type,id", - [ - ( - "foo", - 1, - ExtendedObjectType.CONTENT, - "abc8bc9d7a6bcf6db04f476d29314f157507d505", - ), - ( - "swh", - 2, - ExtendedObjectType.CONTENT, - "def8bc9d7a6bcf6db04f476d29314f157507d505", - ), - ("swh", 1, ExtendedObjectType.DIRECTORY, "aaaa"), - ], -) -def test_ExtendedSWHID_validation_error(ns, version, type, id): - with pytest.raises(ValidationError): - ExtendedSWHID( - namespace=ns, scheme_version=version, object_type=type, object_id=_x(id), - ) - - -def test_ExtendedSWHID_hash(): - object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") - - assert hash( - ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id) - ) == hash( - ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id) - ) - - assert hash( - ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) - ) == hash( - ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) - ) - - # Different order of the dictionary, so the underlying order of the tuple in - # ImmutableDict is different. 
- assert hash( - ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) - ) == hash( - ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) - ) - - -def test_ExtendedSWHID_eq(): - object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") - - assert ExtendedSWHID( - object_type=ExtendedObjectType.DIRECTORY, object_id=object_id - ) == ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id) - - assert ExtendedSWHID( - object_type=ExtendedObjectType.DIRECTORY, object_id=object_id, - ) == ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) - - assert ExtendedSWHID( - object_type=ExtendedObjectType.DIRECTORY, object_id=object_id, - ) == ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) - - def test_extid_identifier_bwcompat(): extid_dict = { "extid_type": "test-type", "extid": b"extid", - "target": ExtendedSWHID( - object_type=ExtendedObjectType.DIRECTORY, object_id=b"\x00" * 20 - ), + "target": "swh:1:dir:" + "00" * 20, } - assert ( - identifiers.extid_identifier(extid_dict) - == "b9295e1931c31e40a7e3e1e967decd1c89426455" + assert ExtID.from_dict(extid_dict).id == _x( + "b9295e1931c31e40a7e3e1e967decd1c89426455" ) - assert identifiers.extid_identifier( - {**extid_dict, "extid_version": 0} - ) == identifiers.extid_identifier(extid_dict) - - assert identifiers.extid_identifier( - {**extid_dict, "extid_version": 1} - ) != identifiers.extid_identifier(extid_dict) - + assert ( + ExtID.from_dict({**extid_dict, "extid_version": 0}).id + == ExtID.from_dict(extid_dict).id + ) -def test_object_types(): - """Checks ExtendedObjectType is a superset of ObjectType""" - for member in ObjectType: - assert getattr(ExtendedObjectType, member.name).value == member.value + assert ( + ExtID.from_dict({**extid_dict, "extid_version": 1}).id + != ExtID.from_dict(extid_dict).id + ) diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index 
caad5e2a2642947d73c60635af2d928f09a7733c..781cfa46608aa72b563f6c0b8f40dbb6e6879025 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -12,20 +12,8 @@ from hypothesis import given from hypothesis.strategies import binary import pytest -from swh.model.hashutil import MultiHash, hash_to_bytes, hash_to_hex +from swh.model.hashutil import MultiHash, hash_to_bytes import swh.model.hypothesis_strategies as strategies -from swh.model.identifiers import ( - CoreSWHID, - ExtendedSWHID, - ObjectType, - content_identifier, - directory_identifier, - origin_identifier, - raw_extrinsic_metadata_identifier, - release_identifier, - revision_identifier, - snapshot_identifier, -) from swh.model.model import ( BaseModel, Content, @@ -46,14 +34,13 @@ from swh.model.model import ( Timestamp, TimestampWithTimezone, ) +from swh.model.swhids import CoreSWHID, ExtendedSWHID, ObjectType from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.model.tests.test_identifiers import ( TS_DATETIMES, TS_TIMEZONES, - content_example, directory_example, metadata_example, - origin_example, release_example, revision_example, snapshot_example, @@ -736,94 +723,6 @@ def test_revision_extra_headers_as_lists_from_dict(): assert rev_model.extra_headers == extra_headers -# ID computation - - -def test_content_model_id_computation(): - cnt_dict = content_example.copy() - - cnt_id_str = hash_to_hex(content_identifier(cnt_dict)["sha1_git"]) - cnt_model = Content.from_data(cnt_dict["data"]) - assert str(cnt_model.swhid()) == "swh:1:cnt:" + cnt_id_str - - -def test_directory_model_id_computation(): - dir_dict = directory_example.copy() - del dir_dict["id"] - - dir_id_str = directory_identifier(dir_dict) - dir_id = hash_to_bytes(dir_id_str) - dir_model = Directory.from_dict(dir_dict) - assert dir_model.id == dir_id - assert str(dir_model.swhid()) == "swh:1:dir:" + dir_id_str - - -def test_revision_model_id_computation(): - rev_dict = revision_example.copy() - del 
rev_dict["id"] - - rev_id_str = revision_identifier(rev_dict) - rev_id = hash_to_bytes(rev_id_str) - rev_model = Revision.from_dict(rev_dict) - assert rev_model.id == rev_id - assert str(rev_model.swhid()) == "swh:1:rev:" + rev_id_str - - -def test_revision_model_id_computation_with_no_date(): - """We can have revision with date to None - - """ - rev_dict = revision_example.copy() - rev_dict["date"] = None - rev_dict["committer_date"] = None - del rev_dict["id"] - - rev_id = hash_to_bytes(revision_identifier(rev_dict)) - rev_model = Revision.from_dict(rev_dict) - assert rev_model.date is None - assert rev_model.committer_date is None - assert rev_model.id == rev_id - - -def test_release_model_id_computation(): - rel_dict = release_example.copy() - del rel_dict["id"] - - rel_id_str = release_identifier(rel_dict) - rel_id = hash_to_bytes(rel_id_str) - rel_model = Release.from_dict(rel_dict) - assert isinstance(rel_model.date, TimestampWithTimezone) - assert rel_model.id == hash_to_bytes(rel_id) - assert str(rel_model.swhid()) == "swh:1:rel:" + rel_id_str - - -def test_snapshot_model_id_computation(): - snp_dict = snapshot_example.copy() - del snp_dict["id"] - - snp_id_str = snapshot_identifier(snp_dict) - snp_id = hash_to_bytes(snp_id_str) - snp_model = Snapshot.from_dict(snp_dict) - assert snp_model.id == snp_id - assert str(snp_model.swhid()) == "swh:1:snp:" + snp_id_str - - -def test_origin_model_id_computation(): - ori_dict = origin_example.copy() - - ori_id_str = origin_identifier(ori_dict) - ori_model = Origin.from_dict(ori_dict) - assert str(ori_model.swhid()) == "swh:1:ori:" + ori_id_str - - -def test_raw_extrinsic_metadata_model_id_computation(): - emd_dict = metadata_example.copy() - - emd_id_str = raw_extrinsic_metadata_identifier(emd_dict) - emd_model = RawExtrinsicMetadata.from_dict(emd_dict) - assert str(emd_model.swhid()) == "swh:1:emd:" + emd_id_str - - @given(strategies.objects(split_content=True)) def test_object_type(objtype_and_obj): obj_type, obj 
= objtype_and_obj diff --git a/swh/model/tests/test_swhids.py b/swh/model/tests/test_swhids.py new file mode 100644 index 0000000000000000000000000000000000000000..34a55f640dfbcdf2ed22c9837c582635a39bc7d1 --- /dev/null +++ b/swh/model/tests/test_swhids.py @@ -0,0 +1,638 @@ +# Copyright (C) 2015-2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import itertools + +import attr +import pytest + +from swh.model.exceptions import ValidationError +from swh.model.hashutil import hash_to_bytes as _x +from swh.model.swhids import ( + SWHID_QUALIFIERS, + CoreSWHID, + ExtendedObjectType, + ExtendedSWHID, + ObjectType, + QualifiedSWHID, +) + +dummy_qualifiers = {"origin": "https://example.com", "lines": "42"} + + +# SWHIDs that are outright invalid, no matter the context +INVALID_SWHIDS = [ + "swh:1:cnt", + "swh:1:", + "swh:", + "swh:1:cnt:", + "foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505", + "swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505", + "swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505", + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;malformed", + "swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d", + "swh:1:snp:foo", + # wrong qualifier: ori should be origin + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa + # wrong qualifier: anc should be anchor + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anc=1;visit=1;path=/", # noqa + # wrong qualifier: vis should be visit + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=1;vis=1;path=/", # noqa + # wrong qualifier: pa should be path + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=1;visit=1;pa=/", # noqa + # wrong qualifier: line should be lines + 
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;line=10;origin=something;anchor=1;visit=1;path=/", # noqa + # wrong qualifier value: it contains space before of after + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin= https://some-url", # noqa + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor ", # noqa + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor ;visit=1", # noqa + # invalid swhid: whitespaces + "swh :1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa + "swh: 1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa + "swh: 1: dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa + "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d", + "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d; origin=blah", + "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", + # other whitespaces + "swh\t:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", + "swh:1\n:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", + "swh:1:\rdir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d\f;lines=12", + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12\v", +] + +SWHID_CLASSES = [CoreSWHID, QualifiedSWHID, ExtendedSWHID] + + +@pytest.mark.parametrize( + "invalid_swhid,swhid_class", itertools.product(INVALID_SWHIDS, SWHID_CLASSES) +) +def test_swhid_parsing_error(invalid_swhid, swhid_class): + """Tests SWHID strings that are invalid for all SWHID classes do raise + a ValidationError""" + with pytest.raises(ValidationError): + swhid_class.from_string(invalid_swhid) + + +# string SWHIDs, and how they should be parsed by each of the classes, +# or None if the class does not support it +HASH = "94a9ed024d3859793618152ea559a168bbcbb5e2" +VALID_SWHIDS = [ + ( + f"swh:1:cnt:{HASH}", + 
CoreSWHID(object_type=ObjectType.CONTENT, object_id=_x(HASH),), + QualifiedSWHID(object_type=ObjectType.CONTENT, object_id=_x(HASH),), + ExtendedSWHID(object_type=ExtendedObjectType.CONTENT, object_id=_x(HASH),), + ), + ( + f"swh:1:dir:{HASH}", + CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH),), + QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH),), + ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=_x(HASH),), + ), + ( + f"swh:1:rev:{HASH}", + CoreSWHID(object_type=ObjectType.REVISION, object_id=_x(HASH),), + QualifiedSWHID(object_type=ObjectType.REVISION, object_id=_x(HASH),), + ExtendedSWHID(object_type=ExtendedObjectType.REVISION, object_id=_x(HASH),), + ), + ( + f"swh:1:rel:{HASH}", + CoreSWHID(object_type=ObjectType.RELEASE, object_id=_x(HASH),), + QualifiedSWHID(object_type=ObjectType.RELEASE, object_id=_x(HASH),), + ExtendedSWHID(object_type=ExtendedObjectType.RELEASE, object_id=_x(HASH),), + ), + ( + f"swh:1:snp:{HASH}", + CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH),), + QualifiedSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH),), + ExtendedSWHID(object_type=ExtendedObjectType.SNAPSHOT, object_id=_x(HASH),), + ), + ( + f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython;lines=1-18", + None, # CoreSWHID does not allow qualifiers + QualifiedSWHID( + object_type=ObjectType.CONTENT, + object_id=_x(HASH), + origin="https://github.com/python/cpython", + lines=(1, 18), + ), + None, # Neither does ExtendedSWHID + ), + ( + f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython;lines=1-18/", + None, # likewise + None, + None, # likewise + ), + ( + f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython;lines=18", + None, # likewise + QualifiedSWHID( + object_type=ObjectType.CONTENT, + object_id=_x(HASH), + origin="https://github.com/python/cpython", + lines=(18, None), + ), + None, # likewise + ), + ( + f"swh:1:dir:{HASH};origin=deb://Debian/packages/linuxdoc-tools", + 
None, # likewise + QualifiedSWHID( + object_type=ObjectType.DIRECTORY, + object_id=_x(HASH), + origin="deb://Debian/packages/linuxdoc-tools", + ), + None, # likewise + ), + ( + f"swh:1:ori:{HASH}", + None, # CoreSWHID does not allow origin pseudo-SWHIDs + None, # Neither does QualifiedSWHID + ExtendedSWHID(object_type=ExtendedObjectType.ORIGIN, object_id=_x(HASH),), + ), + ( + f"swh:1:emd:{HASH}", + None, # likewise for metadata pseudo-SWHIDs + None, # Neither does QualifiedSWHID + ExtendedSWHID( + object_type=ExtendedObjectType.RAW_EXTRINSIC_METADATA, object_id=_x(HASH), + ), + ), + ( + f"swh:1:emd:{HASH};origin=https://github.com/python/cpython", + None, # CoreSWHID does not allow metadata pseudo-SWHIDs or qualifiers + None, # QualifiedSWHID does not allow metadata pseudo-SWHIDs + None, # ExtendedSWHID does not allow qualifiers + ), +] + + +@pytest.mark.parametrize( + "string,core,qualified,extended", + [ + pytest.param(string, core, qualified, extended, id=string) + for (string, core, qualified, extended) in VALID_SWHIDS + ], +) +def test_parse_unparse_swhids(string, core, qualified, extended): + """Tests parsing and serializing valid SWHIDs with the various SWHID classes.""" + classes = [CoreSWHID, QualifiedSWHID, ExtendedSWHID] + for (cls, parsed_swhid) in zip(classes, [core, qualified, extended]): + if parsed_swhid is None: + # This class should not accept this SWHID + with pytest.raises(ValidationError) as excinfo: + cls.from_string(string) + # Check string serialization for exception + assert str(excinfo.value) is not None + else: + # This class should + assert cls.from_string(string) == parsed_swhid + + # Also check serialization + assert string == str(parsed_swhid) + + +@pytest.mark.parametrize( + "core,extended", + [ + pytest.param(core, extended, id=string) + for (string, core, qualified, extended) in VALID_SWHIDS + if core is not None + ], +) +def test_core_to_extended(core, extended): + assert core.to_extended() == extended + + 
+@pytest.mark.parametrize( + "ns,version,type,id,qualifiers", + [ + ("foo", 1, ObjectType.CONTENT, "abc8bc9d7a6bcf6db04f476d29314f157507d505", {}), + ("swh", 2, ObjectType.CONTENT, "def8bc9d7a6bcf6db04f476d29314f157507d505", {}), + ("swh", 1, ObjectType.DIRECTORY, "aaaa", {}), + ], +) +def test_QualifiedSWHID_validation_error(ns, version, type, id, qualifiers): + with pytest.raises(ValidationError): + QualifiedSWHID( + namespace=ns, + scheme_version=version, + object_type=type, + object_id=_x(id), + **qualifiers, + ) + + +@pytest.mark.parametrize( + "object_type,qualifiers,expected", + [ + # No qualifier: + (ObjectType.CONTENT, {}, f"swh:1:cnt:{HASH}"), + # origin: + (ObjectType.CONTENT, {"origin": None}, f"swh:1:cnt:{HASH}"), + (ObjectType.CONTENT, {"origin": 42}, ValueError), + # visit: + ( + ObjectType.CONTENT, + {"visit": f"swh:1:snp:{HASH}"}, + f"swh:1:cnt:{HASH};visit=swh:1:snp:{HASH}", + ), + ( + ObjectType.CONTENT, + {"visit": CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH))}, + f"swh:1:cnt:{HASH};visit=swh:1:snp:{HASH}", + ), + (ObjectType.CONTENT, {"visit": 42}, TypeError), + (ObjectType.CONTENT, {"visit": f"swh:1:rel:{HASH}"}, ValidationError,), + ( + ObjectType.CONTENT, + {"visit": CoreSWHID(object_type=ObjectType.RELEASE, object_id=_x(HASH))}, + ValidationError, + ), + # anchor: + ( + ObjectType.CONTENT, + {"anchor": f"swh:1:snp:{HASH}"}, + f"swh:1:cnt:{HASH};anchor=swh:1:snp:{HASH}", + ), + ( + ObjectType.CONTENT, + {"anchor": CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH))}, + f"swh:1:cnt:{HASH};anchor=swh:1:snp:{HASH}", + ), + ( + ObjectType.CONTENT, + {"anchor": f"swh:1:dir:{HASH}"}, + f"swh:1:cnt:{HASH};anchor=swh:1:dir:{HASH}", + ), + ( + ObjectType.CONTENT, + {"anchor": CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH))}, + f"swh:1:cnt:{HASH};anchor=swh:1:dir:{HASH}", + ), + (ObjectType.CONTENT, {"anchor": 42}, TypeError), + (ObjectType.CONTENT, {"anchor": f"swh:1:cnt:{HASH}"}, ValidationError,), + ( + 
ObjectType.CONTENT, + {"anchor": CoreSWHID(object_type=ObjectType.CONTENT, object_id=_x(HASH))}, + ValidationError, + ), + # path: + (ObjectType.CONTENT, {"path": b"/foo"}, f"swh:1:cnt:{HASH};path=/foo",), + ( + ObjectType.CONTENT, + {"path": b"/foo;bar"}, + f"swh:1:cnt:{HASH};path=/foo%3Bbar", + ), + (ObjectType.CONTENT, {"path": "/foo"}, f"swh:1:cnt:{HASH};path=/foo",), + ( + ObjectType.CONTENT, + {"path": "/foo;bar"}, + f"swh:1:cnt:{HASH};path=/foo%3Bbar", + ), + (ObjectType.CONTENT, {"path": 42}, Exception), + # lines: + (ObjectType.CONTENT, {"lines": (42, None)}, f"swh:1:cnt:{HASH};lines=42",), + (ObjectType.CONTENT, {"lines": (21, 42)}, f"swh:1:cnt:{HASH};lines=21-42",), + (ObjectType.CONTENT, {"lines": 42}, TypeError,), + (ObjectType.CONTENT, {"lines": (None, 42)}, ValueError,), + (ObjectType.CONTENT, {"lines": ("42", None)}, ValueError,), + ], +) +def test_QualifiedSWHID_init(object_type, qualifiers, expected): + """Tests validation and converters of qualifiers""" + if isinstance(expected, type): + assert issubclass(expected, Exception) + with pytest.raises(expected): + QualifiedSWHID(object_type=object_type, object_id=_x(HASH), **qualifiers) + else: + assert isinstance(expected, str) + swhid = QualifiedSWHID( + object_type=object_type, object_id=_x(HASH), **qualifiers + ) + + # Check the build object has the right serialization + assert expected == str(swhid) + + # Check the internal state of the object is the same as if parsed from a string + assert QualifiedSWHID.from_string(expected) == swhid + + +def test_QualifiedSWHID_hash(): + object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") + + assert hash( + QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id) + ) == hash(QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id)) + + assert hash( + QualifiedSWHID( + object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, + ) + ) == hash( + QualifiedSWHID( + object_type=ObjectType.DIRECTORY, 
object_id=object_id, **dummy_qualifiers, + ) + ) + + # Different order of the dictionary, so the underlying order of the tuple in + # ImmutableDict is different. + assert hash( + QualifiedSWHID( + object_type=ObjectType.DIRECTORY, + object_id=object_id, + origin="https://example.com", + lines=(42, None), + ) + ) == hash( + QualifiedSWHID( + object_type=ObjectType.DIRECTORY, + object_id=object_id, + lines=(42, None), + origin="https://example.com", + ) + ) + + +def test_QualifiedSWHID_eq(): + object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") + + assert QualifiedSWHID( + object_type=ObjectType.DIRECTORY, object_id=object_id + ) == QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id) + + assert QualifiedSWHID( + object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, + ) == QualifiedSWHID( + object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, + ) + + assert QualifiedSWHID( + object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, + ) == QualifiedSWHID( + object_type=ObjectType.DIRECTORY, object_id=object_id, **dummy_qualifiers, + ) + + +QUALIFIED_SWHIDS = [ + # origin: + ( + f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython", + QualifiedSWHID( + object_type=ObjectType.CONTENT, + object_id=_x(HASH), + origin="https://github.com/python/cpython", + ), + ), + ( + f"swh:1:cnt:{HASH};origin=https://example.org/foo%3Bbar%25baz", + QualifiedSWHID( + object_type=ObjectType.CONTENT, + object_id=_x(HASH), + origin="https://example.org/foo%3Bbar%25baz", + ), + ), + ( + f"swh:1:cnt:{HASH};origin=https://example.org?project=test", + QualifiedSWHID( + object_type=ObjectType.CONTENT, + object_id=_x(HASH), + origin="https://example.org?project=test", + ), + ), + # visit: + ( + f"swh:1:cnt:{HASH};visit=swh:1:snp:{HASH}", + QualifiedSWHID( + object_type=ObjectType.CONTENT, + object_id=_x(HASH), + visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH)), + ), + ), + 
(f"swh:1:cnt:{HASH};visit=swh:1:rel:{HASH}", None,), + # anchor: + ( + f"swh:1:cnt:{HASH};anchor=swh:1:dir:{HASH}", + QualifiedSWHID( + object_type=ObjectType.CONTENT, + object_id=_x(HASH), + anchor=CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH)), + ), + ), + ( + f"swh:1:cnt:{HASH};anchor=swh:1:rev:{HASH}", + QualifiedSWHID( + object_type=ObjectType.CONTENT, + object_id=_x(HASH), + anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=_x(HASH)), + ), + ), + ( + f"swh:1:cnt:{HASH};anchor=swh:1:cnt:{HASH}", + None, # 'cnt' is not valid in anchor + ), + ( + f"swh:1:cnt:{HASH};anchor=swh:1:ori:{HASH}", + None, # 'ori' is not valid in a CoreSWHID + ), + # path: + ( + f"swh:1:cnt:{HASH};path=/foo", + QualifiedSWHID( + object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo" + ), + ), + ( + f"swh:1:cnt:{HASH};path=/foo%3Bbar", + QualifiedSWHID( + object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo;bar" + ), + ), + ( + f"swh:1:cnt:{HASH};path=/foo%25bar", + QualifiedSWHID( + object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo%bar" + ), + ), + ( + f"swh:1:cnt:{HASH};path=/foo/bar%3Dbaz", + QualifiedSWHID( + object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo/bar=baz" + ), + ), + # lines + ( + f"swh:1:cnt:{HASH};lines=1-18", + QualifiedSWHID( + object_type=ObjectType.CONTENT, object_id=_x(HASH), lines=(1, 18), + ), + ), + ( + f"swh:1:cnt:{HASH};lines=18", + QualifiedSWHID( + object_type=ObjectType.CONTENT, object_id=_x(HASH), lines=(18, None), + ), + ), + (f"swh:1:cnt:{HASH};lines=", None,), + (f"swh:1:cnt:{HASH};lines=aa", None,), + (f"swh:1:cnt:{HASH};lines=18-aa", None,), +] + + +@pytest.mark.parametrize("string,parsed", QUALIFIED_SWHIDS) +def test_QualifiedSWHID_parse_serialize_qualifiers(string, parsed): + """Tests parsing and serializing valid SWHIDs with the various SWHID classes.""" + if parsed is None: + with pytest.raises(ValidationError): + print(repr(QualifiedSWHID.from_string(string))) + else: + 
assert QualifiedSWHID.from_string(string) == parsed + assert str(parsed) == string + + +def test_QualifiedSWHID_serialize_origin(): + """Checks that semicolon in origins are escaped.""" + string = f"swh:1:cnt:{HASH};origin=https://example.org/foo%3Bbar%25baz" + swhid = QualifiedSWHID( + object_type=ObjectType.CONTENT, + object_id=_x(HASH), + origin="https://example.org/foo;bar%25baz", + ) + assert str(swhid) == string + + +def test_QualifiedSWHID_attributes(): + """Checks the set of QualifiedSWHID attributes match the SWHID_QUALIFIERS + constant.""" + + assert set(attr.fields_dict(QualifiedSWHID)) == { + "namespace", + "scheme_version", + "object_type", + "object_id", + *SWHID_QUALIFIERS, + } + + +@pytest.mark.parametrize( + "ns,version,type,id", + [ + ("foo", 1, ObjectType.CONTENT, "abc8bc9d7a6bcf6db04f476d29314f157507d505"), + ("swh", 2, ObjectType.CONTENT, "def8bc9d7a6bcf6db04f476d29314f157507d505"), + ("swh", 1, ObjectType.DIRECTORY, "aaaa"), + ], +) +def test_CoreSWHID_validation_error(ns, version, type, id): + with pytest.raises(ValidationError): + CoreSWHID( + namespace=ns, scheme_version=version, object_type=type, object_id=_x(id), + ) + + +def test_CoreSWHID_hash(): + object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") + + assert hash( + CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id) + ) == hash(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id)) + + assert hash( + CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,) + ) == hash(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,)) + + # Different order of the dictionary, so the underlying order of the tuple in + # ImmutableDict is different. 
+ assert hash( + CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,) + ) == hash(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,)) + + +def test_CoreSWHID_eq(): + object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") + + assert CoreSWHID( + object_type=ObjectType.DIRECTORY, object_id=object_id + ) == CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id) + + assert CoreSWHID( + object_type=ObjectType.DIRECTORY, object_id=object_id, + ) == CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,) + + assert CoreSWHID( + object_type=ObjectType.DIRECTORY, object_id=object_id, + ) == CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id,) + + +@pytest.mark.parametrize( + "ns,version,type,id", + [ + ( + "foo", + 1, + ExtendedObjectType.CONTENT, + "abc8bc9d7a6bcf6db04f476d29314f157507d505", + ), + ( + "swh", + 2, + ExtendedObjectType.CONTENT, + "def8bc9d7a6bcf6db04f476d29314f157507d505", + ), + ("swh", 1, ExtendedObjectType.DIRECTORY, "aaaa"), + ], +) +def test_ExtendedSWHID_validation_error(ns, version, type, id): + with pytest.raises(ValidationError): + ExtendedSWHID( + namespace=ns, scheme_version=version, object_type=type, object_id=_x(id), + ) + + +def test_ExtendedSWHID_hash(): + object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") + + assert hash( + ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id) + ) == hash( + ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id) + ) + + assert hash( + ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) + ) == hash( + ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) + ) + + # Different order of the dictionary, so the underlying order of the tuple in + # ImmutableDict is different. 
+ assert hash( + ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) + ) == hash( + ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) + ) + + +def test_ExtendedSWHID_eq(): + object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2") + + assert ExtendedSWHID( + object_type=ExtendedObjectType.DIRECTORY, object_id=object_id + ) == ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id) + + assert ExtendedSWHID( + object_type=ExtendedObjectType.DIRECTORY, object_id=object_id, + ) == ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) + + assert ExtendedSWHID( + object_type=ExtendedObjectType.DIRECTORY, object_id=object_id, + ) == ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id,) + + +def test_object_types(): + """Checks ExtendedObjectType is a superset of ObjectType""" + for member in ObjectType: + assert getattr(ExtendedObjectType, member.name).value == member.value