diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 168824fec62634e027fb71c378657b25e84ffeaf..7565e3d6b69656b1bbdda876e2e8e5fdb1f6b4f7 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1,2 +1,3 @@ Daniele Serafini Ishan Bhanuka +Antoine Cezar diff --git a/PKG-INFO b/PKG-INFO index 5b1df17ca4ed455f9f10168287578059c9f73992..b9d73f8d1b8d9f22f1bd325773ba8307092ab76a 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.7.1 +Version: 0.7.2 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 5b1df17ca4ed455f9f10168287578059c9f73992..b9d73f8d1b8d9f22f1bd325773ba8307092ab76a 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.7.1 +Version: 0.7.2 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh/model/cli.py b/swh/model/cli.py index d14d6f983bae18a88b6797edf9892c663aac99d4..6e69d1d861fd68eca64730b9a0b3fb83061c1bd7 100644 --- a/swh/model/cli.py +++ b/swh/model/cli.py @@ -5,6 +5,7 @@ import os import sys +from typing import List # WARNING: do not import unnecessary things here to keep cli startup time under # control @@ -57,11 +58,21 @@ def swhid_of_file_content(data): return swhid(CONTENT, object) -def swhid_of_dir(path): - from swh.model.from_disk import Directory +def swhid_of_dir(path: bytes, exclude_patterns: List[bytes] = None) -> str: + from swh.model.from_disk import ( + Directory, + accept_all_directories, + ignore_directories_patterns, + ) from swh.model.identifiers import DIRECTORY, swhid - object = Directory.from_disk(path=path).get_data() + dir_filter = ( + ignore_directories_patterns(path, exclude_patterns) + if exclude_patterns + else accept_all_directories + ) + + object = Directory.from_disk(path=path, dir_filter=dir_filter).get_data() return swhid(DIRECTORY, object) @@ -101,7 +112,7 @@ def swhid_of_git_repo(path): return str(SWHID(object_type="snapshot", object_id=snapshot_identifier(snapshot))) -def identify_object(obj_type, follow_symlinks, obj): +def identify_object(obj_type, follow_symlinks, exclude_patterns, obj): from urllib.parse import urlparse if obj_type == "auto": @@ -130,7 +141,9 @@ def identify_object(obj_type, follow_symlinks, obj): if obj_type == "content": swhid = swhid_of_file(path) elif obj_type == "directory": - swhid = swhid_of_dir(path) + swhid = swhid_of_dir( + path, [pattern.encode() for pattern in exclude_patterns] + ) elif obj_type == "origin": swhid = swhid_of_origin(obj) elif obj_type == "snapshot": @@ -165,6 +178,15 @@ def identify_object(obj_type, follow_symlinks, obj): type=click.Choice(["auto", "content", "directory", "origin", "snapshot"]), help="type of object to identify (default: auto)", ) +@click.option( + "--exclude", + "-x", + "exclude_patterns", + metavar="PATTERN", + multiple=True, + help="Exclude directories using glob patterns \ + (e.g., '*.git' to exclude all .git directories)", +) @click.option( "--verify", "-v", @@ -173,7 +195,9 @@ def identify_object(obj_type, follow_symlinks, obj): help="reference identifier to be compared with computed one", ) @click.argument("objects", nargs=-1, required=True) -def identify(obj_type, verify, show_filename, follow_symlinks, objects): +def identify( + obj_type, verify, show_filename, follow_symlinks, objects, exclude_patterns, +): """Compute the Software Heritage persistent identifier (SWHID) for the given source code object(s). @@ -208,7 +232,9 @@ def identify(obj_type, verify, show_filename, follow_symlinks, objects): if verify and len(objects) != 1: raise click.BadParameter("verification requires a single object") - results = map(partial(identify_object, obj_type, follow_symlinks), objects) + results = map( + partial(identify_object, obj_type, follow_symlinks, exclude_patterns), objects, + ) if verify: swhid = next(results)[1] diff --git a/swh/model/exceptions.py b/swh/model/exceptions.py index 774dfc22aa0c787be71fd3b65b761dfac95fc74d..38b01c32c70c1148d69010ac32f66c0895624d10 100644 --- a/swh/model/exceptions.py +++ b/swh/model/exceptions.py @@ -129,3 +129,7 @@ class ValidationError(Exception): def __repr__(self): return "ValidationError(%s)" % self + + +class InvalidDirectoryPath(Exception): + pass diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py index 719599d0563fef48b00973b8daf6e92168bf4a73..c46bcd6c873f66d6ee36fc30a75612cdc4bbe809 100644 --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -5,15 +5,19 @@ import datetime import enum +import fnmatch +import glob import os +import re import stat -from typing import Any, Iterable, List, Optional, Tuple +from typing import Any, Iterable, Iterator, List, Optional, Pattern, Tuple import attr from attrs_strict import type_validator from typing_extensions import Final from . import model +from .exceptions import InvalidDirectoryPath from .hashutil import MultiHash from .identifiers import directory_entry_sort_key, directory_identifier from .identifiers import identifier_to_bytes as id_to_bytes @@ -276,6 +280,63 @@ def ignore_named_directories(names, *, case_sensitive=True): return named_filter +# TODO: `extract_regex_objs` has been copied and adapted from `swh.scanner`. +# In the future `swh.scanner` should use the `swh.model` version and remove its own. +def extract_regex_objs( + root_path: bytes, patterns: Iterable[bytes] +) -> Iterator[Pattern[bytes]]: + """Generates a regex object for each pattern given in input and checks if + the path is a subdirectory or relative to the root path. + + Args: + root_path (bytes): path to the root directory + patterns (list of byte): patterns to match + + Yields: + an SRE_Pattern object + """ + absolute_root_path = os.path.abspath(root_path) + for pattern in patterns: + for path in glob.glob(pattern): + absolute_path = os.path.abspath(path) + if not absolute_path.startswith(absolute_root_path): + error_msg = ( + b'The path "' + path + b'" is not a subdirectory or relative ' + b'to the root directory path: "' + root_path + b'"' + ) + raise InvalidDirectoryPath(error_msg) + + regex = fnmatch.translate((pattern.decode())) + yield re.compile(regex.encode()) + + +def ignore_directories_patterns(root_path: bytes, patterns: Iterable[bytes]): + """Filter for :func:`directory_to_objects` to ignore directories + matching certain patterns. + + Args: + root_path (bytes): path of the root directory + patterns (list of byte): patterns to ignore + + Returns: + a directory filter for :func:`directory_to_objects` + """ + sre_patterns = set(extract_regex_objs(root_path, patterns)) + + def pattern_filter( + dirpath: bytes, + dirname: bytes, + entries: Iterable[Any], + patterns: Iterable[Any] = sre_patterns, + root_path: bytes = os.path.abspath(root_path), + ): + full_path = os.path.abspath(dirpath) + relative_path = os.path.relpath(full_path, root_path) + return not any([pattern.match(relative_path) for pattern in patterns]) + + return pattern_filter + + def iter_directory( directory, ) -> Tuple[List[model.Content], List[model.SkippedContent], List[model.Directory]]: diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index 6c61317c3e22d1e37c9f6096389dc66615aee1bd..92664cc4f922593e0afe6ea672be904917f99112 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -7,7 +7,7 @@ import binascii import datetime from functools import lru_cache import hashlib -from typing import Any, Dict, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import attr @@ -355,45 +355,101 @@ def format_author(author): return b" ".join(ret) -def format_author_line(header, author, date_offset): - """Format a an author line according to git standards. +def format_manifest( + headers: Iterable[Tuple[bytes, bytes]], message: Optional[bytes] = None, +) -> bytes: + """Format a manifest comprised of a sequence of `headers` and an optional `message`. - An author line has three components: + The manifest format, compatible with the git format for tag and commit + objects, is as follows: - - a header, describing the type of author (author, committer, tagger) - - a name and email, which is an arbitrary bytestring - - optionally, a timestamp with UTC offset specification + - for each `key`, `value` in `headers`, emit: - The author line is formatted thus:: + - the `key`, literally + - an ascii space (``\\x20``) + - the `value`, with newlines escaped using :func:`escape_newlines`, + - an ascii newline (``\\x0a``) - `header` `name and email`[ `timestamp` `utc_offset`] + - if the `message` is not None, emit: + + - an ascii newline (``\\x0a``) + - the `message`, literally + + Args: + headers: a sequence of key/value headers stored in the manifest; + message: an optional message used to trail the manifest. + + Returns: + the formatted manifest as bytes + """ + entries: List[bytes] = [] + + for key, value in headers: + entries.extend((key, b" ", escape_newlines(value), b"\n")) + + if message is not None: + entries.extend((b"\n", message)) + + return b"".join(entries) + + +def hash_manifest( + type: str, headers: Iterable[Tuple[bytes, bytes]], message: Optional[bytes] = None, +): + """Hash the manifest of an object of type `type`, comprised of a sequence + of `headers` and an optional `message`. + + Before hashing, the manifest is serialized with the :func:`format_manifest` + function. + + We then use the git "salted sha1" (:func:`swh.model.hashutil.hash_git_data`) + with the given `type` to hash the manifest. + + Args: + type: the type of object for which we're computing a manifest (e.g. + "tag", "commit", ...) + headers: a sequence of key/value headers stored in the manifest; + message: an optional message used to trail the manifest. + + """ + manifest = format_manifest(headers, message) + return hash_git_data(manifest, type) + + +def format_author_data(author, date_offset) -> bytes: + """Format authorship data according to git standards. + + Git authorship data has two components: + + - an author specification, usually a name and email, but in practice an + arbitrary bytestring + - optionally, a timestamp with a UTC offset specification + + The authorship data is formatted thus:: + + `name and email`[ `timestamp` `utc_offset`] The timestamp is encoded as a (decimal) number of seconds since the UNIX epoch (1970-01-01 at 00:00 UTC). As an extension to the git format, we support fractional timestamps, using a dot as the separator for the decimal part. - The utc offset is a number of minutes encoded as '[+-]HHMM'. Note some + The utc offset is a number of minutes encoded as '[+-]HHMM'. Note that some tools can pass a negative offset corresponding to the UTC timezone ('-0000'), which is valid and is encoded as such. - For convenience, this function returns the whole line with its trailing - newline. - Args: - header: the header of the author line (one of 'author', 'committer', - 'tagger') author: an author specification (dict with two bytes values: name and email, or byte value) date_offset: a normalized date/time representation as returned by :func:`normalize_timestamp`. Returns: - the newline-terminated byte string containing the author line + the byte string containing the authorship data """ - ret = [header.encode(), b" ", escape_newlines(format_author(author))] + ret = [format_author(author)] date_offset = normalize_timestamp(date_offset) @@ -403,7 +459,6 @@ def format_author_line(header, author, date_offset): ret.extend([b" ", date_f, b" ", offset_f]) - ret.append(b"\n") return b"".join(ret) @@ -457,24 +512,19 @@ def revision_identifier(revision): type. """ - components = [ - b"tree ", - identifier_to_str(revision["directory"]).encode(), - b"\n", - ] + headers = [(b"tree", identifier_to_str(revision["directory"]).encode())] for parent in revision["parents"]: if parent: - components.extend( - [b"parent ", identifier_to_str(parent).encode(), b"\n",] - ) + headers.append((b"parent", identifier_to_str(parent).encode())) - components.extend( - [ - format_author_line("author", revision["author"], revision["date"]), - format_author_line( - "committer", revision["committer"], revision["committer_date"] - ), - ] + headers.append( + (b"author", format_author_data(revision["author"], revision["date"])) + ) + headers.append( + ( + b"committer", + format_author_data(revision["committer"], revision["committer_date"]), + ) ) # Handle extra headers @@ -483,14 +533,9 @@ def revision_identifier(revision): if not extra_headers and "extra_headers" in metadata: extra_headers = metadata["extra_headers"] - for key, value in extra_headers: - components.extend([key, b" ", escape_newlines(value), b"\n"]) - - if revision["message"] is not None: - components.extend([b"\n", revision["message"]]) + headers.extend(extra_headers) - commit_raw = b"".join(components) - return identifier_to_str(hash_git_data(commit_raw, "commit")) + return identifier_to_str(hash_manifest("commit", headers, revision["message"])) def target_type_to_git(target_type): @@ -506,27 +551,18 @@ def target_type_to_git(target_type): def release_identifier(release): """Return the intrinsic identifier for a release.""" - components = [ - b"object ", - identifier_to_str(release["target"]).encode(), - b"\n", - b"type ", - target_type_to_git(release["target_type"]), - b"\n", - b"tag ", - release["name"], - b"\n", + headers = [ + (b"object", identifier_to_str(release["target"]).encode()), + (b"type", target_type_to_git(release["target_type"])), + (b"tag", release["name"]), ] if "author" in release and release["author"]: - components.append( - format_author_line("tagger", release["author"], release["date"]) + headers.append( + (b"tagger", format_author_data(release["author"], release["date"])) ) - if release["message"] is not None: - components.extend([b"\n", release["message"]]) - - return identifier_to_str(hash_git_data(b"".join(components), "tag")) + return identifier_to_str(hash_manifest("tag", headers, release["message"])) def snapshot_identifier(snapshot, *, ignore_unresolved=False): diff --git a/swh/model/model.py b/swh/model/model.py index e962dbad783830620fc287c9da9f7cfafa583464..1df94630f19b7ad6b199fb07c373639e270b578f 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -8,6 +8,7 @@ import datetime from enum import Enum from hashlib import sha256 from typing import Any, Dict, Iterable, Optional, Tuple, TypeVar, Union +import warnings import attr from attrs_strict import type_validator @@ -112,16 +113,19 @@ class HashableObject(metaclass=ABCMeta): """Mixin to automatically compute object identifier hash when the associated model is instantiated.""" - @staticmethod @abstractmethod - def compute_hash(object_dict): + def compute_hash(self) -> bytes: """Derived model classes must implement this to compute - the object hash from its dict representation.""" + the object hash. + + This method is called by the object initialization if the `id` + attribute is set to an empty value. + """ pass def __attrs_post_init__(self): if not self.id: - obj_id = hash_to_bytes(self.compute_hash(self.to_dict())) + obj_id = self.compute_hash() object.__setattr__(self, "id", obj_id) def unique_key(self) -> KeyType: @@ -390,9 +394,8 @@ class Snapshot(HashableObject, BaseModel): ) id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"") - @staticmethod - def compute_hash(object_dict): - return snapshot_identifier(object_dict) + def compute_hash(self) -> bytes: + return hash_to_bytes(snapshot_identifier(self.to_dict())) @classmethod def from_dict(cls, d): @@ -427,9 +430,8 @@ class Release(HashableObject, BaseModel): ) id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"") - @staticmethod - def compute_hash(object_dict): - return release_identifier(object_dict) + def compute_hash(self) -> bytes: + return hash_to_bytes(release_identifier(self.to_dict())) @author.validator def check_author(self, attribute, value): @@ -516,9 +518,8 @@ class Revision(HashableObject, BaseModel): attr.validate(self) object.__setattr__(self, "metadata", metadata) - @staticmethod - def compute_hash(object_dict): - return revision_identifier(object_dict) + def compute_hash(self) -> bytes: + return hash_to_bytes(revision_identifier(self.to_dict())) @classmethod def from_dict(cls, d): @@ -570,9 +571,8 @@ class Directory(HashableObject, BaseModel): entries = attr.ib(type=Tuple[DirectoryEntry, ...], validator=type_validator()) id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"") - @staticmethod - def compute_hash(object_dict): - return directory_identifier(object_dict) + def compute_hash(self) -> bytes: + return hash_to_bytes(directory_identifier(self.to_dict())) @classmethod def from_dict(cls, d): @@ -855,12 +855,10 @@ class MetadataTargetType(Enum): @attr.s(frozen=True) -class RawExtrinsicMetadata(BaseModel): - object_type: Final = "raw_extrinsic_metadata" - +class _RawExtrinsicMetadata(BaseModel): # target object type = attr.ib(type=MetadataTargetType, validator=type_validator()) - id = attr.ib(type=Union[str, SWHID], validator=type_validator()) + target = attr.ib(type=Union[str, SWHID], validator=type_validator()) """URL if type=MetadataTargetType.ORIGIN, else core SWHID""" # source @@ -881,12 +879,12 @@ class RawExtrinsicMetadata(BaseModel): path = attr.ib(type=Optional[bytes], default=None, validator=type_validator()) directory = attr.ib(type=Optional[SWHID], default=None, validator=type_validator()) - @id.validator - def check_id(self, attribute, value): + @target.validator + def check_target(self, attribute, value): if self.type == MetadataTargetType.ORIGIN: if isinstance(value, SWHID) or value.startswith("swh:"): raise ValueError( - "Got SWHID as id for origin metadata (expected an URL)." + "Got SWHID as target for origin metadata (expected an URL)." ) else: self._check_swhid(self.type.value, value) @@ -1025,6 +1023,7 @@ class RawExtrinsicMetadata(BaseModel): def to_dict(self): d = super().to_dict() + d["id"] = d["target"] context_keys = ( "origin", "visit", @@ -1048,8 +1047,16 @@ class RawExtrinsicMetadata(BaseModel): "fetcher": MetadataFetcher.from_dict(d["fetcher"]), } + if "id" in d: + warnings.warn( + "RawExtrinsicMetadata `id` attribute is now called `target`", + DeprecationWarning, + ) + # Backwards-compatibility for id -> target migration + d["target"] = d.pop("id") + if d["type"] != MetadataTargetType.ORIGIN: - d["id"] = parse_swhid(d["id"]) + d["target"] = parse_swhid(d["target"]) swhid_keys = ("snapshot", "release", "revision", "directory") for swhid_key in swhid_keys: @@ -1061,10 +1068,32 @@ class RawExtrinsicMetadata(BaseModel): def unique_key(self) -> KeyType: return { "type": self.type.value, - "id": str(self.id), + "target": str(self.target), "authority_type": self.authority.type.value, "authority_url": self.authority.url, "discovery_date": str(self.discovery_date), "fetcher_name": self.fetcher.name, "fetcher_version": self.fetcher.version, } + + +class RawExtrinsicMetadata(_RawExtrinsicMetadata): + object_type: Final = "raw_extrinsic_metadata" + + def __init__(self, **kwargs): + if "id" in kwargs: + warnings.warn( + "RawExtrinsicMetadata `id` attribute is now called `target`", + DeprecationWarning, + ) + kwargs["target"] = kwargs.pop("id") + + super().__init__(**kwargs) + + @property + def id(self): + warnings.warn( + "RawExtrinsicMetadata `id` attribute is now called `target`", + DeprecationWarning, + ) + return self.target diff --git a/swh/model/tests/test_cli.py b/swh/model/tests/test_cli.py index b65ea03707c1ac6df0b17bed29bb3dbd424a0a7e..3d86ede6ba648957aca108cc310d7ffeaf897ac5 100644 --- a/swh/model/tests/test_cli.py +++ b/swh/model/tests/test_cli.py @@ -146,3 +146,19 @@ class TestIdentify(DataMixin, unittest.TestCase): f.write("trailing garbage to make verification fail") result = self.runner.invoke(cli.identify, ["--verify", expected_id, path]) self.assertEqual(result.exit_code, 1) + + def test_exclude(self): + """exclude patterns""" + self.make_from_tarball(self.tmpdir_name) + path = os.path.join(self.tmpdir_name, b"sample-folder") + + excluded_dir = os.path.join(path, b"excluded_dir\x96") + os.mkdir(excluded_dir) + with open(os.path.join(excluded_dir, b"some_file"), "w") as f: + f.write("content") + + result = self.runner.invoke( + cli.identify, ["--type", "directory", "--exclude", "excluded_*", path] + ) + + self.assertSWHID(result, "swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759") diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index 990a170705c1fa22980dc0e33f0af227f6e94080..07afe2152ce15ddbfa1a38d71baf8a48cfaa641f 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -796,20 +796,24 @@ def test_metadata_valid(): # Simplest case RawExtrinsicMetadata( - type=MetadataTargetType.ORIGIN, id=_origin_url, **_common_metadata_fields + type=MetadataTargetType.ORIGIN, target=_origin_url, **_common_metadata_fields ) # Object with an SWHID RawExtrinsicMetadata( - type=MetadataTargetType.CONTENT, id=_content_swhid, **_common_metadata_fields + type=MetadataTargetType.CONTENT, + target=_content_swhid, + **_common_metadata_fields, ) -def test_metadata_to_dict(): +@pytest.mark.filterwarnings("ignore: RawExtrinsicMetadata `id`") +@pytest.mark.parametrize("argument_name", ["id", "target"]) +def test_metadata_to_dict(argument_name): """Checks valid RawExtrinsicMetadata objects don't raise an error.""" common_fields = { - "authority": {"type": "forge", "url": "https://forge.softwareheritage.org",}, + "authority": {"type": "forge", "url": "https://forge.softwareheritage.org"}, "fetcher": {"name": "test-fetcher", "version": "0.0.1",}, "discovery_date": _common_metadata_fields["discovery_date"], "format": "json", @@ -817,54 +821,62 @@ def test_metadata_to_dict(): } m = RawExtrinsicMetadata( - type=MetadataTargetType.ORIGIN, id=_origin_url, **_common_metadata_fields + type=MetadataTargetType.ORIGIN, + **{argument_name: _origin_url, **_common_metadata_fields}, ) assert m.to_dict() == { "type": "origin", + "target": _origin_url, "id": _origin_url, **common_fields, } assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m m = RawExtrinsicMetadata( - type=MetadataTargetType.CONTENT, id=_content_swhid, **_common_metadata_fields + type=MetadataTargetType.CONTENT, + **{argument_name: _content_swhid, **_common_metadata_fields}, ) assert m.to_dict() == { "type": "content", + "target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", "id": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", **common_fields, } assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m -def test_metadata_invalid_id(): - """Checks various invalid values for the 'id' field.""" +def test_metadata_invalid_target(): + """Checks various invalid values for the 'target' field.""" # SWHID for an origin with pytest.raises(ValueError, match="expected an URL"): RawExtrinsicMetadata( - type=MetadataTargetType.ORIGIN, id=_content_swhid, **_common_metadata_fields + type=MetadataTargetType.ORIGIN, + target=_content_swhid, + **_common_metadata_fields, ) # SWHID for an origin (even when passed as string) with pytest.raises(ValueError, match="expected an URL"): RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, - id="swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", + target="swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", **_common_metadata_fields, ) # URL for a non-origin with pytest.raises(ValueError, match="Expected SWHID, got a string"): RawExtrinsicMetadata( - type=MetadataTargetType.CONTENT, id=_origin_url, **_common_metadata_fields + type=MetadataTargetType.CONTENT, + target=_origin_url, + **_common_metadata_fields, ) # SWHID passed as string instead of SWHID with pytest.raises(ValueError, match="Expected SWHID, got a string"): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id="swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", + target="swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", **_common_metadata_fields, ) @@ -874,7 +886,7 @@ def test_metadata_invalid_id(): ): RawExtrinsicMetadata( type=MetadataTargetType.REVISION, - id=_content_swhid, + target=_content_swhid, **_common_metadata_fields, ) @@ -882,7 +894,7 @@ def test_metadata_invalid_id(): with pytest.raises(ValueError, match="Expected core SWHID"): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=SWHID( + target=SWHID( object_type="content", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", metadata={"foo": "bar"}, @@ -895,7 +907,7 @@ def test_metadata_naive_datetime(): with pytest.raises(ValueError, match="must be a timezone-aware datetime"): RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, - id=_origin_url, + target=_origin_url, **{**_common_metadata_fields, "discovery_date": datetime.datetime.now()}, ) @@ -909,7 +921,7 @@ def test_metadata_validate_context_origin(): ): RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, - id=_origin_url, + target=_origin_url, origin=_origin_url, **_common_metadata_fields, ) @@ -917,7 +929,7 @@ def test_metadata_validate_context_origin(): # but all other types can RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, origin=_origin_url, **_common_metadata_fields, ) @@ -926,7 +938,7 @@ def test_metadata_validate_context_origin(): with pytest.raises(ValueError, match="SWHID used as context origin URL"): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, origin="swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", **_common_metadata_fields, ) @@ -941,7 +953,7 @@ def test_metadata_validate_context_visit(): ): RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, - id=_origin_url, + target=_origin_url, visit=42, **_common_metadata_fields, ) @@ -949,7 +961,7 @@ def test_metadata_validate_context_visit(): # but all other types can RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, origin=_origin_url, visit=42, **_common_metadata_fields, @@ -959,7 +971,7 @@ def test_metadata_validate_context_visit(): with pytest.raises(ValueError, match="'origin' context must be set if 'visit' is"): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, visit=42, **_common_metadata_fields, ) @@ -968,7 +980,7 @@ def test_metadata_validate_context_visit(): with pytest.raises(ValueError, match="Nonpositive visit id"): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, origin=_origin_url, visit=-42, **_common_metadata_fields, @@ -984,7 +996,7 @@ def test_metadata_validate_context_snapshot(): ): RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, - id=_origin_url, + target=_origin_url, snapshot=SWHID( object_type="snapshot", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -995,7 +1007,7 @@ def test_metadata_validate_context_snapshot(): # but content can RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, snapshot=SWHID( object_type="snapshot", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2" ), @@ -1006,7 +1018,7 @@ def test_metadata_validate_context_snapshot(): with pytest.raises(ValueError, match="Expected core SWHID"): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, snapshot=SWHID( object_type="snapshot", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1021,7 +1033,7 @@ def test_metadata_validate_context_snapshot(): ): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, snapshot=SWHID( object_type="content", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1039,7 +1051,7 @@ def test_metadata_validate_context_release(): ): RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, - id=_origin_url, + target=_origin_url, release=SWHID( object_type="release", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1050,7 +1062,7 @@ def test_metadata_validate_context_release(): # but content can RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, release=SWHID( object_type="release", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2" ), @@ -1061,7 +1073,7 @@ def test_metadata_validate_context_release(): with pytest.raises(ValueError, match="Expected core SWHID"): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, release=SWHID( object_type="release", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1076,7 +1088,7 @@ def test_metadata_validate_context_release(): ): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, release=SWHID( object_type="content", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1094,7 +1106,7 @@ def test_metadata_validate_context_revision(): ): RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, - id=_origin_url, + target=_origin_url, revision=SWHID( object_type="revision", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1105,7 +1117,7 @@ def test_metadata_validate_context_revision(): # but content can RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, revision=SWHID( object_type="revision", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2" ), @@ -1116,7 +1128,7 @@ def test_metadata_validate_context_revision(): with pytest.raises(ValueError, match="Expected core SWHID"): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, revision=SWHID( object_type="revision", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1131,7 +1143,7 @@ def test_metadata_validate_context_revision(): ): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, revision=SWHID( object_type="content", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1147,7 +1159,7 @@ def test_metadata_validate_context_path(): with pytest.raises(ValueError, match="Unexpected 'path' context for origin object"): RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, - id=_origin_url, + target=_origin_url, path=b"/foo/bar", **_common_metadata_fields, ) @@ -1155,7 +1167,7 @@ def test_metadata_validate_context_path(): # but content can RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, path=b"/foo/bar", **_common_metadata_fields, ) @@ -1170,7 +1182,7 @@ def test_metadata_validate_context_directory(): ): RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, - id=_origin_url, + target=_origin_url, directory=SWHID( object_type="directory", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1181,7 +1193,7 @@ def test_metadata_validate_context_directory(): # but content can RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, directory=SWHID( object_type="directory", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1193,7 +1205,7 @@ def test_metadata_validate_context_directory(): with pytest.raises(ValueError, match="Expected core SWHID"): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, directory=SWHID( object_type="directory", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", @@ -1208,10 +1220,25 @@ def test_metadata_validate_context_directory(): ): RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, - id=_content_swhid, + target=_content_swhid, directory=SWHID( object_type="content", object_id="94a9ed024d3859793618152ea559a168bbcbb5e2", ), **_common_metadata_fields, ) + + +def test_metadata_id_attr(): + """Checks the legacy id attribute on RawExtrinsicMetadata objects""" + # Simplest case + meta = RawExtrinsicMetadata( + type=MetadataTargetType.ORIGIN, target=_origin_url, **_common_metadata_fields + ) + + assert meta is not None + + with pytest.deprecated_call() as messages: + assert meta.id == _origin_url + + assert "RawExtrinsicMetadata `id`" in str(messages[0].message)