diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index bc1957ec293d6ecd3a4aa5866699f1d498359143..22449c688100f05e8a3e3d14eddb5f8a2af1fbbf 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -1,5 +1,4 @@ -# Enable black +# python: Reformat code with black bf3f1cec8685c8f480ddd95027852f8caa10b8e3 - -# python: Reformat code with black 22.3.0 4c39334b2aa9f782950aaee72781dc1df9d37550 +5ff7c5b592ce1d76f5696a7f089680807ad557a6 diff --git a/PKG-INFO b/PKG-INFO index 3a5db5f62b9644ca76fa6a426b9450a389a637a4..5dc24b02863108fc113a0fb5de93d0120d29703a 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 6.10.0 +Version: 6.11.0 Summary: Software Heritage data model Author-email: Software Heritage developers <swh-devel@inria.fr> Project-URL: Homepage, https://gitlab.softwareheritage.org/swh/devel/swh-model diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 3a5db5f62b9644ca76fa6a426b9450a389a637a4..5dc24b02863108fc113a0fb5de93d0120d29703a 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 6.10.0 +Version: 6.11.0 Summary: Software Heritage data model Author-email: Software Heritage developers <swh-devel@inria.fr> Project-URL: Homepage, https://gitlab.softwareheritage.org/swh/devel/swh-model diff --git a/swh/model/discovery.py b/swh/model/discovery.py index 7cd9e7acdef28080d74f9a5cbf27ab03a9e322e2..eba4cd4d6924ccf61ab81413ea8022978bb7b642 100644 --- a/swh/model/discovery.py +++ b/swh/model/discovery.py @@ -11,7 +11,17 @@ from collections import namedtuple import itertools import logging import random -from typing import Any, Iterable, List, Mapping, NamedTuple, Set, Union +from typing import ( + Any, + Callable, + Iterable, + List, + Mapping, + NamedTuple, + Optional, + Set, + Union, +) from typing_extensions import Protocol, runtime_checkable @@ -49,29 +59,42 @@ class ArchiveDiscoveryInterface(Protocol): self.skipped_contents = skipped_contents self.directories = directories - async def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]: + def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]: """List content missing from the archive by sha1""" - async def skipped_content_missing( + def skipped_content_missing( self, skipped_contents: List[Sha1Git] ) -> Iterable[Sha1Git]: """List skipped content missing from the archive by sha1""" - async def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]: + def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]: """List directories missing from the archive by sha1""" class BaseDiscoveryGraph: """Creates the base structures and methods needed for discovery algorithms. - Subclasses should override ``get_sample`` to affect how the discovery is made.""" + Subclasses should override ``get_sample`` to affect how the discovery is made. + + The `update_info_callback` is an optional argument that will get called for + each new piece of information we get. The callback arguments are `(content, + known)`. + - content: the relevant model.Content object, + - known: a boolean, True if the file is known to the archive False otherwise. + """ - def __init__(self, contents, skipped_contents, directories): + def __init__( + self, + contents, + skipped_contents, + directories, + update_info_callback: Optional[Callable[[Any, bool], None]] = None, + ): self._all_contents: Mapping[ Sha1Git, Union[model.Content, model.SkippedContent] ] = {} self._undecided_directories: Set[Sha1Git] = set() - self._children: Mapping[Sha1Git, model.DirectoryEntry] = {} - self._parents: Mapping[model.DirectoryEntry, Sha1Git] = {} + self._children: Mapping[Sha1Git, Set[Sha1Git]] = {} + self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {} self.undecided: Set[Sha1Git] = set() for content in itertools.chain(contents, skipped_contents): @@ -88,6 +111,12 @@ class BaseDiscoveryGraph: self.undecided |= self._undecided_directories self.known: Set[Sha1Git] = set() self.unknown: Set[Sha1Git] = set() + self._update_info_callback = update_info_callback + self._sha1_to_obj = {} + for content in itertools.chain(contents, skipped_contents): + self._sha1_to_obj[content.sha1_git] = content + for directory in directories: + self._sha1_to_obj[directory.id] = directory def mark_known(self, entries: Iterable[Sha1Git]): """Mark ``entries`` and those they imply as known in the SWH archive""" @@ -115,16 +144,21 @@ class BaseDiscoveryGraph: - ``target_set``: set where marked entries will be added. """ + callback = self._update_info_callback to_process = set(entries) while to_process: current = to_process.pop() target_set.add(current) + new = current in self.undecided self.undecided.discard(current) self._undecided_directories.discard(current) next_entries = transitive_mapping.get(current, set()) & self.undecided to_process.update(next_entries) + if new and callback is not None: + obj = self._sha1_to_obj[current] + callback(obj, current in self.known) - async def get_sample( + def get_sample( self, ) -> Sample: """Return a three-tuple of samples from the undecided sets of contents, @@ -133,9 +167,7 @@ class BaseDiscoveryGraph: which are known.""" raise NotImplementedError() - async def do_query( - self, archive: ArchiveDiscoveryInterface, sample: Sample - ) -> None: + def do_query(self, archive: ArchiveDiscoveryInterface, sample: Sample) -> None: """Given a three-tuple of samples, ask the archive which are known or unknown and mark them as such.""" @@ -149,7 +181,7 @@ class BaseDiscoveryGraph: if not sample_per_type: continue known = set(sample_per_type) - unknown = set(await method(list(sample_per_type))) + unknown = set(method(list(sample_per_type))) known -= unknown self.mark_known(known) @@ -165,7 +197,7 @@ class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph): are left: we send them directly to the storage since they should be few and their structure flat.""" - async def get_sample(self) -> Sample: + def get_sample(self) -> Sample: if self._undecided_directories: if len(self._undecided_directories) <= SAMPLE_SIZE: return Sample( @@ -197,10 +229,20 @@ class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph): ) -async def filter_known_objects(archive: ArchiveDiscoveryInterface): +def filter_known_objects( + archive: ArchiveDiscoveryInterface, + update_info_callback: Optional[Callable[[Any, bool], None]] = None, +): """Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories`` to only return those that are unknown to the SWH archive using a discovery - algorithm.""" + algorithm. + + The `update_info_callback` is an optional argument that will get called for + each new piece of information we get. The callback arguments are `(content, + known)`. + - content: the relevant model.Content object, + - known: a boolean, True if the file is known to the archive False otherwise. + """ contents = archive.contents skipped_contents = archive.skipped_contents directories = archive.directories @@ -209,11 +251,16 @@ async def filter_known_objects(archive: ArchiveDiscoveryInterface): skipped_contents_count = len(skipped_contents) directories_count = len(directories) - graph = RandomDirSamplingDiscoveryGraph(contents, skipped_contents, directories) + graph = RandomDirSamplingDiscoveryGraph( + contents, + skipped_contents, + directories, + update_info_callback=update_info_callback, + ) while graph.undecided: - sample = await graph.get_sample() - await graph.do_query(archive, sample) + sample = graph.get_sample() + graph.do_query(archive, sample) contents = [c for c in contents if c.sha1_git in graph.unknown] skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown] diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py index 798851a23514c85a44cca0dd6c816153c6ae7485..48d701612424083b2d70db9125d9dca723068152 100644 --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -138,7 +138,7 @@ class Content(MerkleLeaf): """ - __slots__ = [] # type: List[str] + __slots__: List[str] = [] object_type: Final = "content" @classmethod @@ -567,7 +567,6 @@ class Directory(MerkleNode): """Builds a `model.Directory` object based on this node; ignoring its children.""" if self.__model_object is None: - DirectoryEntry = model.DirectoryEntry entries = [] diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index 8d2cb908e35f9bb7fc85e316b40f8750c0f562d4..e686547b1ce24300e59e3f52a70cc4542f6e8a4e 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -72,7 +72,7 @@ Subset of :const:`ALGORITHMS`. HASH_BLOCK_SIZE = 32768 """Block size for streaming hash computations made in this module""" -_blake2_hash_cache = {} # type: Dict[str, Callable] +_blake2_hash_cache: Dict[str, Callable] = {} class MultiHash: diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py index 45671861f8c8a6f1215bc1fc758505d9b7722845..8beca3427c8d79bda322f4711b4ed896c97f0b2d 100644 --- a/swh/model/hypothesis_strategies.py +++ b/swh/model/hypothesis_strategies.py @@ -479,7 +479,7 @@ def snapshots_d(draw, *, min_size=0, max_size=100, only_objects=False): } ) except ValueError as e: - for (source, target) in e.args[1]: + for source, target in e.args[1]: branches[source] = draw(branch_targets_d(only_objects=True)) else: break diff --git a/swh/model/merkle.py b/swh/model/merkle.py index b224840782e38d09a9ccad1786b6e99a4b6cf630..33814be6b2c32cc9602d27356bfefc3760898a00 100644 --- a/swh/model/merkle.py +++ b/swh/model/merkle.py @@ -217,7 +217,7 @@ class MerkleLeaf(MerkleNode): A Merkle leaf is simply a Merkle node with children disabled. """ - __slots__ = [] # type: List[str] + __slots__: List[str] = [] def __setitem__(self, name, child): raise ValueError("%s is a leaf" % self.__class__.__name__) diff --git a/swh/model/model.py b/swh/model/model.py index 0170bcaf350579c995f3f39ea36fe7ad55dd9f56..c1a6481fdb7ec3a58a9fca1d4c47991ce95bc562 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -229,7 +229,7 @@ def _immutable_dict_validator( key_validator = optimized_validator(expected_key_type) value_validator = optimized_validator(expected_value_type) - for (item_key, item_value) in value.items(): + for item_key, item_value in value.items(): key_validator( instance, attribute, @@ -282,7 +282,7 @@ def optimized_validator(type_): ): if origin_value is None: origin_value = value - for (validator, type_) in all_validators: + for validator, type_ in all_validators: try: validator( instance, diff --git a/swh/model/tests/test_discovery.py b/swh/model/tests/test_discovery.py index 095192418ac5c4dbf7f11bc827cdd852aaa0d871..4106d232a1399b1415bf90666a76bf1d21a2c986 100644 --- a/swh/model/tests/test_discovery.py +++ b/swh/model/tests/test_discovery.py @@ -25,21 +25,21 @@ class FakeArchive: skipped_contents: List[model.SkippedContent] directories: List[model.Directory] - async def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]: + def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]: return [] - async def skipped_content_missing( + def skipped_content_missing( self, skipped_contents: List[Sha1Git] ) -> Iterable[Sha1Git]: """List skipped content missing from the archive by sha1""" return [] - async def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]: + def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]: """List directories missing from the archive by sha1""" return [] -async def test_filter_known_objects(monkeypatch): +def test_filter_known_objects(monkeypatch): # Test with smaller sample sizes to actually trigger the random sampling monkeypatch.setattr(discovery, "SAMPLE_SIZE", 1) @@ -60,9 +60,7 @@ async def test_filter_known_objects(monkeypatch): assert archive.contents[0].sha1_git == KNOWN_CONTENT_HASH assert archive.directories[0].id == KNOWN_DIRECTORY_HASH assert archive.directories[1].id == KNOWN_DIRECTORY_HASH_2 - (contents, skipped_contents, directories) = await discovery.filter_known_objects( - archive - ) + (contents, skipped_contents, directories) = discovery.filter_known_objects(archive) assert len(contents) == 0 assert len(skipped_contents) == 0 assert len(directories) == 0 diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py index 3ab7ba242b836c16934175d3bbd58555721b6755..946fd533428a2d428794e1d99749de1c330e744c 100644 --- a/swh/model/tests/test_from_disk.py +++ b/swh/model/tests/test_from_disk.py @@ -136,7 +136,7 @@ class TestDiskBackedContent(unittest.TestCase): class DataMixin: - maxDiff = None # type: ClassVar[Optional[int]] + maxDiff: ClassVar[Optional[int]] = None def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix="swh.model.from_disk") diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py index b279f2fa9d15bf85ab2ffa8a32deefc27545332a..3153ef0428e3345d147700541fb7d100cd6d6cf7 100644 --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -36,7 +36,6 @@ def blake2_hash_cache_reset(): @pytest.fixture def hash_test_data(): class HashTestData: - data = b"1984\n" hex_checksums = { "sha1": "62be35bf00ff0c624f4a621e2ea5595a049e0731", diff --git a/swh/model/tests/test_hypothesis_strategies.py b/swh/model/tests/test_hypothesis_strategies.py index 1531ff506f023d2b8ff854a4f26d3298bb29cce0..e43f3127c8c7bc50c626f6155085ed0cf23d7033 100644 --- a/swh/model/tests/test_hypothesis_strategies.py +++ b/swh/model/tests/test_hypothesis_strategies.py @@ -63,7 +63,7 @@ def assert_nested_dict(obj): """Tests the object is a nested dict and contains no more class from swh.model.model.""" if isinstance(obj, dict): - for (key, value) in obj.items(): + for key, value in obj.items(): assert isinstance(key, (str, bytes)), key assert_nested_dict(value) elif isinstance(obj, tuple): @@ -164,7 +164,6 @@ _max_snp_size = 100 @given(snapshots(min_size=_min_snp_size, max_size=_max_snp_size)) @settings(max_examples=1) def test_snapshots_strategy(snapshot): - branches = snapshot.branches assert len(branches) >= _min_snp_size diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index 248410f99fcbce422207294493b9993c11a2106e..e6182ee326bf2a92381dffe8420d0a282180aadf 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -769,7 +769,7 @@ def test_person_comparison(): def test_content_get_hash(): hashes = dict(sha1=b"foo", sha1_git=b"bar", sha256=b"baz", blake2s256=b"qux") c = Content(length=42, status="visible", **hashes) - for (hash_name, hash_) in hashes.items(): + for hash_name, hash_ in hashes.items(): assert c.get_hash(hash_name) == hash_ @@ -1225,7 +1225,6 @@ def test_revision_check(revision): @given(strategies.revisions(raw_manifest=none())) def test_revision_raw_manifest(revision): - raw_manifest = b"foo" id_ = hashlib.new("sha1", raw_manifest).digest() diff --git a/swh/model/tests/test_swhids.py b/swh/model/tests/test_swhids.py index 5c3cab22ee2cbeab65958ee230562dfab1ed3d58..12a46a010b39465bccabd48390e286d2c64b52db 100644 --- a/swh/model/tests/test_swhids.py +++ b/swh/model/tests/test_swhids.py @@ -230,7 +230,7 @@ VALID_SWHIDS = [ def test_parse_unparse_swhids(string, core, qualified, extended): """Tests parsing and serializing valid SWHIDs with the various SWHID classes.""" classes = [CoreSWHID, QualifiedSWHID, ExtendedSWHID] - for (cls, parsed_swhid) in zip(classes, [core, qualified, extended]): + for cls, parsed_swhid in zip(classes, [core, qualified, extended]): if parsed_swhid is None: # This class should not accept this SWHID with pytest.raises(ValidationError) as excinfo: