From e54151a43758ea162dbe85dec4f774de226ca675 Mon Sep 17 00:00:00 2001 From: Pierre-Yves David <pierre-yves.david@ens-lyon.org> Date: Wed, 15 Nov 2023 02:08:47 +0100 Subject: [PATCH] discovery: support optional callback for information Right now, the discovery process offered by `filter_known_objects` returns all results after the discovery is complete. The new callback provides a way to get information "in real time" which is useful for at least a couple of planned use case in the SWH scanner: - displaying progress information while processing - update a graphical UI in real time. This simple callback fits this need without too much troubles. For some reason, mypy complained about the existing type hint in this file for unclear reason. So I fixed them. --- swh/model/discovery.py | 65 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/swh/model/discovery.py b/swh/model/discovery.py index 95c8e14d..eba4cd4d 100644 --- a/swh/model/discovery.py +++ b/swh/model/discovery.py @@ -11,7 +11,17 @@ from collections import namedtuple import itertools import logging import random -from typing import Any, Iterable, List, Mapping, NamedTuple, Set, Union +from typing import ( + Any, + Callable, + Iterable, + List, + Mapping, + NamedTuple, + Optional, + Set, + Union, +) from typing_extensions import Protocol, runtime_checkable @@ -63,15 +73,28 @@ class ArchiveDiscoveryInterface(Protocol): class BaseDiscoveryGraph: """Creates the base structures and methods needed for discovery algorithms. - Subclasses should override ``get_sample`` to affect how the discovery is made.""" + Subclasses should override ``get_sample`` to affect how the discovery is made. + + The `update_info_callback` is an optional argument that will get called for + each new piece of information we get. The callback arguments are `(content, + known)`. + - content: the relevant model.Content object, + - known: a boolean, True if the file is known to the archive False otherwise. + """ - def __init__(self, contents, skipped_contents, directories): + def __init__( + self, + contents, + skipped_contents, + directories, + update_info_callback: Optional[Callable[[Any, bool], None]] = None, + ): self._all_contents: Mapping[ Sha1Git, Union[model.Content, model.SkippedContent] ] = {} self._undecided_directories: Set[Sha1Git] = set() - self._children: Mapping[Sha1Git, model.DirectoryEntry] = {} - self._parents: Mapping[model.DirectoryEntry, Sha1Git] = {} + self._children: Mapping[Sha1Git, Set[Sha1Git]] = {} + self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {} self.undecided: Set[Sha1Git] = set() for content in itertools.chain(contents, skipped_contents): @@ -88,6 +111,12 @@ class BaseDiscoveryGraph: self.undecided |= self._undecided_directories self.known: Set[Sha1Git] = set() self.unknown: Set[Sha1Git] = set() + self._update_info_callback = update_info_callback + self._sha1_to_obj = {} + for content in itertools.chain(contents, skipped_contents): + self._sha1_to_obj[content.sha1_git] = content + for directory in directories: + self._sha1_to_obj[directory.id] = directory def mark_known(self, entries: Iterable[Sha1Git]): """Mark ``entries`` and those they imply as known in the SWH archive""" @@ -115,14 +144,19 @@ class BaseDiscoveryGraph: - ``target_set``: set where marked entries will be added. """ + callback = self._update_info_callback to_process = set(entries) while to_process: current = to_process.pop() target_set.add(current) + new = current in self.undecided self.undecided.discard(current) self._undecided_directories.discard(current) next_entries = transitive_mapping.get(current, set()) & self.undecided to_process.update(next_entries) + if new and callback is not None: + obj = self._sha1_to_obj[current] + callback(obj, current in self.known) def get_sample( self, @@ -195,10 +229,20 @@ class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph): ) -def filter_known_objects(archive: ArchiveDiscoveryInterface): +def filter_known_objects( + archive: ArchiveDiscoveryInterface, + update_info_callback: Optional[Callable[[Any, bool], None]] = None, +): """Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories`` to only return those that are unknown to the SWH archive using a discovery - algorithm.""" + algorithm. + + The `update_info_callback` is an optional argument that will get called for + each new piece of information we get. The callback arguments are `(content, + known)`. + - content: the relevant model.Content object, + - known: a boolean, True if the file is known to the archive False otherwise. + """ contents = archive.contents skipped_contents = archive.skipped_contents directories = archive.directories @@ -207,7 +251,12 @@ def filter_known_objects(archive: ArchiveDiscoveryInterface): skipped_contents_count = len(skipped_contents) directories_count = len(directories) - graph = RandomDirSamplingDiscoveryGraph(contents, skipped_contents, directories) + graph = RandomDirSamplingDiscoveryGraph( + contents, + skipped_contents, + directories, + update_info_callback=update_info_callback, + ) while graph.undecided: sample = graph.get_sample() -- GitLab