From e54151a43758ea162dbe85dec4f774de226ca675 Mon Sep 17 00:00:00 2001
From: Pierre-Yves David <pierre-yves.david@ens-lyon.org>
Date: Wed, 15 Nov 2023 02:08:47 +0100
Subject: [PATCH] discovery: support optional callback for information

Right now, the discovery process offered by `filter_known_objects`
returns all results after the discovery is complete. The new callback
provides a way to get information "in real time", which is useful for at
least a couple of planned use cases in the SWH scanner:
- displaying progress information while processing,
- updating a graphical UI in real time.

This simple callback fits this need without too much trouble.

For some unclear reason, mypy complained about the existing type hints
in this file, so I fixed them.
---
 swh/model/discovery.py | 65 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 57 insertions(+), 8 deletions(-)

diff --git a/swh/model/discovery.py b/swh/model/discovery.py
index 95c8e14d..eba4cd4d 100644
--- a/swh/model/discovery.py
+++ b/swh/model/discovery.py
@@ -11,7 +11,17 @@ from collections import namedtuple
 import itertools
 import logging
 import random
-from typing import Any, Iterable, List, Mapping, NamedTuple, Set, Union
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    List,
+    Mapping,
+    NamedTuple,
+    Optional,
+    Set,
+    Union,
+)
 
 from typing_extensions import Protocol, runtime_checkable
 
@@ -63,15 +73,28 @@ class ArchiveDiscoveryInterface(Protocol):
 
 class BaseDiscoveryGraph:
     """Creates the base structures and methods needed for discovery algorithms.
-    Subclasses should override ``get_sample`` to affect how the discovery is made."""
+    Subclasses should override ``get_sample`` to affect how the discovery is made.
+
+    The `update_info_callback` is an optional argument that will get called for
+    each new piece of information we get. The callback arguments are `(content,
+    known)`.
+    - content: the relevant model.Content object,
+    - known: a boolean, True if the file is known to the archive False otherwise.
+    """
 
-    def __init__(self, contents, skipped_contents, directories):
+    def __init__(
+        self,
+        contents,
+        skipped_contents,
+        directories,
+        update_info_callback: Optional[Callable[[Any, bool], None]] = None,
+    ):
         self._all_contents: Mapping[
             Sha1Git, Union[model.Content, model.SkippedContent]
         ] = {}
         self._undecided_directories: Set[Sha1Git] = set()
-        self._children: Mapping[Sha1Git, model.DirectoryEntry] = {}
-        self._parents: Mapping[model.DirectoryEntry, Sha1Git] = {}
+        self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
+        self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {}
         self.undecided: Set[Sha1Git] = set()
 
         for content in itertools.chain(contents, skipped_contents):
@@ -88,6 +111,12 @@ class BaseDiscoveryGraph:
         self.undecided |= self._undecided_directories
         self.known: Set[Sha1Git] = set()
         self.unknown: Set[Sha1Git] = set()
+        self._update_info_callback = update_info_callback
+        self._sha1_to_obj = {}
+        for content in itertools.chain(contents, skipped_contents):
+            self._sha1_to_obj[content.sha1_git] = content
+        for directory in directories:
+            self._sha1_to_obj[directory.id] = directory
 
     def mark_known(self, entries: Iterable[Sha1Git]):
         """Mark ``entries`` and those they imply as known in the SWH archive"""
@@ -115,14 +144,19 @@ class BaseDiscoveryGraph:
         - ``target_set``: set where marked entries will be added.
 
         """
+        callback = self._update_info_callback
         to_process = set(entries)
         while to_process:
             current = to_process.pop()
             target_set.add(current)
+            new = current in self.undecided
             self.undecided.discard(current)
             self._undecided_directories.discard(current)
             next_entries = transitive_mapping.get(current, set()) & self.undecided
             to_process.update(next_entries)
+            if new and callback is not None:
+                obj = self._sha1_to_obj[current]
+                callback(obj, current in self.known)
 
     def get_sample(
         self,
@@ -195,10 +229,20 @@ class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
         )
 
 
-def filter_known_objects(archive: ArchiveDiscoveryInterface):
+def filter_known_objects(
+    archive: ArchiveDiscoveryInterface,
+    update_info_callback: Optional[Callable[[Any, bool], None]] = None,
+):
     """Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories``
     to only return those that are unknown to the SWH archive using a discovery
-    algorithm."""
+    algorithm.
+
+    The `update_info_callback` is an optional argument that will get called for
+    each new piece of information we get. The callback arguments are `(content,
+    known)`.
+    - content: the relevant model.Content object,
+    - known: a boolean, True if the file is known to the archive False otherwise.
+    """
     contents = archive.contents
     skipped_contents = archive.skipped_contents
     directories = archive.directories
@@ -207,7 +251,12 @@ def filter_known_objects(archive: ArchiveDiscoveryInterface):
     skipped_contents_count = len(skipped_contents)
     directories_count = len(directories)
 
-    graph = RandomDirSamplingDiscoveryGraph(contents, skipped_contents, directories)
+    graph = RandomDirSamplingDiscoveryGraph(
+        contents,
+        skipped_contents,
+        directories,
+        update_info_callback=update_info_callback,
+    )
 
     while graph.undecided:
         sample = graph.get_sample()
-- 
GitLab