Skip to content
Snippets Groups Projects
Commit e54151a4 authored by Pierre-Yves David's avatar Pierre-Yves David
Browse files

discovery: support optional callback for information

Right now, the discovery process offered by `filter_known_objects`
returns all results after the discovery is complete. The new callback
provides a way to get information "in real time" which is useful for at
least a couple of planned use case in the SWH scanner:
- displaying progress information while processing
- update a graphical UI in real time.

This simple callback fits this need without too much troubles.

For some reason, mypy complained about the existing type hint in this
file for unclear reason. So I fixed them.
parent 1a59d42f
No related branches found
Tags v6.11.0
No related merge requests found
......@@ -11,7 +11,17 @@ from collections import namedtuple
import itertools
import logging
import random
from typing import Any, Iterable, List, Mapping, NamedTuple, Set, Union
from typing import (
Any,
Callable,
Iterable,
List,
Mapping,
NamedTuple,
Optional,
Set,
Union,
)
from typing_extensions import Protocol, runtime_checkable
......@@ -63,15 +73,28 @@ class ArchiveDiscoveryInterface(Protocol):
class BaseDiscoveryGraph:
"""Creates the base structures and methods needed for discovery algorithms.
Subclasses should override ``get_sample`` to affect how the discovery is made."""
Subclasses should override ``get_sample`` to affect how the discovery is made.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive False otherwise.
"""
def __init__(self, contents, skipped_contents, directories):
def __init__(
self,
contents,
skipped_contents,
directories,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
self._all_contents: Mapping[
Sha1Git, Union[model.Content, model.SkippedContent]
] = {}
self._undecided_directories: Set[Sha1Git] = set()
self._children: Mapping[Sha1Git, model.DirectoryEntry] = {}
self._parents: Mapping[model.DirectoryEntry, Sha1Git] = {}
self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {}
self.undecided: Set[Sha1Git] = set()
for content in itertools.chain(contents, skipped_contents):
......@@ -88,6 +111,12 @@ class BaseDiscoveryGraph:
self.undecided |= self._undecided_directories
self.known: Set[Sha1Git] = set()
self.unknown: Set[Sha1Git] = set()
self._update_info_callback = update_info_callback
self._sha1_to_obj = {}
for content in itertools.chain(contents, skipped_contents):
self._sha1_to_obj[content.sha1_git] = content
for directory in directories:
self._sha1_to_obj[directory.id] = directory
def mark_known(self, entries: Iterable[Sha1Git]):
"""Mark ``entries`` and those they imply as known in the SWH archive"""
......@@ -115,14 +144,19 @@ class BaseDiscoveryGraph:
- ``target_set``: set where marked entries will be added.
"""
callback = self._update_info_callback
to_process = set(entries)
while to_process:
current = to_process.pop()
target_set.add(current)
new = current in self.undecided
self.undecided.discard(current)
self._undecided_directories.discard(current)
next_entries = transitive_mapping.get(current, set()) & self.undecided
to_process.update(next_entries)
if new and callback is not None:
obj = self._sha1_to_obj[current]
callback(obj, current in self.known)
def get_sample(
self,
......@@ -195,10 +229,20 @@ class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
)
def filter_known_objects(archive: ArchiveDiscoveryInterface):
def filter_known_objects(
archive: ArchiveDiscoveryInterface,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
"""Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories``
to only return those that are unknown to the SWH archive using a discovery
algorithm."""
algorithm.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive False otherwise.
"""
contents = archive.contents
skipped_contents = archive.skipped_contents
directories = archive.directories
......@@ -207,7 +251,12 @@ def filter_known_objects(archive: ArchiveDiscoveryInterface):
skipped_contents_count = len(skipped_contents)
directories_count = len(directories)
graph = RandomDirSamplingDiscoveryGraph(contents, skipped_contents, directories)
graph = RandomDirSamplingDiscoveryGraph(
contents,
skipped_contents,
directories,
update_info_callback=update_info_callback,
)
while graph.undecided:
sample = graph.get_sample()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment