Draft: Add discovery algorithms and RandomDirSamplingDiscoveryGraph
2 unresolved threads
2 unresolved threads
Compare changes
Conflict: This file was added both in the source and target branches, but with different contents.
Ask someone with write access to resolve it.
swh/storage/algos/discovery.py
0 → 100644
+ 90
− 0
- Comment on lines +61 to +90
This module should only contain the storage-specific implementation of the discovery algorithm, so the `DiscoveryStorageConnection` class belongs here, along with a specialized version of the `filter_known_objects`
# function.  (last word of the review comment that precedes this module text)
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import random
from typing import Iterable, List

from swh.model import discovery
from swh.model.from_disk import model
from swh.model.model import Sha1Git
from swh.storage.interface import StorageInterface

logger = logging.getLogger(__name__)


class DiscoveryStorageConnection(discovery.ArchiveDiscoveryInterface):
    """Archive discovery backend that answers queries through a storage object.

    Each ``*_missing`` coroutine forwards the identifiers it receives to the
    corresponding method of the wrapped storage. None of them contains an
    ``await``: the storage methods are called directly.
    """

    def __init__(
        self,
        contents: List[model.Content],
        skipped_contents: List[model.SkippedContent],
        directories: List[model.Directory],
        swh_storage: StorageInterface,
    ) -> None:
        """Set up the connection.

        Args:
            contents: passed unchanged to the parent constructor
            skipped_contents: passed unchanged to the parent constructor
            directories: passed unchanged to the parent constructor
            swh_storage: storage queried by the ``*_missing`` methods; kept
                as ``self.storage``
        """
        super().__init__(contents, skipped_contents, directories)
        self.storage = swh_storage

    async def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
        """Ask the storage which of the given content ``sha1_git`` are missing.

        Returns:
            whatever ``storage.content_missing_per_sha1_git`` returns for
            ``contents``
        """
        return self.storage.content_missing_per_sha1_git(contents)

    async def skipped_content_missing(
        self, skipped_contents: List[Sha1Git]
    ) -> Iterable[Sha1Git]:
        """Ask the storage which of the given skipped contents are missing.

        The storage is queried with one dict per identifier, so each
        ``sha1_git`` is wrapped in a dict whose other hash keys are ``None``.

        Returns:
            a generator over the ``sha1_git`` value of every entry reported
            by ``storage.skipped_content_missing``
        """
        queries = [
            {"sha1_git": sha1_git, "sha1": None, "sha256": None, "blake2s256": None}
            for sha1_git in skipped_contents
        ]
        # The storage call happens here, before the generator is consumed.
        missing_entries = self.storage.skipped_content_missing(queries)
        return (entry["sha1_git"] for entry in missing_entries)

    async def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
        """Ask the storage which of the given directory ids are missing.

        Returns:
            whatever ``storage.directory_missing`` returns for ``directories``
        """
        return self.storage.directory_missing(directories)


async def filter_known_objects(
    contents: List[model.Content],
    skipped_contents: List[model.SkippedContent],
    directories: List[model.Directory],
    swh_storage: StorageInterface,
):
    """Run the discovery algorithm against a storage-backed archive.

    The objects and the storage are wrapped in a
    :class:`DiscoveryStorageConnection`, which is then handed to
    ``swh.model.discovery.filter_known_objects``.

    Args:
        contents: contents to check
        skipped_contents: skipped contents to check
        directories: directories to check
        swh_storage: storage used to query the archive

    Returns:
        the result of ``discovery.filter_known_objects``; intended to be the
        ``contents``, ``skipped_contents`` and ``directories`` that are
        unknown to the SWH archive
    """
    connection = DiscoveryStorageConnection(
        contents=contents,
        skipped_contents=skipped_contents,
        directories=directories,
        swh_storage=swh_storage,
    )
    return await discovery.filter_known_objects(connection)
This must be moved into `swh.model.discovery`, as it is logic common to the discovery algorithm.