Skip to content
Snippets Groups Projects

Add discovery module with a DiscoveryStorageConnection class

Closed Franck Bret requested to merge franckbret/swh-storage:discovery into master
1 file
+ 63
0
Compare changes
  • Side-by-side
  • Inline
+ 63
0
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Iterable, List
from swh.model import discovery
from swh.model.from_disk import model
from swh.model.model import Sha1Git
from swh.storage.interface import StorageInterface
logger = logging.getLogger(__name__)
class DiscoveryStorageConnection(discovery.ArchiveDiscoveryInterface):
    """Archive discovery backend that answers queries through the storage API.

    Each ``*_missing`` coroutine forwards the given identifiers to the
    wrapped storage object and hands back what that storage reports as
    missing.
    """

    def __init__(
        self,
        contents: List[model.Content],
        skipped_contents: List[model.SkippedContent],
        directories: List[model.Directory],
        swh_storage: StorageInterface,
    ) -> None:
        """Initialise the parent interface and remember the storage to query.

        Args:
            contents: forwarded unchanged to the parent constructor.
            skipped_contents: forwarded unchanged to the parent constructor.
            directories: forwarded unchanged to the parent constructor.
            swh_storage: storage object used by the ``*_missing`` methods.
        """
        super().__init__(contents, skipped_contents, directories)
        self.storage = swh_storage

    async def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
        """Return the ``sha1_git`` ids of contents the storage reports missing."""
        # NOTE: the storage call is made directly, it is not awaited.
        missing = self.storage.content_missing_per_sha1_git(contents)
        return missing

    async def skipped_content_missing(
        self, skipped_contents: List[Sha1Git]
    ) -> Iterable[Sha1Git]:
        """Return the ``sha1_git`` ids of skipped contents reported missing."""
        # Only sha1_git is known here; every other hash is passed as None.
        unknown_hashes = {"sha1": None, "sha256": None, "blake2s256": None}
        queries = [
            {"sha1_git": sha1_git, **unknown_hashes} for sha1_git in skipped_contents
        ]
        missing_entries = self.storage.skipped_content_missing(queries)
        # Lazily project each reported entry back to its sha1_git.
        return (entry["sha1_git"] for entry in missing_entries)

    async def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
        """Return the ids of directories the storage reports missing."""
        missing = self.storage.directory_missing(directories)
        return missing
async def filter_known_objects(
    contents: List[model.Content],
    skipped_contents: List[model.SkippedContent],
    directories: List[model.Directory],
    swh_storage: StorageInterface,
):
    """Keep only the objects that the SWH archive does not know yet.

    Wraps ``contents``, ``skipped_contents`` and ``directories`` together
    with ``swh_storage`` in a :class:`DiscoveryStorageConnection`, then
    delegates to the discovery algorithm
    (``discovery.filter_known_objects``) and returns its result unchanged.
    """
    connection = DiscoveryStorageConnection(
        contents=contents,
        skipped_contents=skipped_contents,
        directories=directories,
        swh_storage=swh_storage,
    )
    unknown_objects = await discovery.filter_known_objects(connection)
    return unknown_objects
Loading