Skip to content
Snippets Groups Projects

Draft: Add discovery algorithms and RandomDirSamplingDiscoveryGraph

Open Franck Bret requested to merge generated-differential-D8938-source into master
2 unresolved threads

Use a random sampling discovery graph using only directories

This is a follow-up to comments made in swh-scanner!64 (closed) about splitting some code out into a common module.

Related swh-scanner!64 (closed)

Related swh-scanner#4591


Migrated from D8938 (view on Phabricator)

Merge request reports

Loading
Loading

Activity

Filter activity
  • Approvals
  • Assignees & reviewers
  • Comments (from bots)
  • Comments (from users)
  • Commits & branches
  • Edits
  • Labels
  • Lock status
  • Mentions
  • Merge request status
  • Tracking
43 contents = set()
44 skipped_contents = set()
45
46 for sha1 in self.undecided:
47 obj = self._all_contents[sha1]
48 obj_type = obj.object_type
49 if obj_type == model.Content.object_type:
50 contents.add(sha1)
51 elif obj_type == model.SkippedContent.object_type:
52 skipped_contents.add(sha1)
53 else:
54 raise TypeError(f"Unexpected object type {obj_type}")
55
56 return Sample(
57 contents=contents, skipped_contents=skipped_contents, directories=set()
58 )
  • 75 while graph.undecided:
    76 sample = await graph.get_sample()
    77 await graph.do_query(archive, sample)
    78
    79 contents = [c for c in contents if c.sha1_git in graph.unknown]
    80 skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown]
    81 directories = [c for c in directories if c.id in graph.unknown]
    82
    83 logger.debug(
    84 "Filtered out %d contents, %d skipped contents and %d directories",
    85 contents_count - len(contents),
    86 skipped_contents_count - len(skipped_contents),
    87 directories_count - len(directories),
    88 )
    89
    90 return (contents, skipped_contents, directories)
    • Comment on lines +61 to +90

      This module should only contain the storage-specific implementation of the discovery algorithm: the DiscoveryStorageConnection class should live here, along with a specialized version of the filter_known_objects function.

      # Copyright (C) 2022 The Software Heritage developers
      # See the AUTHORS file at the top-level directory of this distribution
      # License: GNU General Public License version 3, or any later version
      # See top-level LICENSE file for more information
      
      import logging
      import random
      from typing import Iterable, List
      
      from swh.model import discovery
      from swh.model.from_disk import model
      from swh.model.model import Sha1Git
      from swh.storage.interface import StorageInterface
      
      logger = logging.getLogger(__name__)
      
      
      class DiscoveryStorageConnection(discovery.ArchiveDiscoveryInterface):
          """Archive discovery backend that answers queries through a storage object.

          Each ``*_missing`` coroutine forwards the identifiers it is given to the
          corresponding method of the wrapped ``swh_storage`` instance.
          """

          def __init__(
              self,
              contents: List[model.Content],
              skipped_contents: List[model.SkippedContent],
              directories: List[model.Directory],
              swh_storage: StorageInterface,
          ) -> None:
              # The three object lists are handed to the parent class; only the
              # storage handle is kept on this subclass.
              super().__init__(contents, skipped_contents, directories)
              self.storage = swh_storage

          async def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
              """Ask the storage which of the given content ``sha1_git`` are missing.

              Returns whatever ``content_missing_per_sha1_git`` yields for
              ``contents``, untouched.
              """
              storage = self.storage
              return storage.content_missing_per_sha1_git(contents)

          async def skipped_content_missing(
              self, skipped_contents: List[Sha1Git]
          ) -> Iterable[Sha1Git]:
              """Ask the storage which of the given skipped contents are missing.

              The storage is queried with one dict per identifier in which only
              ``sha1_git`` is filled in (the other hash keys are ``None``); the
              ``sha1_git`` of every entry it reports back is then yielded lazily.
              """
              queries = []
              for sha1_git in skipped_contents:
                  queries.append(
                      {
                          "sha1_git": sha1_git,
                          "sha1": None,
                          "sha256": None,
                          "blake2s256": None,
                      }
                  )
              # The storage call happens here, not on first iteration of the result.
              missing = self.storage.skipped_content_missing(queries)
              return (entry["sha1_git"] for entry in missing)

          async def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
              """Ask the storage which of the given directory identifiers are missing.

              Returns whatever ``directory_missing`` yields for ``directories``,
              untouched.
              """
              storage = self.storage
              return storage.directory_missing(directories)
      
      
      async def filter_known_objects(
          contents: List[model.Content],
          skipped_contents: List[model.SkippedContent],
          directories: List[model.Directory],
          swh_storage: StorageInterface,
      ):
          """Run the generic discovery filter against a storage-backed archive.

          The given ``contents``, ``skipped_contents`` and ``directories`` are
          wrapped, together with ``swh_storage``, in a
          :class:`DiscoveryStorageConnection`; the result of
          ``discovery.filter_known_objects`` for that connection is returned
          as-is. The intent is to keep only the objects that are unknown to the
          SWH archive.
          """
          return await discovery.filter_known_objects(
              DiscoveryStorageConnection(
                  contents=contents,
                  skipped_contents=skipped_contents,
                  directories=directories,
                  swh_storage=swh_storage,
              )
          )
      
    • Please register or sign in to reply
  • assigned to @anlambert

  • Jenkins job DSTO/gitlab-builds #398 failed.
    See Console Output and Coverage Report for more details.

  • Please register or sign in to reply
    Loading