From 737570ced6fd4c3f4408022f4cad99d2477ed395 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Wed, 24 Jan 2024 11:33:37 +0100 Subject: [PATCH] to_disk: Ensure all file hashes are used when fetching its bytes Previously only a couple of them were cherry-picked. Also add objstorage typing and use swh.objstorage.interface.objid_from_dict to remove some explicit type casting. --- swh/vault/cookers/base.py | 7 ++++--- swh/vault/cookers/git_bare.py | 9 +++++---- swh/vault/to_disk.py | 9 +++------ 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py index 2c974e8..a0cd107 100644 --- a/swh/vault/cookers/base.py +++ b/swh/vault/cookers/base.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2018 The Software Heritage developers +# Copyright (C) 2016-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,13 +7,14 @@ import abc import io import logging import traceback -from typing import ClassVar, Set +from typing import ClassVar, Optional, Set from psycopg2.extensions import QueryCanceledError import sentry_sdk import swh.model.swhids from swh.model.swhids import CoreSWHID, ObjectType +from swh.objstorage.interface import ObjStorageInterface from swh.storage.interface import StorageInterface MAX_BUNDLE_SIZE = 2**29 # 512 MiB @@ -71,7 +72,7 @@ class BaseVaultCooker(metaclass=abc.ABCMeta): backend, storage: StorageInterface, graph=None, - objstorage=None, + objstorage: Optional[ObjStorageInterface] = None, max_bundle_size: int = MAX_BUNDLE_SIZE, thread_pool_size: int = 10, ): diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py index ba7df19..8b6fcc9 100644 --- a/swh/vault/cookers/git_bare.py +++ b/swh/vault/cookers/git_bare.py @@ -39,7 +39,7 @@ import re import subprocess import tarfile import tempfile -from typing import Any, Dict, Iterable, List, NoReturn, Optional, Set, cast +from typing import Any, Dict, Iterable, List, NoReturn, Optional, Set import zlib import sentry_sdk @@ -61,9 +61,9 @@ from swh.model.model import ( from swh.model.model import Directory, DirectoryEntry from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ObjectType +from swh.objstorage.interface import objid_from_dict from swh.storage.algos.revisions_walker import DFSRevisionsWalker from swh.storage.algos.snapshot import snapshot_get_all_branches -from swh.storage.interface import HashDict from swh.vault.cookers.base import BaseVaultCooker from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE @@ -687,10 +687,11 @@ class GitBareCooker(BaseVaultCooker): self.write_content(obj_id, SKIPPED_MESSAGE) self._expect_mismatched_object_error(obj_id) elif content.status == "visible": + hashes = objid_from_dict(content.hashes()) if self.objstorage is None: - datum = self.storage.content_get_data(cast(HashDict, content.hashes())) + datum = self.storage.content_get_data(hashes) else: - datum = self.objstorage.get(content.hashes()) + datum = self.objstorage.get(hashes) if datum is None: logger.error( diff --git a/swh/vault/to_disk.py b/swh/vault/to_disk.py index 6c2a698..6f671ee 100644 --- a/swh/vault/to_disk.py +++ b/swh/vault/to_disk.py @@ -10,8 +10,8 @@ from typing import Any, Dict, Optional from swh.model import hashutil from swh.model.from_disk import DentryPerms, mode_to_perms -from swh.objstorage.interface import ObjStorageInterface -from swh.storage.interface import HashDict, StorageInterface +from swh.objstorage.interface import ObjStorageInterface, objid_from_dict +from swh.storage.interface import StorageInterface MISSING_MESSAGE = ( b"This content is missing from the Software Heritage archive " @@ -49,10 +49,7 @@ def get_filtered_file_content( """ status = file_data["status"] if status == "visible": - hashes: HashDict = { - "sha1": file_data["sha1"], - "sha1_git": file_data["sha1_git"], - } + hashes = objid_from_dict(file_data) data: Optional[bytes] if objstorage is not None: data = objstorage.get(hashes) -- GitLab