diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index fb88d9a44a93fa72c7046e241cde478e8489d6bb..94ecb201a5eec0a7bad5857edd966150a7ec8f6b 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,13 +1,14 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from contextlib import contextmanager import os +from pathlib import Path import shutil import tempfile -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple from xml.etree import ElementTree from rest_framework import status @@ -25,13 +26,13 @@ from ..common import APIGet @contextmanager -def aggregate_tarballs(extraction_dir, archive_paths): +def aggregate_tarballs(extraction_dir: str, archives: List) -> Iterator[str]: """Aggregate multiple tarballs into one and returns this new archive's path. Args: - extraction_dir (path): Path to use for the tarballs computation - archive_paths ([str]): Deposit's archive paths + extraction_dir: Path to use for the tarballs computation + archive_paths: Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) @@ -43,10 +44,26 @@ def aggregate_tarballs(extraction_dir, archive_paths): # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") - os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) + download_tarball_rootdir = os.path.join(dir_path, "download") + + # uncompress in a temporary location all client's deposit archives + for archive in archives: + with archive.open("rb") as archive_fp: + try: + # For storage which supports the path method access, let's retrieve it + archive_path = archive.path + except NotImplementedError: + # otherwise for remote backend which do not support it, let's download + # the tarball locally first + tarball_path = Path(archive.name) + + tarball_path_dir = Path(download_tarball_rootdir) / tarball_path.parent + tarball_path_dir.mkdir(0o755, parents=True, exist_ok=True) + + archive_path = str(tarball_path_dir / tarball_path.name) + with open(archive_path, "wb") as f: + f.write(archive_fp.read()) - # uncompress in a temporary location all archives - for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones @@ -90,13 +107,13 @@ class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin): Tuple status, stream of content, content-type """ - archive_paths = [ - r.archive.path + archives = [ + r.archive for r in self._deposit_requests(deposit, request_type=ARCHIVE_TYPE) ] return ( status.HTTP_200_OK, - aggregate_tarballs(self.extraction_dir, archive_paths), + aggregate_tarballs(self.extraction_dir, archives), "swh/generator", ) diff --git a/swh/deposit/tests/api/test_deposit_private_read_archive.py b/swh/deposit/tests/api/test_deposit_private_read_archive.py index 85eeb5bb5b08d0197bd7228b06d4ab13406e357b..f2e3e07d03c6c9c17d40a1c517e3ab5d0557feef 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_archive.py +++ b/swh/deposit/tests/api/test_deposit_private_read_archive.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,8 +11,11 @@ from rest_framework import status from swh.deposit.api.private.deposit_read import aggregate_tarballs from swh.deposit.config import EM_IRI, PRIVATE_GET_RAW_CONTENT +from swh.deposit.models import DepositRequest from swh.deposit.tests.common import create_arborescence_archive +from ..common import compute_info, post_archive + PRIVATE_GET_RAW_CONTENT_NC = PRIVATE_GET_RAW_CONTENT + "-nc" @@ -95,8 +98,74 @@ def test_access_to_existing_deposit_with_multiple_archives( assert tfile.extractfile("./file2").read() == b"some other content in file" -def test_aggregate_tarballs_with_strange_archive(datadir, tmp_path): - archive = join(datadir, "archives", "single-artifact-package.tar.gz") +def test_aggregate_tarballs_with_strange_archive( + datadir, authenticated_client, tmp_path, partial_deposit +): + deposit = partial_deposit + update_uri = reverse(EM_IRI, args=[deposit.collection.name, deposit.id]) + archive = compute_info(join(datadir, "archives", "single-artifact-package.tar.gz")) + + response = post_archive( + authenticated_client, + update_uri, + archive, + content_type="application/x-tar", + in_progress=False, + ) + + # then + assert response.status_code == status.HTTP_201_CREATED + + deposit_requests = DepositRequest.objects.filter(type="archive", deposit=deposit) + + archives = [dr.archive for dr in deposit_requests] + + with aggregate_tarballs(tmp_path, archives) as tarball_path: + assert exists(tarball_path) + + +def test_aggregate_tarballs_with_remote_tarball( + datadir, authenticated_client, tmp_path, partial_deposit +): + """Tarballs can be stored remotely and still be accessible for reading.""" + deposit = partial_deposit + update_uri = reverse(EM_IRI, args=[deposit.collection.name, deposit.id]) + archive = compute_info(join(datadir, "archives", "single-artifact-package.tar.gz")) + + response = post_archive( + authenticated_client, + update_uri, + archive, + content_type="application/x-tar", + in_progress=False, + ) + + # then + assert response.status_code == status.HTTP_201_CREATED + + deposit_requests = DepositRequest.objects.filter(type="archive", deposit=deposit) + + all_archives = [dr.archive for dr in deposit_requests] + + archives = [a for a in all_archives if not a.name.endswith(archive["name"])] + + # We'll patch the behavior of this archive to be read as if it were a remote one + interesting_archive = [a for a in all_archives if a.name.endswith(archive["name"])][ + 0 + ] + + class RemoteTarball: + """Basic remote tarball implementation for test purposes.""" + + name = archive["name"] + + @property + def path(self): + raise NotImplementedError("We fake the path implementation") + + open = interesting_archive.open + + archives.append(RemoteTarball()) - with aggregate_tarballs(tmp_path, [archive]) as tarball_path: + with aggregate_tarballs(tmp_path, archives) as tarball_path: assert exists(tarball_path)