Skip to content
Snippets Groups Projects
Verified Commit fc5e843e authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

private/deposit_read: Adapt reading tarballs using the File api

Similar adaptation of the deposit_check endpoint to allow reading remote tarballs when
using remote storage backends.

Refs. swh/infra/sysadm-environment#5129
parent 556273ca
No related branches found
No related tags found
No related merge requests found
# Copyright (C) 2017-2022 The Software Heritage developers
# Copyright (C) 2017-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from contextlib import contextmanager
import os
from pathlib import Path
import shutil
import tempfile
from typing import Any, Dict, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple
from xml.etree import ElementTree
from rest_framework import status
......@@ -25,13 +26,13 @@ from ..common import APIGet
@contextmanager
def aggregate_tarballs(extraction_dir, archive_paths):
def aggregate_tarballs(extraction_dir: str, archives: List) -> Iterator[str]:
"""Aggregate multiple tarballs into one and returns this new archive's
path.
Args:
extraction_dir (path): Path to use for the tarballs computation
archive_paths ([str]): Deposit's archive paths
extraction_dir: Path to use for the tarballs computation
archive_paths: Deposit's archive paths
Returns:
Tuple (directory to clean up, archive path (aggregated or not))
......@@ -43,10 +44,26 @@ def aggregate_tarballs(extraction_dir, archive_paths):
# root folder to build an aggregated tarball
aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate")
os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)
download_tarball_rootdir = os.path.join(dir_path, "download")
# uncompress in a temporary location all client's deposit archives
for archive in archives:
with archive.open("rb") as archive_fp:
try:
# For storage which supports the path method access, let's retrieve it
archive_path = archive.path
except NotImplementedError:
# otherwise for remote backend which do not support it, let's download
# the tarball locally first
tarball_path = Path(archive.name)
tarball_path_dir = Path(download_tarball_rootdir) / tarball_path.parent
tarball_path_dir.mkdir(0o755, parents=True, exist_ok=True)
archive_path = str(tarball_path_dir / tarball_path.name)
with open(archive_path, "wb") as f:
f.write(archive_fp.read())
# uncompress in a temporary location all archives
for archive_path in archive_paths:
tarball.uncompress(archive_path, aggregated_tarball_rootdir)
# Aggregate into one big tarball the multiple smaller ones
......@@ -90,13 +107,13 @@ class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin):
Tuple status, stream of content, content-type
"""
archive_paths = [
r.archive.path
archives = [
r.archive
for r in self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)
]
return (
status.HTTP_200_OK,
aggregate_tarballs(self.extraction_dir, archive_paths),
aggregate_tarballs(self.extraction_dir, archives),
"swh/generator",
)
......
# Copyright (C) 2017-2021 The Software Heritage developers
# Copyright (C) 2017-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -11,8 +11,11 @@ from rest_framework import status
from swh.deposit.api.private.deposit_read import aggregate_tarballs
from swh.deposit.config import EM_IRI, PRIVATE_GET_RAW_CONTENT
from swh.deposit.models import DepositRequest
from swh.deposit.tests.common import create_arborescence_archive
from ..common import compute_info, post_archive
PRIVATE_GET_RAW_CONTENT_NC = PRIVATE_GET_RAW_CONTENT + "-nc"
......@@ -95,8 +98,74 @@ def test_access_to_existing_deposit_with_multiple_archives(
assert tfile.extractfile("./file2").read() == b"some other content in file"
def test_aggregate_tarballs_with_strange_archive(datadir, tmp_path):
archive = join(datadir, "archives", "single-artifact-package.tar.gz")
def test_aggregate_tarballs_with_strange_archive(
datadir, authenticated_client, tmp_path, partial_deposit
):
deposit = partial_deposit
update_uri = reverse(EM_IRI, args=[deposit.collection.name, deposit.id])
archive = compute_info(join(datadir, "archives", "single-artifact-package.tar.gz"))
response = post_archive(
authenticated_client,
update_uri,
archive,
content_type="application/x-tar",
in_progress=False,
)
# then
assert response.status_code == status.HTTP_201_CREATED
deposit_requests = DepositRequest.objects.filter(type="archive", deposit=deposit)
archives = [dr.archive for dr in deposit_requests]
with aggregate_tarballs(tmp_path, archives) as tarball_path:
assert exists(tarball_path)
def test_aggregate_tarballs_with_remote_tarball(
datadir, authenticated_client, tmp_path, partial_deposit
):
"""Tarballs can be stored remotely and still be accessible for reading."""
deposit = partial_deposit
update_uri = reverse(EM_IRI, args=[deposit.collection.name, deposit.id])
archive = compute_info(join(datadir, "archives", "single-artifact-package.tar.gz"))
response = post_archive(
authenticated_client,
update_uri,
archive,
content_type="application/x-tar",
in_progress=False,
)
# then
assert response.status_code == status.HTTP_201_CREATED
deposit_requests = DepositRequest.objects.filter(type="archive", deposit=deposit)
all_archives = [dr.archive for dr in deposit_requests]
archives = [a for a in all_archives if not a.name.endswith(archive["name"])]
# We'll patch the behavior of this archive to be read as if it were a remote one
interesting_archive = [a for a in all_archives if a.name.endswith(archive["name"])][
0
]
class RemoteTarball:
"""Basic remote tarball implementation for test purposes."""
name = archive["name"]
@property
def path(self):
raise NotImplementedError("We fake the path implementation")
open = interesting_archive.open
archives.append(RemoteTarball())
with aggregate_tarballs(tmp_path, [archive]) as tarball_path:
with aggregate_tarballs(tmp_path, archives) as tarball_path:
assert exists(tarball_path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment