diff --git a/PKG-INFO b/PKG-INFO index ab3693a0f7f8462c543d61f60ae398f5aa6c9d47..4ccc93d68316939da47eb4693a0db3b46e47628b 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 1.3.4 +Version: 1.4.0 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers @@ -62,6 +62,8 @@ Requires-Dist: swh.scheduler>=0.7.0; extra == "server" Requires-Dist: swh.model>=0.3.8; extra == "server" Requires-Dist: swh.auth[django]>=0.5.3; extra == "server" Requires-Dist: swh.storage>=0.28.0; extra == "server" +Provides-Extra: azure +Requires-Dist: django-storages[azure]; extra == "azure" Software Heritage - Deposit =========================== diff --git a/requirements-azure.txt b/requirements-azure.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ac444ac0af84a5bb098d653ed886dbf6a870f8f --- /dev/null +++ b/requirements-azure.txt @@ -0,0 +1 @@ +django-storages[azure] diff --git a/setup.py b/setup.py index 78c2d46f8ac758e13b6e98e4aa33ded3b1680e78..68d33153c732def5dff61c7fd8799755ffc0967f 100755 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ setup( extras_require={ "testing": parse_requirements("test", "server", "swh-server"), "server": parse_requirements("server", "swh-server"), + "azure": parse_requirements("azure"), }, include_package_data=True, entry_points=""" diff --git a/swh.deposit.egg-info/PKG-INFO b/swh.deposit.egg-info/PKG-INFO index ab3693a0f7f8462c543d61f60ae398f5aa6c9d47..4ccc93d68316939da47eb4693a0db3b46e47628b 100644 --- a/swh.deposit.egg-info/PKG-INFO +++ b/swh.deposit.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 1.3.4 +Version: 1.4.0 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers @@ -62,6 +62,8 @@ Requires-Dist: swh.scheduler>=0.7.0; extra == "server" Requires-Dist: 
swh.model>=0.3.8; extra == "server" Requires-Dist: swh.auth[django]>=0.5.3; extra == "server" Requires-Dist: swh.storage>=0.28.0; extra == "server" +Provides-Extra: azure +Requires-Dist: django-storages[azure]; extra == "azure" Software Heritage - Deposit =========================== diff --git a/swh.deposit.egg-info/SOURCES.txt b/swh.deposit.egg-info/SOURCES.txt index 680aededf4987f8e26444e9f30788139f326158c..65193d746beeb59e81eaea64dff60324d4c882a7 100644 --- a/swh.deposit.egg-info/SOURCES.txt +++ b/swh.deposit.egg-info/SOURCES.txt @@ -13,6 +13,7 @@ conftest.py mypy.ini pyproject.toml pytest.ini +requirements-azure.txt requirements-server.txt requirements-swh-server.txt requirements-swh.txt diff --git a/swh.deposit.egg-info/requires.txt b/swh.deposit.egg-info/requires.txt index 3ae7378d64f89c3c817d26249b98ebc097aa7a3d..1604e3f111c6357df34f3cfaf12cb3fb204ed98f 100644 --- a/swh.deposit.egg-info/requires.txt +++ b/swh.deposit.egg-info/requires.txt @@ -5,6 +5,9 @@ sentry-sdk swh.core[http]>=0.4 swh.model>=4.4.0 +[azure] +django-storages[azure] + [server] django djangorestframework diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py index a8807908be897886d35b3e38a963dc8a4b1bce0e..3b9c127146ba54a5d637976ec58580a48e1c4923 100644 --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -114,20 +114,29 @@ class APIChecks(APIPrivateView, APIGet, DepositReadMixin): <detail-error>) otherwise. 
""" - archive_path = archive_request.archive.path + archive = archive_request.archive + archive_name = archive.name - if not known_archive_format(archive_path): + if not known_archive_format(archive_name): return False, MANDATORY_ARCHIVE_UNSUPPORTED try: - if zipfile.is_zipfile(archive_path): - with zipfile.ZipFile(archive_path) as zipfile_: - files = zipfile_.namelist() - elif tarfile.is_tarfile(archive_path): - with tarfile.open(archive_path) as tarfile_: - files = tarfile_.getnames() - else: - return False, MANDATORY_ARCHIVE_UNSUPPORTED + # Use python's File api which is consistent across different types of + # storage backends (e.g. file, azure, ...) + + with archive.open("rb") as archive_fp: + try: + with zipfile.ZipFile(archive_fp) as zip_fp: + files = zip_fp.namelist() + except Exception: + try: + # rewind since the first tryout reading may have moved the + # cursor + archive_fp.seek(0) + with tarfile.open(fileobj=archive_fp) as tar_fp: + files = tar_fp.getnames() + except Exception: + return False, MANDATORY_ARCHIVE_UNSUPPORTED except Exception: return False, MANDATORY_ARCHIVE_UNREADABLE if len(files) > 1: diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index fb88d9a44a93fa72c7046e241cde478e8489d6bb..94ecb201a5eec0a7bad5857edd966150a7ec8f6b 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,13 +1,14 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from contextlib import contextmanager import os +from pathlib import Path import shutil import tempfile -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple from xml.etree import ElementTree from 
rest_framework import status @@ -25,13 +26,13 @@ from ..common import APIGet @contextmanager -def aggregate_tarballs(extraction_dir, archive_paths): +def aggregate_tarballs(extraction_dir: str, archives: List) -> Iterator[str]: """Aggregate multiple tarballs into one and returns this new archive's path. Args: - extraction_dir (path): Path to use for the tarballs computation - archive_paths ([str]): Deposit's archive paths + extraction_dir: Path to use for the tarballs computation + archives: Deposit's archive field storage objects Returns: Tuple (directory to clean up, archive path (aggregated or not)) @@ -43,10 +44,26 @@ def aggregate_tarballs(extraction_dir, archive_paths): # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") - os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) + download_tarball_rootdir = os.path.join(dir_path, "download") + + # uncompress in a temporary location all client's deposit archives + for archive in archives: + with archive.open("rb") as archive_fp: + try: + # For storage which supports the path method access, let's retrieve it + archive_path = archive.path + except NotImplementedError: + # otherwise for remote backend which do not support it, let's download + # the tarball locally first + tarball_path = Path(archive.name) + + tarball_path_dir = Path(download_tarball_rootdir) / tarball_path.parent + tarball_path_dir.mkdir(0o755, parents=True, exist_ok=True) + + archive_path = str(tarball_path_dir / tarball_path.name) + with open(archive_path, "wb") as f: + f.write(archive_fp.read()) - # uncompress in a temporary location all archives - for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones @@ -90,13 +107,13 @@ class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin): Tuple status, stream of content, content-type """ - archive_paths = [ - r.archive.path + archives = [ + 
r.archive for r in self._deposit_requests(deposit, request_type=ARCHIVE_TYPE) ] return ( status.HTTP_200_OK, - aggregate_tarballs(self.extraction_dir, archive_paths), + aggregate_tarballs(self.extraction_dir, archives), "swh/generator", ) diff --git a/swh/deposit/settings/production.py b/swh/deposit/settings/production.py index 3f125a41c54c190c1c7ae39de252e7750298dc9f..81e6b7055b3d541acc18471e8d649a233a50dffa 100644 --- a/swh/deposit/settings/production.py +++ b/swh/deposit/settings/production.py @@ -30,23 +30,22 @@ DEBUG = False config_file = os.environ.get("SWH_CONFIG_FILENAME") if not config_file: raise ValueError( - "Production: SWH_CONFIG_FILENAME must be set to the" - " configuration file needed!" + "Production: SWH_CONFIG_FILENAME must be set to the configuration file needed!" ) if not os.path.exists(config_file): raise ValueError( - "Production: configuration file %s does not exist!" % (config_file,) + f"Production: configuration file {config_file} does not exist!", ) conf = config.load_named_config(config_file) if not conf: - raise ValueError("Production: configuration %s does not exist." % (config_file,)) + raise ValueError(f"Production: configuration {config_file} does not exist.") for key in ("scheduler", "private", "authentication_provider"): if not conf.get(key): raise ValueError( - "Production: invalid configuration; missing %s config entry." % (key,) + f"Production: invalid configuration; missing {key} config entry." ) ALLOWED_HOSTS += conf.get("allowed_hosts", []) @@ -113,3 +112,26 @@ if authentication == "keycloak": } } ) + +# Optional azure backend to use +cfg_azure = conf.get("azure", {}) +if cfg_azure: + # Those 2 keys are mandatory + for key in ("container_name", "connection_string"): + if not cfg_azure.get(key): + raise ValueError( + f"Production: invalid configuration; missing {key} config entry." 
+ ) + + STORAGES = { + "staticfiles": { + "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage" + }, + "default": { + "BACKEND": "storages.backends.azure_storage.AzureStorage", + "OPTIONS": { + "azure_container": cfg_azure["container_name"], + "connection_string": cfg_azure["connection_string"], + }, + }, + } diff --git a/swh/deposit/tests/api/test_collection_post_binary.py b/swh/deposit/tests/api/test_collection_post_binary.py index 26e7f5a40d50962212f2c1a0b78b0e792284db82..d13b136b1e1503bea8cc4429b81d25acfa729fab 100644 --- a/swh/deposit/tests/api/test_collection_post_binary.py +++ b/swh/deposit/tests/api/test_collection_post_binary.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -188,8 +188,7 @@ def test_post_deposit_binary_upload_no_content_disposition_header( authenticated_client, url, sample_archive, - HTTP_SLUG=external_id, - HTTP_IN_PROGRESS="false", + in_progress=False, HTTP_CONTENT_DISPOSITION=None, ) diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py index 6f7acc7649586658c75206a3c0262123cebfc8f4..084e32e1976c8235c8c9525f059b8bc6a430be4d 100644 --- a/swh/deposit/tests/api/test_deposit_private_check.py +++ b/swh/deposit/tests/api/test_deposit_private_check.py @@ -16,9 +16,11 @@ from swh.deposit.api.private.deposit_check import ( from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, + DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, PRIVATE_CHECK_DEPOSIT, + SE_IRI, ) from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml @@ -28,6 +30,8 @@ from swh.deposit.tests.common import ( ) from swh.deposit.utils import NAMESPACES +from ..common 
import post_archive, post_atom + PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + "-nc" @@ -41,10 +45,13 @@ def private_check_url_endpoints(collection, deposit): @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def test_deposit_ok( - authenticated_client, deposit_collection, ready_deposit_ok, extension + tmp_path, authenticated_client, deposit_collection, extension, atom_dataset ): """Proper deposit should succeed the checks (-> status ready)""" - deposit = ready_deposit_ok + deposit = create_deposit_with_archive( + tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset + ) + for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) @@ -68,11 +75,11 @@ def test_deposit_ok( @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def test_deposit_invalid_tarball( - tmp_path, authenticated_client, deposit_collection, extension + tmp_path, authenticated_client, deposit_collection, extension, atom_dataset ): """Deposit with tarball (of 1 tarball) should fail the checks: rejected""" deposit = create_deposit_archive_with_archive( - tmp_path, extension, authenticated_client, deposit_collection.name + tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset ) for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) @@ -167,31 +174,16 @@ def test_check_deposit_metadata_ok( deposit.save() -def create_deposit_archive_with_archive( - root_path, archive_extension, client, collection_name -): - # we create the holding archive to a given extension - archive = create_arborescence_archive( - root_path, - "archive1", - "file1", - b"some content in file", - extension=archive_extension, - ) - - # now we create an archive holding the first created archive - invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) - +def create_deposit(archive, 
client, collection_name, atom_dataset): + """Create a deposit with archive (and metadata) for client in the collection name.""" # we deposit it - response = client.post( + response = post_archive( + client, reverse(COL_IRI, args=[collection_name]), + archive, content_type="application/x-tar", - data=invalid_archive["data"], - CONTENT_LENGTH=invalid_archive["length"], - HTTP_MD5SUM=invalid_archive["md5sum"], - HTTP_SLUG="external-id", - HTTP_IN_PROGRESS=False, - HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (invalid_archive["name"],), + slug="external-id", + in_progress=True, ) # then @@ -200,9 +192,60 @@ def create_deposit_archive_with_archive( deposit_status = response_content.findtext( "swh:deposit_status", namespaces=NAMESPACES ) - assert deposit_status == DEPOSIT_STATUS_DEPOSITED + assert deposit_status == DEPOSIT_STATUS_PARTIAL deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) + origin_url = client.deposit_client.provider_url + response = post_atom( + client, + reverse(SE_IRI, args=[collection_name, deposit_id]), + data=atom_dataset["entry-data0"] % origin_url, + in_progress=False, + ) + + assert response.status_code == status.HTTP_201_CREATED + response_content = parse_xml(response.content) + deposit_status = response_content.findtext( + "swh:deposit_status", namespaces=NAMESPACES + ) + assert deposit_status == DEPOSIT_STATUS_DEPOSITED + deposit = Deposit.objects.get(pk=deposit_id) assert DEPOSIT_STATUS_DEPOSITED == deposit.status return deposit + + +def create_deposit_with_archive( + root_path, archive_extension, client, collection_name, atom_dataset +): + """Create a deposit with a valid archive.""" + # we create the holding archive to a given extension + archive = create_arborescence_archive( + root_path, + "archive1", + "file1", + b"some content in file", + extension=archive_extension, + ) + + return create_deposit(archive, client, collection_name, atom_dataset) + + +def create_deposit_archive_with_archive( + 
root_path, archive_extension, client, collection_name, atom_dataset +): + """Create a deposit with an invalid archive (archive within archive)""" + + # we create the holding archive to a given extension + archive = create_arborescence_archive( + root_path, + "archive1", + "file1", + b"some content in file", + extension=archive_extension, + ) + + # now we create an archive holding the first created archive + invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) + + return create_deposit(invalid_archive, client, collection_name, atom_dataset) diff --git a/swh/deposit/tests/api/test_deposit_private_read_archive.py b/swh/deposit/tests/api/test_deposit_private_read_archive.py index 85eeb5bb5b08d0197bd7228b06d4ab13406e357b..f2e3e07d03c6c9c17d40a1c517e3ab5d0557feef 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_archive.py +++ b/swh/deposit/tests/api/test_deposit_private_read_archive.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,8 +11,11 @@ from rest_framework import status from swh.deposit.api.private.deposit_read import aggregate_tarballs from swh.deposit.config import EM_IRI, PRIVATE_GET_RAW_CONTENT +from swh.deposit.models import DepositRequest from swh.deposit.tests.common import create_arborescence_archive +from ..common import compute_info, post_archive + PRIVATE_GET_RAW_CONTENT_NC = PRIVATE_GET_RAW_CONTENT + "-nc" @@ -95,8 +98,74 @@ def test_access_to_existing_deposit_with_multiple_archives( assert tfile.extractfile("./file2").read() == b"some other content in file" -def test_aggregate_tarballs_with_strange_archive(datadir, tmp_path): - archive = join(datadir, "archives", "single-artifact-package.tar.gz") +def 
test_aggregate_tarballs_with_strange_archive( + datadir, authenticated_client, tmp_path, partial_deposit +): + deposit = partial_deposit + update_uri = reverse(EM_IRI, args=[deposit.collection.name, deposit.id]) + archive = compute_info(join(datadir, "archives", "single-artifact-package.tar.gz")) + + response = post_archive( + authenticated_client, + update_uri, + archive, + content_type="application/x-tar", + in_progress=False, + ) + + # then + assert response.status_code == status.HTTP_201_CREATED + + deposit_requests = DepositRequest.objects.filter(type="archive", deposit=deposit) + + archives = [dr.archive for dr in deposit_requests] + + with aggregate_tarballs(tmp_path, archives) as tarball_path: + assert exists(tarball_path) + + +def test_aggregate_tarballs_with_remote_tarball( + datadir, authenticated_client, tmp_path, partial_deposit +): + """Tarballs can be stored remotely and still be accessible for reading.""" + deposit = partial_deposit + update_uri = reverse(EM_IRI, args=[deposit.collection.name, deposit.id]) + archive = compute_info(join(datadir, "archives", "single-artifact-package.tar.gz")) + + response = post_archive( + authenticated_client, + update_uri, + archive, + content_type="application/x-tar", + in_progress=False, + ) + + # then + assert response.status_code == status.HTTP_201_CREATED + + deposit_requests = DepositRequest.objects.filter(type="archive", deposit=deposit) + + all_archives = [dr.archive for dr in deposit_requests] + + archives = [a for a in all_archives if not a.name.endswith(archive["name"])] + + # We'll patch the behavior of this archive to be read as if it were a remote one + interesting_archive = [a for a in all_archives if a.name.endswith(archive["name"])][ + 0 + ] + + class RemoteTarball: + """Basic remote tarball implementation for test purposes.""" + + name = archive["name"] + + @property + def path(self): + raise NotImplementedError("We fake the path implementation") + + open = interesting_archive.open + + 
archives.append(RemoteTarball()) - with aggregate_tarballs(tmp_path, [archive]) as tarball_path: + with aggregate_tarballs(tmp_path, archives) as tarball_path: assert exists(tarball_path) diff --git a/swh/deposit/tests/api/test_deposit_update_binary.py b/swh/deposit/tests/api/test_deposit_update_binary.py index e1c7a73fc743743cba6cedb9f8c116fd92efd93e..9371f049f37006318bc68dca87b49f51a8da3b7e 100644 --- a/swh/deposit/tests/api/test_deposit_update_binary.py +++ b/swh/deposit/tests/api/test_deposit_update_binary.py @@ -74,7 +74,8 @@ def test_post_deposit_binary_and_post_to_add_another_archive( authenticated_client, update_uri, archive2, - HTTP_SLUG=external_id, + slug=external_id, + in_progress=False, ) assert response.status_code == status.HTTP_201_CREATED diff --git a/swh/deposit/tests/common.py b/swh/deposit/tests/common.py index 2651f1584dfbe117e1738b36de80ae502c96bdb0..2fbf878298442300b832867f57fd319eafb2430d 100644 --- a/swh/deposit/tests/common.py +++ b/swh/deposit/tests/common.py @@ -145,11 +145,14 @@ def check_archive(archive_name: str, archive_name_to_check: str): def _post_or_put_archive(f, url, archive, slug=None, in_progress=None, **kwargs): default_kwargs = dict( - content_type="application/zip", - CONTENT_LENGTH=archive["length"], - HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive["name"],), + content_type=kwargs.get("content_type") or "application/zip", HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", + HTTP_IN_PROGRESS=in_progress if in_progress is not None else True, + HTTP_CONTENT_DISPOSITION=f"attachment; filename={archive['name']}", + CONTENT_LENGTH=archive["length"], ) + if slug: + default_kwargs.update(dict(HTTP_SLUG=slug)) kwargs = {**default_kwargs, **kwargs} return f( url,