From 6141f333b9f124db094beee1825ad6e500a1d48d Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Sat, 1 Feb 2025 17:39:32 +0100
Subject: [PATCH 1/2] api/private: Add endpoint to get download links of
 uploaded tarballs

Add a new private Web API endpoint to get a list of URLs for downloading
the tarballs uploaded with a deposit.

In development or docker mode, the tarballs are stored in the local
filesystem and served by django. In production mode, the tarballs are
stored in an azure blob storage and temporary download links with a
shared access signature are generated when the endpoint is requested.

This makes it possible to move the costly operations related to
downloading and processing tarballs to celery workers instead of having
the deposit server perform those tasks.

Related to #4657 and #4658.
---
 .../api/private/deposit_upload_urls.py        |  59 ++++++
 swh/deposit/api/private/urls.py               |  11 +-
 swh/deposit/config.py                         |   3 +-
 swh/deposit/settings/common.py                |   4 +-
 swh/deposit/settings/production.py            |   4 +-
 .../api/test_deposit_private_upload_urls.py   | 190 ++++++++++++++++++
 swh/deposit/urls.py                           |  15 +-
 7 files changed, 281 insertions(+), 5 deletions(-)
 create mode 100644 swh/deposit/api/private/deposit_upload_urls.py
 create mode 100644 swh/deposit/tests/api/test_deposit_private_upload_urls.py

diff --git a/swh/deposit/api/private/deposit_upload_urls.py b/swh/deposit/api/private/deposit_upload_urls.py
new file mode 100644
index 00000000..3fb744e1
--- /dev/null
+++ b/swh/deposit/api/private/deposit_upload_urls.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2025 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import List, Tuple
+
+from django.db.models import FileField
+from django.http.request import HttpRequest
+from rest_framework import status
+
+from swh.deposit.api.common import APIGet
+from swh.deposit.api.private import APIPrivateView, DepositReadMixin
+from swh.deposit.config import ARCHIVE_TYPE
+from swh.deposit.models import Deposit
+
+
+class APIUploadURLs(APIPrivateView, APIGet, DepositReadMixin):
+    """
+    Private API endpoint returning a list of URLs for downloading
+    tarballs uploaded with a deposit request.
+
+    Only GET is supported.
+
+    """
+
+    @classmethod
+    def _get_archive_url(cls, archive: FileField, request: HttpRequest) -> str:
+        url = archive.storage.url(archive.name)
+        if url.startswith("/"):
+            url = request.build_absolute_uri(url)
+        return url
+
+    def process_get(
+        self, request: HttpRequest, collection_name: str, deposit: Deposit
+    ) -> Tuple[int, List[str], str]:
+        """
+        Returns list of URLs for downloading tarballs uploaded with
+        a deposit request.
+ + Args: + request: input HTTP request + collection_name: Collection owning the deposit + deposit: Deposit to get tarball download URLs + + Returns: + Tuple status, list of URLs, content-type + + """ + upload_urls = [ + self._get_archive_url(r.archive, request) + # ensure that tarball URLs are sorted in ascending order of their upload + # dates as tarball contents will be aggregated into a single tarball by the + # deposit loader and the files they contain can overlap + for r in reversed( + list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) + ) + ] + return status.HTTP_200_OK, upload_urls, "application/json" diff --git a/swh/deposit/api/private/urls.py b/swh/deposit/api/private/urls.py index d0af733a..01ae2d92 100644 --- a/swh/deposit/api/private/urls.py +++ b/swh/deposit/api/private/urls.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2023 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,11 +11,13 @@ from swh.deposit.api.private.deposit_list import APIList, deposit_list_datatable from swh.deposit.api.private.deposit_read import APIReadArchives, APIReadMetadata from swh.deposit.api.private.deposit_releases import APIReleases from swh.deposit.api.private.deposit_update_status import APIUpdateStatus +from swh.deposit.api.private.deposit_upload_urls import APIUploadURLs from swh.deposit.config import ( PRIVATE_CHECK_DEPOSIT, PRIVATE_GET_DEPOSIT_METADATA, PRIVATE_GET_RAW_CONTENT, PRIVATE_GET_RELEASES, + PRIVATE_GET_UPLOAD_URLS, PRIVATE_LIST_DEPOSITS, PRIVATE_LIST_DEPOSITS_DATATABLES, PRIVATE_PUT_DEPOSIT, @@ -91,4 +93,11 @@ urlpatterns = [ APIReleases.as_view(), name=PRIVATE_GET_RELEASES, ), + # Retrieve download URLs for the tarballs uploaded with a deposit + # -> GET + path( + "<int:deposit_id>/upload-urls/", + APIUploadURLs.as_view(), + name=PRIVATE_GET_UPLOAD_URLS, + ), ] diff --git a/swh/deposit/config.py b/swh/deposit/config.py index f3d51ad8..68aab223 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -29,6 +29,7 @@ PRIVATE_GET_DEPOSIT_METADATA = "private-read" PRIVATE_LIST_DEPOSITS = "private-deposit-list" PRIVATE_LIST_DEPOSITS_DATATABLES = "private-deposit-list-datatables" PRIVATE_GET_RELEASES = "private-releases" +PRIVATE_GET_UPLOAD_URLS = "private-upload-urls" ARCHIVE_KEY = "archive" RAW_METADATA_KEY = "raw-metadata" diff --git a/swh/deposit/settings/common.py b/swh/deposit/settings/common.py index 5a85b99d..e2441988 100644 --- a/swh/deposit/settings/common.py +++ b/swh/deposit/settings/common.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -122,3 +122,5 @@ CACHES = { "BACKEND": "django.core.cache.backends.locmem.LocMemCache", } } + +MEDIA_URL = "uploads/" diff --git a/swh/deposit/settings/production.py 
b/swh/deposit/settings/production.py index 5008f85a..e9f549f4 100644 --- a/swh/deposit/settings/production.py +++ b/swh/deposit/settings/production.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2023 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -127,6 +127,8 @@ if cfg_azure: azure_container=cfg_azure["container_name"], connection_string=cfg_azure["connection_string"], timeout=cfg_azure.get("connection_timeout", 120), + # ensure to generate temporary download links with shared access signature + expiration_secs=cfg_azure.get("expiration_secs", 1800), ) # Which may be enhanced with some extra options, lookup "object_parameters" in diff --git a/swh/deposit/tests/api/test_deposit_private_upload_urls.py b/swh/deposit/tests/api/test_deposit_private_upload_urls.py new file mode 100644 index 00000000..4b8819fa --- /dev/null +++ b/swh/deposit/tests/api/test_deposit_private_upload_urls.py @@ -0,0 +1,190 @@ +# Copyright (C) 2025 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from hashlib import sha1 +import os +import secrets +import shutil +import subprocess + +from azure.storage.blob import BlobServiceClient +from django.urls import reverse_lazy as reverse +import pytest +import requests +from rest_framework import status + +from swh.deposit.config import DEPOSIT_STATUS_PARTIAL, EM_IRI, PRIVATE_GET_UPLOAD_URLS +from swh.deposit.tests.common import create_arborescence_archive +from swh.deposit.tests.conftest import create_deposit + +AZURITE_EXE = shutil.which( + "azurite-blob", path=os.environ.get("AZURITE_PATH", os.environ.get("PATH")) +) + + +@pytest.fixture(scope="module") +def azure_container_name(): + return secrets.token_hex(10) + + +@pytest.fixture(scope="module") +def azurite_connection_string(tmpdir_factory): + host = "127.0.0.1" + + azurite_path = tmpdir_factory.mktemp("azurite") + + azurite_proc = subprocess.Popen( + [ + AZURITE_EXE, + "--blobHost", + host, + "--blobPort", + "0", + ], + stdout=subprocess.PIPE, + cwd=azurite_path, + ) + + prefix = b"Azurite Blob service successfully listens on " + for line in azurite_proc.stdout: + if line.startswith(prefix): + base_url = line[len(prefix) :].decode().strip() + break + else: + assert False, "Did not get Azurite Blob service port." 
+ + # https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite#well-known-storage-account-and-key + account_name = "devstoreaccount1" + account_key = ( + "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq" + "/K1SZFPTOtr/KBHBeksoGMGw==" + ) + + container_url = f"{base_url}/{account_name}" + + # unset proxy set by the swh_proxy session fixture + del os.environ["http_proxy"] + del os.environ["https_proxy"] + + yield ( + f"DefaultEndpointsProtocol=https;" + f"AccountName={account_name};" + f"AccountKey={account_key};" + f"BlobEndpoint={container_url};" + ) + + # reset proxy + os.environ["http_proxy"] = "http://localhost:999" + os.environ["https_proxy"] = "http://localhost:999" + + azurite_proc.kill() + azurite_proc.wait(2) + + +def check_deposit_upload_urls( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, +): + # Add a first archive to deposit + partial_deposit = create_deposit( + authenticated_client, + collection_name=deposit_collection.name, + sample_archive=sample_archive, + external_id="external-id", + deposit_status=DEPOSIT_STATUS_PARTIAL, + ) + + # Add a second archive to deposit + archive2 = create_arborescence_archive( + tmp_path, "archive2", "file2", b"some other content in file" + ) + update_uri = reverse(EM_IRI, args=[deposit_collection.name, partial_deposit.id]) + response = authenticated_client.post( + update_uri, + content_type="application/zip", # as zip + data=archive2["data"], + # + headers + CONTENT_LENGTH=archive2["length"], + HTTP_SLUG=partial_deposit.external_id, + HTTP_CONTENT_MD5=archive2["md5sum"], + HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", + HTTP_IN_PROGRESS="false", + HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), + ) + assert response.status_code == status.HTTP_201_CREATED + + # check uploaded tarballs can be downloaded using URLs and + # compare download contents with originals + url = reverse(PRIVATE_GET_UPLOAD_URLS, args=[partial_deposit.id]) + response = authenticated_client.get(url) + upload_urls = response.json() + assert len(upload_urls) == 2 + assert "archive1.zip" in upload_urls[0] + assert "archive2.zip" in upload_urls[1] + tarball_shasums = set() + for upload_url in upload_urls: + response = ( + # when storage backend is local filesystem and served by django + authenticated_client.get(upload_url) + if upload_url.startswith("http://testserver/") + # when storage backend is azurite + else requests.get(upload_url) + ) + assert response.status_code == status.HTTP_200_OK + tarball_shasums.add( + sha1( + b"".join(response.streaming_content) + if hasattr(response, "streaming_content") + else response.content + ).hexdigest() + ) + + assert tarball_shasums == {sample_archive["sha1sum"], archive2["sha1sum"]} + + +def test_deposit_upload_urls_local_filesystem_storage_backend( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, +): + check_deposit_upload_urls( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, + ) + + +@pytest.mark.skipif(not AZURITE_EXE, reason="azurite not found in AZURITE_PATH or PATH") +def test_deposit_upload_urls_azure_storage_backend( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, + settings, + azurite_connection_string, + azure_container_name, +): + blob_client = BlobServiceClient.from_connection_string(azurite_connection_string) + blob_client.create_container(azure_container_name) + settings.STORAGES = { + "default": { + "BACKEND": 
"storages.backends.azure_storage.AzureStorage", + "OPTIONS": { + "connection_string": azurite_connection_string, + "azure_container": azure_container_name, + "expiration_secs": 1800, + }, + }, + } + check_deposit_upload_urls( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, + ) diff --git a/swh/deposit/urls.py b/swh/deposit/urls.py index 6127d448..6860e80d 100644 --- a/swh/deposit/urls.py +++ b/swh/deposit/urls.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2023 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,10 +8,12 @@ from __future__ import annotations from typing import Sequence, Union +from django.conf import settings from django.conf.urls import include from django.shortcuts import render from django.urls import re_path as url from django.views.generic.base import RedirectView +from django.views.static import serve from rest_framework.urlpatterns import format_suffix_patterns try: @@ -41,3 +43,14 @@ urlpatterns = format_suffix_patterns( url(r"^$", default_view, name="home"), ] ) + +if "AzureStorage" not in settings.STORAGES["default"]["BACKEND"]: + # to serve uploaded tarballs when no azure storage backend is configured, + # typically in docker or with development/test settings + urlpatterns.append( + url( + rf"^{settings.MEDIA_URL.rstrip('/').split('/')[-1]}/(?P<path>.*)$", + serve, + {"document_root": settings.MEDIA_ROOT}, + ) + ) -- GitLab From 46e2c7b6b8c5c08976e51bad3d7282fab7e06f68 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Fri, 7 Feb 2025 17:35:06 +0100 Subject: [PATCH 2/2] checker: Remove private API endpoint and do checks on celery worker Checking deposit archives can be a costly operation as the checker must download the archives to list their content. It has been observed in production that if a large archive has been uploaded with a deposit, requesting the check endpoint of the private deposit API can end up with gunicorn worker being killed as the time to download the archive exceeds the worker timeout. So instead of using the private API endpoint performs the checks, prefer to move these operations in the celery worker executing the check-deposit task. Fixes #4658. 
--- requirements-server.txt | 1 - requirements.txt | 1 + swh/deposit/api/common.py | 18 +- swh/deposit/api/private/deposit_check.py | 256 --------- .../api/private/deposit_update_status.py | 18 +- swh/deposit/api/private/urls.py | 16 - swh/deposit/config.py | 10 +- swh/deposit/loader/checker.py | 193 ++++++- swh/deposit/{api => loader}/checks.py | 15 - swh/deposit/tests/api/conftest.py | 69 +-- swh/deposit/tests/api/test_checks.py | 14 +- .../tests/api/test_deposit_private_check.py | 288 ---------- swh/deposit/tests/cli/test_client.py | 12 +- swh/deposit/tests/conftest.py | 57 +- swh/deposit/tests/data/atom/entry-data0.xml | 3 +- swh/deposit/tests/data/atom/entry-data1.xml | 3 +- swh/deposit/tests/loader/conftest.py | 14 +- swh/deposit/tests/loader/test_checker.py | 518 +++++++++++++++++- swh/deposit/tests/loader/test_tasks.py | 30 +- 19 files changed, 801 insertions(+), 735 deletions(-) delete mode 100644 swh/deposit/api/private/deposit_check.py rename swh/deposit/{api => loader}/checks.py (97%) delete mode 100644 swh/deposit/tests/api/test_deposit_private_check.py diff --git a/requirements-server.txt b/requirements-server.txt index 7aab2921..58f9d109 100644 --- a/requirements-server.txt +++ b/requirements-server.txt @@ -2,6 +2,5 @@ django djangorestframework psycopg2 setuptools -xmlschema pymemcache diff --git a/requirements.txt b/requirements.txt index 6d901d21..8cc0c788 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ click iso8601 requests sentry-sdk +xmlschema diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index 56ad43e9..9a46c46b 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2024 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -25,7 +25,6 @@ from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.request import Request from rest_framework.views import APIView -from swh.deposit.api.checks import check_metadata, check_url_match_provider from swh.deposit.api.converters import convert_status_detail from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication from swh.deposit.config import ( @@ -56,6 +55,7 @@ from swh.deposit.errors import ( DepositError, ParserError, ) +from swh.deposit.loader.checks import check_metadata from swh.deposit.models import ( DEPOSIT_METADATA_ONLY, Deposit, @@ -166,6 +166,20 @@ def guess_deposit_origin_url(deposit: Deposit): return "%s/%s" % (deposit.client.provider_url.rstrip("/"), external_id) +def check_url_match_provider(url: str, provider_url: str) -> None: + """Check url matches the provider url. 
+ + Raises DepositError in case of mismatch + + """ + provider_url = provider_url.rstrip("/") + "/" + if not url.startswith(provider_url): + raise DepositError( + FORBIDDEN, + f"URL mismatch: {url} must start with {provider_url}", + ) + + class APIBase(APIConfig, APIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors.""" diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py deleted file mode 100644 index a8dc63b2..00000000 --- a/swh/deposit/api/private/deposit_check.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (C) 2017-2024 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from itertools import chain -import os -import re -from shutil import get_unpack_formats -import tarfile -from typing import Dict, Optional, Tuple -from xml.etree import ElementTree -import zipfile - -from rest_framework import status -from rest_framework.request import Request - -from swh.deposit.api.checks import check_metadata -from swh.deposit.api.common import APIGet -from swh.deposit.api.private import APIPrivateView, DepositReadMixin -from swh.deposit.config import ( - ARCHIVE_TYPE, - DEPOSIT_STATUS_REJECTED, - DEPOSIT_STATUS_VERIFIED, -) -from swh.deposit.models import Deposit, DepositRequest -from swh.scheduler.utils import create_oneshot_task - -MANDATORY_ARCHIVE_UNREADABLE = ( - "At least one of its associated archives is not readable" # noqa -) -MANDATORY_ARCHIVE_INVALID = ( - "Mandatory archive is invalid (i.e contains only one archive)" # noqa -) -MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" -MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" - -ARCHIVE_EXTENSIONS = [ - "zip", - "tar", - "tar.gz", - "xz", - "tar.xz", - "bz2", - "tar.bz2", - "Z", - "tar.Z", - "tgz", - "7z", -] - -PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) - - -def known_archive_format(filename): - return any( - filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) - ) - - -class APIChecks(APIPrivateView, APIGet, DepositReadMixin): - """Dedicated class to trigger the deposit checks on deposit archives and metadata. - - Only GET is supported. - - """ - - def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]: - """Given a deposit, check each deposit request of type archive. - - Args: - The deposit to check archives for - - Returns - tuple (status, details): True, None if all archives - are ok, (False, <detailed-error>) otherwise. 
- - """ - requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) - requests.reverse() - if len(requests) == 0: # no associated archive is refused - return False, { - "archive": [ - { - "summary": MANDATORY_ARCHIVE_MISSING, - } - ] - } - - errors = [] - for archive_request in requests: - check, error_message = self._check_archive(archive_request) - if not check: - errors.append( - {"summary": error_message, "fields": [archive_request.id]} - ) - - if not errors: - return True, None - return False, {"archive": errors} - - def _check_archive( - self, archive_request: DepositRequest - ) -> Tuple[bool, Optional[str]]: - """Check that a deposit associated archive is ok: - - readable - - supported archive format - - valid content: the archive does not contain a single archive file - - If any of those checks are not ok, return the corresponding - failing check. - - Args: - archive_path (DepositRequest): Archive to check - - Returns: - (True, None) if archive is check compliant, (False, - <detail-error>) otherwise. - - """ - archive = archive_request.archive - archive_name = os.path.basename(archive.name) - - if not known_archive_format(archive_name): - return False, MANDATORY_ARCHIVE_UNSUPPORTED - - try: - # Use python's File api which is consistent across different types of - # storage backends (e.g. file, azure, ...) - - # I did not find any other) workaround for azure blobstorage use, noop - # otherwise - reset_content_settings_if_needed(archive) - # FIXME: ^ Implement a better way (after digging into django-storages[azure] - - with archive.open("rb") as archive_fp: - try: - with zipfile.ZipFile(archive_fp) as zip_fp: - files = zip_fp.namelist() - except Exception: - try: - # rewind since the first tryout reading may have moved the - # cursor - archive_fp.seek(0) - with tarfile.open(fileobj=archive_fp) as tar_fp: - files = tar_fp.getnames() - except Exception: - return False, MANDATORY_ARCHIVE_UNSUPPORTED - except Exception: - return False, MANDATORY_ARCHIVE_UNREADABLE - if len(files) > 1: - return True, None - element = files[0] - if PATTERN_ARCHIVE_EXTENSION.match(element): - # archive in archive! - return False, MANDATORY_ARCHIVE_INVALID - return True, None - - def process_get( - self, req: Request, collection_name: str, deposit: Deposit - ) -> Tuple[int, Dict, str]: - """Trigger the checks on the deposit archives and then on the deposit metadata. - If any problems (or warnings) are raised, the deposit status and status detail - are updated accordingly. If all checks are ok, the deposit status is updated to - the 'verified' status (details updated with warning if any) and a loading task - is scheduled for the deposit to be ingested. Otherwise, the deposit is marked as - 'rejected' with the error details. A json response is returned to the caller - with the deposit checks. 
- - Args: - req: Client request - collection_name: Collection owning the deposit - deposit: Deposit concerned by the reading - - Returns: - Tuple (status, json response, content-type) - - """ - raw_metadata = self._metadata_get(deposit) - details_dict: Dict = {} - # will check each deposit's associated request (both of type - # archive and metadata) for errors - archives_status_ok, details = self._check_deposit_archives(deposit) - if not archives_status_ok: - assert details is not None - details_dict.update(details) - - if raw_metadata is None: - metadata_status_ok = False - details_dict["metadata"] = [{"summary": "Missing Atom document"}] - else: - metadata_tree = ElementTree.fromstring(raw_metadata) - metadata_status_ok, details = check_metadata(metadata_tree) - # Ensure in case of error, we do have the rejection details - assert metadata_status_ok or ( - not metadata_status_ok and details is not None - ) - # we can have warnings even if checks are ok (e.g. missing suggested field) - details_dict.update(details or {}) - - deposit_status_ok = archives_status_ok and metadata_status_ok - # if any details_dict arose, the deposit is rejected - deposit.status = ( - DEPOSIT_STATUS_VERIFIED if deposit_status_ok else DEPOSIT_STATUS_REJECTED - ) - response: Dict = { - "status": deposit.status, - } - if details_dict: - deposit.status_detail = details_dict - response["details"] = details_dict - - # Deposit ok, then we schedule the deposit loading task (if not already done) - if deposit_status_ok and not deposit.load_task_id and self.config["checks"]: - url = deposit.origin_url - task = create_oneshot_task( - "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 - ) - load_task_id = self.scheduler.create_tasks([task])[0].id - deposit.load_task_id = str(load_task_id) - - deposit.save() - - return status.HTTP_200_OK, response, "application/json" - - -def reset_content_settings_if_needed(archive) -> None: - """This resets the content_settings on the associated blob stored in an azure - blobstorage. This prevents the correct reading of the file and failing the checks - for no good reason. 
- - """ - try: - from storages.backends.azure_storage import AzureStorage - except ImportError: - return None - - if not isinstance(archive.storage, AzureStorage): - return None - - from azure.storage.blob import ContentSettings - - blob_client = archive.storage.client.get_blob_client(archive.name) - - # Get the existing blob properties - properties = blob_client.get_blob_properties() - - # reset content encoding in the settings - content_settings = dict(properties.content_settings) - content_settings["content_encoding"] = "" - - # Set the content_type and content_language headers, and populate the remaining - # headers from the existing properties - blob_headers = ContentSettings(**content_settings) - - blob_client.set_http_headers(blob_headers) diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index 6afe6d11..bc3fa500 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2024 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,10 +11,12 @@ from swh.deposit.errors import BAD_REQUEST, DepositError from swh.deposit.models import ( DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS, + DEPOSIT_STATUS_VERIFIED, Deposit, ) from swh.model.hashutil import hash_to_bytes from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID +from swh.scheduler.utils import create_oneshot_task MANDATORY_KEYS = ["origin_url", "release_id", "directory_id", "snapshot_id"] @@ -111,8 +113,18 @@ class APIUpdateStatus(APIPrivateView, APIPut): path="/", ) ) - else: # rejected - deposit.status = status + elif ( + status == DEPOSIT_STATUS_VERIFIED + and not deposit.load_task_id + and self.config["checks"] + ): + # Deposit ok, then we schedule the deposit loading task (if not already done) + url = deposit.origin_url + task = create_oneshot_task( + "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 + ) + load_task_id = self.scheduler.create_tasks([task])[0].id + deposit.load_task_id = str(load_task_id) if "status_detail" in data: deposit.status_detail = data["status_detail"] diff --git a/swh/deposit/api/private/urls.py b/swh/deposit/api/private/urls.py index 01ae2d92..7a6fd748 100644 --- a/swh/deposit/api/private/urls.py +++ b/swh/deposit/api/private/urls.py @@ -6,14 +6,12 @@ from django.urls import path from django.urls import re_path as url -from swh.deposit.api.private.deposit_check import APIChecks from swh.deposit.api.private.deposit_list import APIList, deposit_list_datatables from swh.deposit.api.private.deposit_read import APIReadArchives, APIReadMetadata from swh.deposit.api.private.deposit_releases import APIReleases from swh.deposit.api.private.deposit_update_status import APIUpdateStatus from swh.deposit.api.private.deposit_upload_urls import APIUploadURLs from swh.deposit.config import ( - PRIVATE_CHECK_DEPOSIT, PRIVATE_GET_DEPOSIT_METADATA, PRIVATE_GET_RAW_CONTENT, PRIVATE_GET_RELEASES, @@ -45,13 +43,6 @@ urlpatterns = [ APIReadMetadata.as_view(), name=PRIVATE_GET_DEPOSIT_METADATA, ), - # Check archive and metadata information on a specific deposit - # -> GET - url( - r"^(?P<collection_name>[^/]+)/(?P<deposit_id>[^/]+)/check/$", - APIChecks.as_view(), - name=PRIVATE_CHECK_DEPOSIT, - ), # 
Retrieve deposit's raw archives' content # -> GET url( @@ -73,13 +64,6 @@ urlpatterns = [ APIReadMetadata.as_view(), name=PRIVATE_GET_DEPOSIT_METADATA + "-nc", ), - # Check archive and metadata information on a specific deposit - # -> GET - url( - r"^(?P<deposit_id>[^/]+)/check/$", - APIChecks.as_view(), - name=PRIVATE_CHECK_DEPOSIT + "-nc", - ), url(r"^deposits/$", APIList.as_view(), name=PRIVATE_LIST_DEPOSITS), url( r"^deposits/datatables/$", diff --git a/swh/deposit/config.py b/swh/deposit/config.py index 68aab223..bfb1f98d 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -9,10 +9,6 @@ from typing import Any, Dict, Optional from swh.core import config from swh.deposit import __version__ from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher -from swh.scheduler import get_scheduler -from swh.scheduler.interface import SchedulerInterface -from swh.storage import get_storage -from swh.storage.interface import StorageInterface # IRIs (Internationalized Resource identifier) sword 2.0 specified EDIT_IRI = "edit_iri" @@ -23,7 +19,6 @@ SD_IRI = "servicedocument" COL_IRI = "upload" STATE_IRI = "state_iri" PRIVATE_GET_RAW_CONTENT = "private-download" -PRIVATE_CHECK_DEPOSIT = "check-deposit" PRIVATE_PUT_DEPOSIT = "private-update" PRIVATE_GET_DEPOSIT_METADATA = "private-read" PRIVATE_LIST_DEPOSITS = "private-deposit-list" @@ -100,6 +95,11 @@ class APIConfig: """ def __init__(self): + from swh.scheduler import get_scheduler + from swh.scheduler.interface import SchedulerInterface + from swh.storage import get_storage + from swh.storage.interface import StorageInterface + self.config: Dict[str, Any] = config.load_from_envvar(DEFAULT_CONFIG) self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"]) self.tool = { diff --git a/swh/deposit/loader/checker.py b/swh/deposit/loader/checker.py index 8998a803..68439a42 100644 --- a/swh/deposit/loader/checker.py +++ b/swh/deposit/loader/checker.py @@ -1,18 +1,148 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from itertools import chain import logging -from typing import Any, Dict +import os +import re +from shutil import get_unpack_formats +import tarfile +import tempfile +from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import urlparse +from xml.etree import ElementTree +import zipfile +import requests import sentry_sdk from swh.core import config from swh.deposit.client import PrivateApiDepositClient +from swh.deposit.config import DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED +from swh.deposit.loader.checks import check_metadata logger = logging.getLogger(__name__) +MANDATORY_ARCHIVE_UNREADABLE = ( + "At least one of its associated archives is not readable" # noqa +) +MANDATORY_ARCHIVE_INVALID = ( + "Mandatory archive is invalid (i.e contains only one archive)" # noqa +) +MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" +MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" + +ARCHIVE_EXTENSIONS = [ + "zip", + "tar", + "tar.gz", + "xz", + "tar.xz", + "bz2", + "tar.bz2", + "Z", + "tar.Z", + "tgz", + "7z", +] + +PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) + + +def known_archive_format(filename): + return any( + 
filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) + ) + + +def _check_archive(archive_url: str) -> Tuple[bool, Optional[str]]: + """Check that a deposit associated archive is ok: + - readable + - supported archive format + - valid content: the archive does not contain a single archive file + + If any of those checks are not ok, return the corresponding + failing check. + + Args: + archive_path (DepositRequest): Archive to check + + Returns: + (True, None) if archive is check compliant, (False, + <detail-error>) otherwise. + + """ + parsed_archive_url = urlparse(archive_url) + archive_name = os.path.basename(parsed_archive_url.path) + + if not known_archive_format(archive_name): + return False, MANDATORY_ARCHIVE_UNSUPPORTED + + try: + response = requests.get(archive_url, stream=True) + with tempfile.TemporaryDirectory() as tmpdir: + archive_path = os.path.join(tmpdir, archive_name) + with open(archive_path, "wb") as archive_fp: + for chunk in response.iter_content(chunk_size=10 * 1024 * 1024): + archive_fp.write(chunk) + with open(archive_path, "rb") as archive_fp: + try: + with zipfile.ZipFile(archive_fp) as zip_fp: + files = zip_fp.namelist() + except Exception: + try: + # rewind since the first tryout reading may have moved the + # cursor + archive_fp.seek(0) + with tarfile.open(fileobj=archive_fp) as tar_fp: + files = tar_fp.getnames() + except Exception: + return False, MANDATORY_ARCHIVE_UNSUPPORTED + except Exception: + return False, MANDATORY_ARCHIVE_UNREADABLE + if len(files) > 1: + return True, None + element = files[0] + if PATTERN_ARCHIVE_EXTENSION.match(element): + # archive in archive! + return False, MANDATORY_ARCHIVE_INVALID + return True, None + + +def _check_deposit_archives( + archive_urls: List[str], +) -> Tuple[bool, Optional[Dict]]: + """Given a deposit, check each deposit request of type archive. + + Args: + The deposit to check archives for + + Returns + tuple (status, details): True, None if all archives + are ok, (False, <detailed-error>) otherwise. + + """ + if len(archive_urls) == 0: # no associated archive is refused + return False, { + "archive": [ + { + "summary": MANDATORY_ARCHIVE_MISSING, + } + ] + } + + errors = [] + for archive_url in archive_urls: + check, error_message = _check_archive(archive_url) + if not check: + errors.append({"summary": error_message}) + + if not errors: + return True, None + return False, {"archive": errors} + class DepositChecker: """Deposit checker implementation. 
@@ -25,17 +155,58 @@ class DepositChecker: self.config: Dict[str, Any] = config.load_from_envvar() self.client = PrivateApiDepositClient(config=self.config["deposit"]) - def check(self, collection: str, deposit_id: str) -> Dict[str, str]: + def check(self, collection: str, deposit_id: str) -> Dict[str, Any]: status = None - deposit_check_url = f"/{collection}/{deposit_id}/check/" - logger.debug("deposit-check-url: %s", deposit_check_url) + deposit_upload_urls = f"/{deposit_id}/upload-urls/" + logger.debug("deposit-upload-urls: %s", deposit_upload_urls) + details_dict: Dict = {} try: - r = self.client.check(deposit_check_url) - logger.debug("Check result: %s", r) - status = "eventful" if r == "verified" else "failed" - except Exception: - logger.exception("Failure during check on '%s'", deposit_check_url) + raw_metadata = self.client.metadata_get(f"/{deposit_id}/meta/").get( + "raw_metadata" + ) + + # will check each deposit's associated request (both of type + # archive and metadata) for errors + + archive_urls = self.client.do("GET", deposit_upload_urls).json() + logger.debug("deposit-upload-urls result: %s", archive_urls) + + archives_status_ok, details = _check_deposit_archives(archive_urls) + + if not archives_status_ok: + assert details is not None + details_dict.update(details) + + if raw_metadata is None: + metadata_status_ok = False + details_dict["metadata"] = [{"summary": "Missing Atom document"}] + else: + metadata_tree = ElementTree.fromstring(raw_metadata) + metadata_status_ok, details = check_metadata(metadata_tree) + # Ensure in case of error, we do have the rejection details + assert metadata_status_ok or ( + not metadata_status_ok and details is not None + ) + # we can have warnings even if checks are ok (e.g. missing suggested field) + details_dict.update(details or {}) + + deposit_status_ok = archives_status_ok and metadata_status_ok + # if any details_dict arose, the deposit is rejected + + status = ( + DEPOSIT_STATUS_VERIFIED + if deposit_status_ok + else DEPOSIT_STATUS_REJECTED + ) + + self.client.status_update( + f"/{deposit_id}/update/", status=status, status_detail=details_dict + ) + + status = "eventful" if status == DEPOSIT_STATUS_VERIFIED else "failed" + except Exception as e: sentry_sdk.capture_exception() status = "failed" + details_dict["exception"] = f"{e.__class__.__name__}: {str(e)}" logger.debug("Check status: %s", status) - return {"status": status} + return {"status": status, "status_detail": details_dict} diff --git a/swh/deposit/api/checks.py b/swh/deposit/loader/checks.py similarity index 97% rename from swh/deposit/api/checks.py rename to swh/deposit/loader/checks.py index dd24e650..1fd227b6 100644 --- a/swh/deposit/api/checks.py +++ b/swh/deposit/loader/checks.py @@ -24,7 +24,6 @@ from xml.etree import ElementTree import xmlschema -from swh.deposit.errors import FORBIDDEN, DepositError from swh.deposit.utils import NAMESPACES, parse_swh_metadata_provenance MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" @@ -404,17 +403,3 @@ def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]] return True, {"metadata": suggested_fields} return True, None - - -def check_url_match_provider(url: str, provider_url: str) -> None: - """Check url matches the provider url. 
- - Raises DepositError in case of mismatch - - """ - provider_url = provider_url.rstrip("/") + "/" - if not url.startswith(provider_url): - raise DepositError( - FORBIDDEN, - f"URL mismatch: {url} must start with {provider_url}", - ) diff --git a/swh/deposit/tests/api/conftest.py b/swh/deposit/tests/api/conftest.py index 234053a0..c9f24668 100644 --- a/swh/deposit/tests/api/conftest.py +++ b/swh/deposit/tests/api/conftest.py @@ -1,24 +1,14 @@ -# Copyright (C) 2019-2023 The Software Heritage developers +# Copyright (C) 2019-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import hashlib import os -from django.urls import reverse_lazy as reverse from django.utils import timezone import pytest -from swh.deposit.api.private.deposit_check import APIChecks -from swh.deposit.config import ( - COL_IRI, - DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_VERIFIED, -) -from swh.deposit.models import Deposit -from swh.deposit.parsers import parse_xml -from swh.deposit.utils import NAMESPACES +from swh.deposit.config import DEPOSIT_STATUS_VERIFIED @pytest.fixture @@ -27,15 +17,6 @@ def datadir(request): return os.path.join(os.path.dirname(str(request.fspath)), "../data") -@pytest.fixture -def ready_deposit_ok(partial_deposit_with_metadata): - """Returns a deposit ready for checks (it will pass the checks).""" - deposit = partial_deposit_with_metadata - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - return deposit - - @pytest.fixture def ready_deposit_verified(partial_deposit_with_metadata): """Returns a verified deposit.""" @@ -44,49 +25,3 @@ def ready_deposit_verified(partial_deposit_with_metadata): deposit.complete_date = timezone.now() deposit.save() return deposit - - -@pytest.fixture -def ready_deposit_only_metadata(partial_deposit_only_metadata): - """Deposit in status ready that will fail the checks (because missing - archive). 
- - """ - deposit = partial_deposit_only_metadata - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - return deposit - - -@pytest.fixture -def ready_deposit_invalid_archive(authenticated_client, deposit_collection): - url = reverse(COL_IRI, args=[deposit_collection.name]) - - data = b"some data which is clearly not a zip file" - md5sum = hashlib.md5(data).hexdigest() - - # when - response = authenticated_client.post( - url, - content_type="application/zip", # as zip - data=data, - # + headers - CONTENT_LENGTH=len(data), - # other headers needs HTTP_ prefix to be taken into account - HTTP_SLUG="external-id-invalid", - HTTP_CONTENT_MD5=md5sum, - HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", - HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", - ) - - response_content = parse_xml(response.content) - deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) - deposit = Deposit.objects.get(pk=deposit_id) - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - return deposit - - -@pytest.fixture -def swh_checks_deposit(): - return APIChecks() diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py index 7a95846e..520a9ae1 100644 --- a/swh/deposit/tests/api/test_checks.py +++ b/swh/deposit/tests/api/test_checks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2024 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -14,7 +14,7 @@ from xml.etree import ElementTree import pytest -from swh.deposit.api.checks import ( +from swh.deposit.loader.checks import ( METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING, check_metadata, @@ -396,7 +396,7 @@ _parameters1 = [ "metadata_ok", _parameters1, ) -def test_api_checks_check_metadata_ok(metadata_ok, swh_checks_deposit): +def test_api_checks_check_metadata_ok(metadata_ok): actual_check, detail = check_metadata(ElementTree.fromstring(metadata_ok)) assert actual_check is True, f"Unexpected result:\n{pprint.pformat(detail)}" if "swh:deposit" in metadata_ok: @@ -662,9 +662,7 @@ _parameters2 = [ @pytest.mark.parametrize("metadata_ko,expected_summary", _parameters2) -def test_api_checks_check_metadata_ko( - metadata_ko, expected_summary, swh_checks_deposit -): +def test_api_checks_check_metadata_ko(metadata_ko, expected_summary): actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert error_detail == {"metadata": [expected_summary]} @@ -1216,9 +1214,7 @@ _parameters3 = [ @pytest.mark.parametrize("metadata_ko,expected_summaries", _parameters3) -def test_api_checks_check_metadata_ko_schema( - metadata_ko, expected_summaries, swh_checks_deposit -): +def test_api_checks_check_metadata_ko_schema(metadata_ko, expected_summaries): actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert len(error_detail["metadata"]) == len(expected_summaries), error_detail[ diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py deleted file mode 100644 index f3cb00ce..00000000 --- a/swh/deposit/tests/api/test_deposit_private_check.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright (C) 2017-2023 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this 
distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import random - -from django.urls import reverse_lazy as reverse -import pytest -from rest_framework import status - -from swh.deposit.api.checks import METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING -from swh.deposit.api.private.deposit_check import ( - MANDATORY_ARCHIVE_INVALID, - MANDATORY_ARCHIVE_MISSING, - MANDATORY_ARCHIVE_UNSUPPORTED, -) -from swh.deposit.config import ( - COL_IRI, - DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_PARTIAL, - DEPOSIT_STATUS_REJECTED, - DEPOSIT_STATUS_VERIFIED, - PRIVATE_CHECK_DEPOSIT, - SE_IRI, -) -from swh.deposit.models import Deposit -from swh.deposit.parsers import parse_xml -from swh.deposit.tests.common import ( - SUPPORTED_TARBALL_MODES, - create_arborescence_archive, - create_archive_with_archive, - post_archive, - post_atom, -) -from swh.deposit.utils import NAMESPACES - -PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + "-nc" - - -def private_check_url_endpoints(collection, deposit): - """There are 2 endpoints to check (one with collection, one without)""" - return [ - reverse(PRIVATE_CHECK_DEPOSIT, args=[collection.name, deposit.id]), - reverse(PRIVATE_CHECK_DEPOSIT_NC, args=[deposit.id]), - ] - - -@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) -def test_deposit_ok( - tmp_path, authenticated_client, deposit_collection, extension, atom_dataset -): - """Proper deposit should succeed the checks (-> status ready)""" - deposit = create_deposit_with_archive( - tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset - ) - - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - assert data["status"] == DEPOSIT_STATUS_VERIFIED, data - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_VERIFIED - - # Deposit is ok but it's missing suggested fields in its metadata detected by - # the checks - status_detail = deposit.status_detail["metadata"] - assert len(status_detail) == 1 - suggested = status_detail[0] - assert suggested["summary"] == SUGGESTED_FIELDS_MISSING - assert set(suggested["fields"]) == set([METADATA_PROVENANCE_KEY]) - - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - - -@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) -def test_deposit_invalid_tarball( - tmp_path, authenticated_client, deposit_collection, extension, atom_dataset -): - """Deposit with tarball (of 1 tarball) should fail the checks: rejected""" - deposit = create_deposit_archive_with_archive( - tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset - ) - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - assert response.status_code == status.HTTP_200_OK - data = response.json() - assert data["status"] == DEPOSIT_STATUS_REJECTED - details = data["details"] - # archive checks failure - assert len(details["archive"]) == 1 - assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_INVALID - - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_REJECTED - - -def test_deposit_ko_missing_tarball( - authenticated_client, deposit_collection, ready_deposit_only_metadata -): - """Deposit without archive should fail the checks: rejected""" - 
deposit = ready_deposit_only_metadata - assert deposit.status == DEPOSIT_STATUS_DEPOSITED - - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - assert data["status"] == DEPOSIT_STATUS_REJECTED - details = data["details"] - # archive checks failure - assert len(details["archive"]) == 1 - assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_MISSING - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_REJECTED - - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - - -def test_deposit_ko_unsupported_tarball( - tmp_path, authenticated_client, deposit_collection, ready_deposit_invalid_archive -): - """Deposit with unsupported tarball should fail checks and be rejected""" - deposit = ready_deposit_invalid_archive - assert DEPOSIT_STATUS_DEPOSITED == deposit.status - - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - assert data["status"] == DEPOSIT_STATUS_REJECTED - details = data["details"] - - # archive checks failure - assert len(details["archive"]) == 1 - assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED - # metadata check failure - assert len(details["metadata"]) == 1 - mandatory = details["metadata"][0] - assert mandatory["summary"] == "Missing Atom document" - - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_REJECTED - - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - - -def test_deposit_ko_unsupported_tarball_prebasic_check( - tmp_path, authenticated_client, deposit_collection, atom_dataset -): - """Deposit with unsupported tarball extension should fail checks and be rejected""" - - invalid_gz_mode = random.choice( - [f"{ext}-foobar" for ext in SUPPORTED_TARBALL_MODES] - ) - invalid_extension = f"tar.{invalid_gz_mode}" - - deposit = create_deposit_with_archive( - tmp_path, - invalid_extension, - authenticated_client, - deposit_collection.name, - atom_dataset, - ) - assert DEPOSIT_STATUS_DEPOSITED == deposit.status - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - - assert data["status"] == DEPOSIT_STATUS_REJECTED - details = data["details"] - # archive checks failure - assert len(details["archive"]) == 1 - assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED - - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_REJECTED - - -def test_check_deposit_metadata_ok( - authenticated_client, deposit_collection, ready_deposit_ok -): - """Proper deposit should succeed the checks (-> status ready) - with all **MUST** metadata - - using the codemeta metadata test set - """ - deposit = ready_deposit_ok - assert deposit.status == DEPOSIT_STATUS_DEPOSITED - - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - - assert data["status"] == DEPOSIT_STATUS_VERIFIED, data - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_VERIFIED - - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - - -def create_deposit(archive, 
client, collection_name, atom_dataset): - """Create a deposit with archive (and metadata) for client in the collection name.""" - # we deposit it - response = post_archive( - client, - reverse(COL_IRI, args=[collection_name]), - archive, - content_type="application/x-tar", - slug="external-id", - in_progress=True, - ) - - # then - assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(response.content) - deposit_status = response_content.findtext( - "swh:deposit_status", namespaces=NAMESPACES - ) - assert deposit_status == DEPOSIT_STATUS_PARTIAL - deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) - - origin_url = client.deposit_client.provider_url - response = post_atom( - client, - reverse(SE_IRI, args=[collection_name, deposit_id]), - data=atom_dataset["entry-data0"] % origin_url, - in_progress=False, - ) - - assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(response.content) - deposit_status = response_content.findtext( - "swh:deposit_status", namespaces=NAMESPACES - ) - assert deposit_status == DEPOSIT_STATUS_DEPOSITED - - deposit = Deposit.objects.get(pk=deposit_id) - assert DEPOSIT_STATUS_DEPOSITED == deposit.status - return deposit - - -def create_deposit_with_archive( - root_path, archive_extension, client, collection_name, atom_dataset -): - """Create a deposit with a valid archive.""" - # we create the holding archive to a given extension - archive = create_arborescence_archive( - root_path, - "archive1", - "file1", - b"some content in file", - extension=archive_extension, - ) - - return create_deposit(archive, client, collection_name, atom_dataset) - - -def create_deposit_archive_with_archive( - root_path, archive_extension, client, collection_name, atom_dataset -): - """Create a deposit with an invalid archive (archive within archive)""" - - # we create the holding archive to a given extension - archive = create_arborescence_archive( - root_path, - "archive1", - "file1", - b"some content in file", - extension=archive_extension, - ) - - # now we create an archive holding the first created archive - invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) - - return create_deposit(invalid_archive, client, collection_name, atom_dataset) diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py index 9c4baed2..99e291f3 100644 --- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2022 The Software Heritage developers +# Copyright (C) 2020-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -15,11 +15,6 @@ from xml.etree import ElementTree import pytest import yaml -from swh.deposit.api.checks import ( - METADATA_PROVENANCE_KEY, - SUGGESTED_FIELDS_MISSING, - check_metadata, -) from swh.deposit.cli import deposit as cli from swh.deposit.cli.client import InputError, _collection, _url, generate_metadata from swh.deposit.client import ( @@ -28,6 +23,11 @@ from swh.deposit.client import ( PublicApiDepositClient, ServiceDocumentDepositClient, ) +from swh.deposit.loader.checks import ( + METADATA_PROVENANCE_KEY, + SUGGESTED_FIELDS_MISSING, + check_metadata, +) from swh.deposit.parsers import parse_xml from swh.deposit.tests.conftest import TEST_USER from swh.deposit.utils import NAMESPACES 
diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index 2c545d99..a5979179 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2024 The Software Heritage developers +# Copyright (C) 2019-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,6 +6,7 @@ import base64 from copy import deepcopy from functools import partial +import hashlib import os import re from typing import TYPE_CHECKING, Dict, Mapping @@ -35,6 +36,8 @@ from swh.deposit.config import ( SE_IRI, setup_django_for, ) +from swh.deposit.models import Deposit +from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( create_arborescence_archive, post_archive, @@ -46,7 +49,7 @@ from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.scheduler import get_scheduler if TYPE_CHECKING: - from swh.deposit.models import Deposit, DepositClient, DepositCollection + from swh.deposit.models import DepositClient, DepositCollection # mypy is asked to ignore the import statement above because setup_databases @@ -594,6 +597,56 @@ def complete_deposit(sample_archive, deposit_collection, authenticated_client): return deposit +@pytest.fixture +def ready_deposit_invalid_archive(authenticated_client, deposit_collection): + url = reverse(COL_IRI, args=[deposit_collection.name]) + + data = b"some data which is clearly not a zip file" + md5sum = hashlib.md5(data).hexdigest() + + # when + response = authenticated_client.post( + url, + content_type="application/zip", # as zip + data=data, + # + headers + CONTENT_LENGTH=len(data), + # other headers needs HTTP_ prefix to be taken into account + HTTP_SLUG="external-id-invalid", + HTTP_CONTENT_MD5=md5sum, + HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", + HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", + ) + + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) + deposit = Deposit.objects.get(pk=deposit_id) + deposit.status = DEPOSIT_STATUS_DEPOSITED + deposit.save() + return deposit + + +@pytest.fixture +def ready_deposit_ok(partial_deposit_with_metadata): + """Returns a deposit ready for checks (it will pass the checks).""" + deposit = partial_deposit_with_metadata + deposit.status = DEPOSIT_STATUS_DEPOSITED + deposit.save() + return deposit + + +@pytest.fixture +def ready_deposit_only_metadata(partial_deposit_only_metadata): + """Deposit in status ready that will fail the checks (because missing + archive). 
+ + """ + deposit = partial_deposit_only_metadata + deposit.status = DEPOSIT_STATUS_DEPOSITED + deposit.save() + return deposit + + @pytest.fixture() def tmp_path(tmp_path): return str(tmp_path) # issue with oldstable's pytest version diff --git a/swh/deposit/tests/data/atom/entry-data0.xml b/swh/deposit/tests/data/atom/entry-data0.xml index 647dbe9f..7ea58e44 100644 --- a/swh/deposit/tests/data/atom/entry-data0.xml +++ b/swh/deposit/tests/data/atom/entry-data0.xml @@ -1,9 +1,10 @@ <?xml version="1.0"?> <entry xmlns="http://www.w3.org/2005/Atom" + xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit"> <title>Awesome Compiler</title> <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> - <updated>2017-10-07T15:17:08Z</updated> + <codemeta:datePublished>2017-10-07T15:17:08Z</codemeta:datePublished> <author>some awesome author</author> <name>awesome-compiler</name> diff --git a/swh/deposit/tests/data/atom/entry-data1.xml b/swh/deposit/tests/data/atom/entry-data1.xml index 8a75ef12..c573500f 100644 --- a/swh/deposit/tests/data/atom/entry-data1.xml +++ b/swh/deposit/tests/data/atom/entry-data1.xml @@ -1,7 +1,8 @@ <?xml version="1.0"?> -<entry xmlns="http://www.w3.org/2005/Atom"> +<entry xmlns="http://www.w3.org/2005/Atom" xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"> <id>urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a</id> <updated>2017-10-07T15:17:08Z</updated> + <codemeta:datePublished>2017-10-07T15:17:08Z</codemeta:datePublished> <author>some awesome author</author> <name>awesome-compiler</name> </entry> diff --git a/swh/deposit/tests/loader/conftest.py b/swh/deposit/tests/loader/conftest.py index 2f8f9e6b..74eae66c 100644 --- a/swh/deposit/tests/loader/conftest.py +++ b/swh/deposit/tests/loader/conftest.py @@ -1,23 +1,17 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest -from swh.deposit.loader.checker import DepositChecker - @pytest.fixture -def deposit_config(tmp_path): +def deposit_config(deposit_config): return { + **deposit_config, "deposit": { "url": "https://deposit.softwareheritage.org/1/private/", "auth": {}, - } + }, } - - -@pytest.fixture -def deposit_checker(deposit_config_path): - return DepositChecker() diff --git a/swh/deposit/tests/loader/test_checker.py b/swh/deposit/tests/loader/test_checker.py index 56189517..6ac58978 100644 --- a/swh/deposit/tests/loader/test_checker.py +++ b/swh/deposit/tests/loader/test_checker.py @@ -1,26 +1,512 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from unittest.mock import patch +import os +import random +from django.urls import reverse_lazy as reverse +import pytest +from rest_framework import status -def test_checker_deposit_ready(requests_mock_datadir, deposit_checker): - """Check on a valid 'deposited' deposit should result in 'verified'""" - actual_result = deposit_checker.check(collection="test", deposit_id=1) - assert actual_result == {"status": "eventful"} +from swh.deposit.config import ( + COL_IRI, + 
DEPOSIT_STATUS_DEPOSITED, + DEPOSIT_STATUS_PARTIAL, + PRIVATE_GET_DEPOSIT_METADATA, + PRIVATE_GET_UPLOAD_URLS, + PRIVATE_PUT_DEPOSIT, + SE_IRI, +) +from swh.deposit.loader.checker import ( + MANDATORY_ARCHIVE_INVALID, + MANDATORY_ARCHIVE_MISSING, + MANDATORY_ARCHIVE_UNSUPPORTED, + DepositChecker, +) +from swh.deposit.loader.checks import METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING +from swh.deposit.models import Deposit +from swh.deposit.parsers import parse_xml +from swh.deposit.tests.common import ( + SUPPORTED_TARBALL_MODES, + create_arborescence_archive, + create_archive_with_archive, + post_archive, + post_atom, +) +from swh.deposit.utils import NAMESPACES +PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc" +PRIVATE_PUT_DEPOSIT_NC = PRIVATE_PUT_DEPOSIT + "-nc" -def test_checker_deposit_rejected(requests_mock_datadir, deposit_checker): - """Check on invalid 'deposited' deposit should result in 'rejected'""" - actual_result = deposit_checker.check(collection="test", deposit_id=2) - assert actual_result == {"status": "failed"} +BASE_URL = "https://deposit.softwareheritage.org" -@patch("swh.deposit.client.requests.get") -def test_checker_deposit_rejected_exception(mock_requests, deposit_checker): - """Check on invalid 'deposited' deposit should result in 'rejected'""" - mock_requests.side_effect = ValueError("simulated problem when checking") - actual_result = deposit_checker.check(collection="test", deposit_id=3) - assert actual_result == {"status": "failed"} +def create_deposit(archive, client, collection_name, atom_dataset): + """Create a deposit with archive (and metadata) for client in the collection name.""" + # we deposit it + response = post_archive( + client, + reverse(COL_IRI, args=[collection_name]), + archive, + content_type="application/x-tar", + slug="external-id", + in_progress=True, + ) + + # then + assert response.status_code == status.HTTP_201_CREATED + response_content = parse_xml(response.content) + deposit_status = response_content.findtext( + "swh:deposit_status", namespaces=NAMESPACES + ) + assert deposit_status == DEPOSIT_STATUS_PARTIAL + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) + + origin_url = client.deposit_client.provider_url + response = post_atom( + client, + reverse(SE_IRI, args=[collection_name, deposit_id]), + data=atom_dataset["entry-data0"] % origin_url, + in_progress=False, + ) + + assert response.status_code == status.HTTP_201_CREATED + response_content = parse_xml(response.content) + deposit_status = response_content.findtext( + "swh:deposit_status", namespaces=NAMESPACES + ) + assert deposit_status == DEPOSIT_STATUS_DEPOSITED + + deposit = Deposit.objects.get(pk=deposit_id) + assert DEPOSIT_STATUS_DEPOSITED == deposit.status + return deposit + + +def create_deposit_with_archive( + root_path, archive_extension, client, collection_name, atom_dataset +): + """Create a deposit with a valid archive.""" + # we create the holding archive to a given extension + archive = create_arborescence_archive( + root_path, + "archive1", + "file1", + b"some content in file", + extension=archive_extension, + ) + + return create_deposit(archive, client, collection_name, atom_dataset) + + +def create_deposit_archive_with_archive( + root_path, archive_extension, client, collection_name, atom_dataset +): + """Create a deposit with an invalid archive (archive within archive)""" + + # we create the holding archive to a given extension + archive = create_arborescence_archive( + root_path, + "archive1", + 
"file1", + b"some content in file", + extension=archive_extension, + ) + + # now we create an archive holding the first created archive + invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) + + return create_deposit(invalid_archive, client, collection_name, atom_dataset) + + +@pytest.fixture +def deposit_checker(): + return DepositChecker() + + +@pytest.fixture +def datadir(): + """Override default datadir to target main test datadir""" + return os.path.join(os.path.dirname(__file__), "../data") + + +@pytest.fixture +def deposited_deposit_valid_metadata(partial_deposit_with_metadata): + partial_deposit_with_metadata.status = DEPOSIT_STATUS_DEPOSITED + partial_deposit_with_metadata.save() + return partial_deposit_with_metadata + + +@pytest.fixture() +def deposited_deposit_only_metadata(partial_deposit_only_metadata): + partial_deposit_only_metadata.status = DEPOSIT_STATUS_DEPOSITED + partial_deposit_only_metadata.save() + return partial_deposit_only_metadata + + +def mock_http_requests(deposit, authenticated_client, requests_mock): + """Mock HTTP requests performed by deposit checker with responses + of django test client.""" + metadata_url = reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit.id]) + upload_urls_url = reverse(PRIVATE_GET_UPLOAD_URLS, args=[deposit.id]) + archive_urls = authenticated_client.get(upload_urls_url).json() + + if archive_urls: + archive_response = authenticated_client.get(archive_urls[0]) + # mock archive download + requests_mock.get( + archive_urls[0], content=b"".join(archive_response.streaming_content) + ) + + # mock requests to private deposit API by forwarding authenticated_client responses + for url in (metadata_url, upload_urls_url): + requests_mock.get(BASE_URL + url, json=authenticated_client.get(url).json()) + + def status_update(request, context): + authenticated_client.put( + put_deposit_url, content_type="application/json", data=request.body + ) + return None + + put_deposit_url = reverse(PRIVATE_PUT_DEPOSIT_NC, args=[deposit.id]) + requests_mock.put( + BASE_URL + put_deposit_url, + content=status_update, + ) + + +def test_checker_deposit_missing_metadata( + deposit_checker, + deposited_deposit, + authenticated_client, + requests_mock, +): + mock_http_requests(deposited_deposit, authenticated_client, requests_mock) + actual_result = deposit_checker.check( + collection="test", deposit_id=deposited_deposit.id + ) + assert actual_result == { + "status": "failed", + "status_detail": {"metadata": [{"summary": "Missing Atom document"}]}, + } + + +def test_checker_deposit_valid_metadata( + deposit_checker, + deposited_deposit_valid_metadata, + authenticated_client, + requests_mock, +): + mock_http_requests( + deposited_deposit_valid_metadata, + authenticated_client, + requests_mock, + ) + actual_result = deposit_checker.check( + collection="test", deposit_id=deposited_deposit_valid_metadata.id + ) + assert actual_result == { + "status": "eventful", + "status_detail": { + "metadata": [ + { + "fields": [METADATA_PROVENANCE_KEY], + "summary": SUGGESTED_FIELDS_MISSING, + }, + ] + }, + } + + +def test_checker_deposit_only_metadata( + deposit_checker, + deposited_deposit_only_metadata, + authenticated_client, + requests_mock, +): + mock_http_requests( + deposited_deposit_only_metadata, + authenticated_client, + requests_mock, + ) + actual_result = deposit_checker.check( + collection="test", deposit_id=deposited_deposit_only_metadata.id + ) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": 
[{"summary": MANDATORY_ARCHIVE_MISSING}], + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ], + }, + } + + +def test_checker_deposit_exception_raised( + deposit_checker, + deposited_deposit_valid_metadata, + authenticated_client, + requests_mock, + mocker, +): + mock_http_requests( + deposited_deposit_valid_metadata, + authenticated_client, + requests_mock, + ) + mocker.patch("swh.deposit.loader.checker.check_metadata").side_effect = ValueError( + "Error when checking metadata" + ) + actual_result = deposit_checker.check( + collection="test", deposit_id=deposited_deposit_valid_metadata.id + ) + assert actual_result == { + "status": "failed", + "status_detail": { + "exception": "ValueError: Error when checking metadata", + }, + } + + +def test_checker_deposit_invalid_archive( + deposit_checker, + ready_deposit_invalid_archive, + authenticated_client, + requests_mock, +): + mock_http_requests( + ready_deposit_invalid_archive, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check( + collection="test", deposit_id=ready_deposit_invalid_archive.id + ) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_UNSUPPORTED}], + "metadata": [{"summary": "Missing Atom document"}], + }, + } + + +@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) +def test_deposit_ok( + tmp_path, + authenticated_client, + deposit_collection, + extension, + atom_dataset, + deposit_checker, + requests_mock, +): + """Proper deposit should succeed the checks (-> status ready)""" + deposit = create_deposit_with_archive( + tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset + ) + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "eventful", + "status_detail": { + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ] + }, + } + + +@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) +def test_deposit_invalid_tarball( + tmp_path, + authenticated_client, + deposit_collection, + extension, + atom_dataset, + requests_mock, + deposit_checker, +): + """Deposit with tarball (of 1 tarball) should fail the checks: rejected""" + deposit = create_deposit_archive_with_archive( + tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset + ) + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_INVALID}], + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ], + }, + } + + +def test_deposit_ko_missing_tarball( + authenticated_client, + ready_deposit_only_metadata, + requests_mock, + deposit_checker, +): + """Deposit without archive should fail the checks: rejected""" + deposit = ready_deposit_only_metadata + assert deposit.status == DEPOSIT_STATUS_DEPOSITED + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": 
[{"summary": MANDATORY_ARCHIVE_MISSING}], + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ], + }, + } + + +def test_deposit_ko_unsupported_tarball( + authenticated_client, + ready_deposit_invalid_archive, + requests_mock, + deposit_checker, +): + """Deposit with unsupported tarball should fail checks and be rejected""" + deposit = ready_deposit_invalid_archive + assert DEPOSIT_STATUS_DEPOSITED == deposit.status + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_UNSUPPORTED}], + "metadata": [{"summary": "Missing Atom document"}], + }, + } + + +def test_deposit_ko_unsupported_tarball_prebasic_check( + tmp_path, + authenticated_client, + deposit_collection, + atom_dataset, + requests_mock, + deposit_checker, +): + """Deposit with unsupported tarball extension should fail checks and be rejected""" + + invalid_gz_mode = random.choice( + [f"{ext}-foobar" for ext in SUPPORTED_TARBALL_MODES] + ) + invalid_extension = f"tar.{invalid_gz_mode}" + + deposit = create_deposit_with_archive( + tmp_path, + invalid_extension, + authenticated_client, + deposit_collection.name, + atom_dataset, + ) + assert DEPOSIT_STATUS_DEPOSITED == deposit.status + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_UNSUPPORTED}], + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ], + }, + } + + +def test_check_deposit_metadata_ok( + authenticated_client, + ready_deposit_ok, + requests_mock, + deposit_checker, +): + """Proper deposit should succeed the checks (-> status ready) + with all **MUST** metadata + + using the codemeta metadata test set + """ + deposit = ready_deposit_ok + assert deposit.status == DEPOSIT_STATUS_DEPOSITED + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "eventful", + "status_detail": { + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ] + }, + } diff --git a/swh/deposit/tests/loader/test_tasks.py b/swh/deposit/tests/loader/test_tasks.py index fe885f5c..5fbaa0f7 100644 --- a/swh/deposit/tests/loader/test_tasks.py +++ b/swh/deposit/tests/loader/test_tasks.py @@ -8,8 +8,8 @@ def test_task_check_eventful( mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker ): """Successful check should make the check succeed""" - client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") - client.return_value = "verified" + check = mocker.patch("swh.deposit.loader.checker.DepositChecker.check") + check.return_value = {"status": "eventful"} collection = "collection" deposit_id = 42 @@ -21,15 +21,14 @@ def test_task_check_eventful( assert res.successful() assert res.result == {"status": "eventful"} - client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") def test_task_check_failure( mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker ): """Unverified check 
status should make the check fail""" - client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") - client.return_value = "not-verified" # will make the status "failed" + check = mocker.patch("swh.deposit.loader.checker.DepositChecker.check") + check.return_value = {"status": "failed"} collection = "collec" deposit_id = 666 @@ -41,24 +40,3 @@ def test_task_check_failure( assert res.successful() assert res.result == {"status": "failed"} - client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") - - -def test_task_check_3( - mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker -): - """Unexpected failures should fail the check""" - client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") - client.side_effect = ValueError("unexpected failure will make it fail") - - collection = "another-collection" - deposit_id = 999 - res = swh_scheduler_celery_app.send_task( - "swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id] - ) - assert res - res.wait() - assert res.successful() - - assert res.result == {"status": "failed"} - client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") -- GitLab