diff --git a/requirements-server.txt b/requirements-server.txt index 7aab2921af94cce9eb8e186de817dbc4862e97ef..58f9d10994d7f8570dd606699bd9dfb81bfa242a 100644 --- a/requirements-server.txt +++ b/requirements-server.txt @@ -2,6 +2,5 @@ django djangorestframework psycopg2 setuptools -xmlschema pymemcache diff --git a/requirements.txt b/requirements.txt index 6d901d214f03cabfdc60d9e3ff7fbd173a8f4725..8cc0c788c703e6a71e3b59702be0fd094bbbbb23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ click iso8601 requests sentry-sdk +xmlschema diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index 56ad43e93ab744114cf44a0c3390d398ffdf3e4e..9a46c46bb6efdd0c4dad95a6c960a49f796d12fc 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2024 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -25,7 +25,6 @@ from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.request import Request from rest_framework.views import APIView -from swh.deposit.api.checks import check_metadata, check_url_match_provider from swh.deposit.api.converters import convert_status_detail from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication from swh.deposit.config import ( @@ -56,6 +55,7 @@ from swh.deposit.errors import ( DepositError, ParserError, ) +from swh.deposit.loader.checks import check_metadata from swh.deposit.models import ( DEPOSIT_METADATA_ONLY, Deposit, @@ -166,6 +166,20 @@ def guess_deposit_origin_url(deposit: Deposit): return "%s/%s" % (deposit.client.provider_url.rstrip("/"), external_id) +def check_url_match_provider(url: str, provider_url: str) -> None: + """Check url matches the provider url. + + Raises DepositError in case of mismatch + + """ + provider_url = provider_url.rstrip("/") + "/" + if not url.startswith(provider_url): + raise DepositError( + FORBIDDEN, + f"URL mismatch: {url} must start with {provider_url}", + ) + + class APIBase(APIConfig, APIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors.""" diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py deleted file mode 100644 index a8dc63b2e3ad6126ae91dae6654802961fb2f08b..0000000000000000000000000000000000000000 --- a/swh/deposit/api/private/deposit_check.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (C) 2017-2024 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from itertools import chain -import os -import re -from shutil import get_unpack_formats -import tarfile -from typing import Dict, Optional, Tuple -from xml.etree import ElementTree -import zipfile - -from rest_framework import status -from rest_framework.request import Request - -from swh.deposit.api.checks import check_metadata -from swh.deposit.api.common import APIGet -from swh.deposit.api.private import APIPrivateView, DepositReadMixin -from swh.deposit.config import ( - ARCHIVE_TYPE, - DEPOSIT_STATUS_REJECTED, - DEPOSIT_STATUS_VERIFIED, -) -from swh.deposit.models import Deposit, DepositRequest -from swh.scheduler.utils import create_oneshot_task - -MANDATORY_ARCHIVE_UNREADABLE = ( - "At least one of its associated archives is not readable" # noqa -) -MANDATORY_ARCHIVE_INVALID = ( - "Mandatory archive is invalid (i.e contains only one archive)" # noqa -) -MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" -MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" - -ARCHIVE_EXTENSIONS = [ - "zip", - "tar", - "tar.gz", - "xz", - "tar.xz", - "bz2", - "tar.bz2", - "Z", - "tar.Z", - "tgz", - "7z", -] - -PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) - - -def known_archive_format(filename): - return any( - filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) - ) - - -class APIChecks(APIPrivateView, APIGet, DepositReadMixin): - """Dedicated class to trigger the deposit checks on deposit archives and metadata. - - Only GET is supported. - - """ - - def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]: - """Given a deposit, check each deposit request of type archive. - - Args: - The deposit to check archives for - - Returns - tuple (status, details): True, None if all archives - are ok, (False, <detailed-error>) otherwise. - - """ - requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) - requests.reverse() - if len(requests) == 0: # no associated archive is refused - return False, { - "archive": [ - { - "summary": MANDATORY_ARCHIVE_MISSING, - } - ] - } - - errors = [] - for archive_request in requests: - check, error_message = self._check_archive(archive_request) - if not check: - errors.append( - {"summary": error_message, "fields": [archive_request.id]} - ) - - if not errors: - return True, None - return False, {"archive": errors} - - def _check_archive( - self, archive_request: DepositRequest - ) -> Tuple[bool, Optional[str]]: - """Check that a deposit associated archive is ok: - - readable - - supported archive format - - valid content: the archive does not contain a single archive file - - If any of those checks are not ok, return the corresponding - failing check. - - Args: - archive_path (DepositRequest): Archive to check - - Returns: - (True, None) if archive is check compliant, (False, - <detail-error>) otherwise. - - """ - archive = archive_request.archive - archive_name = os.path.basename(archive.name) - - if not known_archive_format(archive_name): - return False, MANDATORY_ARCHIVE_UNSUPPORTED - - try: - # Use python's File api which is consistent across different types of - # storage backends (e.g. file, azure, ...) - - # I did not find any other) workaround for azure blobstorage use, noop - # otherwise - reset_content_settings_if_needed(archive) - # FIXME: ^ Implement a better way (after digging into django-storages[azure] - - with archive.open("rb") as archive_fp: - try: - with zipfile.ZipFile(archive_fp) as zip_fp: - files = zip_fp.namelist() - except Exception: - try: - # rewind since the first tryout reading may have moved the - # cursor - archive_fp.seek(0) - with tarfile.open(fileobj=archive_fp) as tar_fp: - files = tar_fp.getnames() - except Exception: - return False, MANDATORY_ARCHIVE_UNSUPPORTED - except Exception: - return False, MANDATORY_ARCHIVE_UNREADABLE - if len(files) > 1: - return True, None - element = files[0] - if PATTERN_ARCHIVE_EXTENSION.match(element): - # archive in archive! - return False, MANDATORY_ARCHIVE_INVALID - return True, None - - def process_get( - self, req: Request, collection_name: str, deposit: Deposit - ) -> Tuple[int, Dict, str]: - """Trigger the checks on the deposit archives and then on the deposit metadata. - If any problems (or warnings) are raised, the deposit status and status detail - are updated accordingly. If all checks are ok, the deposit status is updated to - the 'verified' status (details updated with warning if any) and a loading task - is scheduled for the deposit to be ingested. Otherwise, the deposit is marked as - 'rejected' with the error details. A json response is returned to the caller - with the deposit checks. - - Args: - req: Client request - collection_name: Collection owning the deposit - deposit: Deposit concerned by the reading - - Returns: - Tuple (status, json response, content-type) - - """ - raw_metadata = self._metadata_get(deposit) - details_dict: Dict = {} - # will check each deposit's associated request (both of type - # archive and metadata) for errors - archives_status_ok, details = self._check_deposit_archives(deposit) - if not archives_status_ok: - assert details is not None - details_dict.update(details) - - if raw_metadata is None: - metadata_status_ok = False - details_dict["metadata"] = [{"summary": "Missing Atom document"}] - else: - metadata_tree = ElementTree.fromstring(raw_metadata) - metadata_status_ok, details = check_metadata(metadata_tree) - # Ensure in case of error, we do have the rejection details - assert metadata_status_ok or ( - not metadata_status_ok and details is not None - ) - # we can have warnings even if checks are ok (e.g. missing suggested field) - details_dict.update(details or {}) - - deposit_status_ok = archives_status_ok and metadata_status_ok - # if any details_dict arose, the deposit is rejected - deposit.status = ( - DEPOSIT_STATUS_VERIFIED if deposit_status_ok else DEPOSIT_STATUS_REJECTED - ) - response: Dict = { - "status": deposit.status, - } - if details_dict: - deposit.status_detail = details_dict - response["details"] = details_dict - - # Deposit ok, then we schedule the deposit loading task (if not already done) - if deposit_status_ok and not deposit.load_task_id and self.config["checks"]: - url = deposit.origin_url - task = create_oneshot_task( - "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 - ) - load_task_id = self.scheduler.create_tasks([task])[0].id - deposit.load_task_id = str(load_task_id) - - deposit.save() - - return status.HTTP_200_OK, response, "application/json" - - -def reset_content_settings_if_needed(archive) -> None: - """This resets the content_settings on the associated blob stored in an azure - blobstorage. This prevents the correct reading of the file and failing the checks - for no good reason. - - """ - try: - from storages.backends.azure_storage import AzureStorage - except ImportError: - return None - - if not isinstance(archive.storage, AzureStorage): - return None - - from azure.storage.blob import ContentSettings - - blob_client = archive.storage.client.get_blob_client(archive.name) - - # Get the existing blob properties - properties = blob_client.get_blob_properties() - - # reset content encoding in the settings - content_settings = dict(properties.content_settings) - content_settings["content_encoding"] = "" - - # Set the content_type and content_language headers, and populate the remaining - # headers from the existing properties - blob_headers = ContentSettings(**content_settings) - - blob_client.set_http_headers(blob_headers) diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index 6afe6d1191bbbce57b05197dfa62b4bcdff0488d..bc3fa5008a77e4b11c8fb3c116536e0e30b8923d 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2024 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,10 +11,12 @@ from swh.deposit.errors import BAD_REQUEST, DepositError from swh.deposit.models import ( DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS, + DEPOSIT_STATUS_VERIFIED, Deposit, ) from swh.model.hashutil import hash_to_bytes from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID +from swh.scheduler.utils import create_oneshot_task MANDATORY_KEYS = ["origin_url", "release_id", "directory_id", "snapshot_id"] @@ -111,8 +113,18 @@ class APIUpdateStatus(APIPrivateView, APIPut): path="/", ) ) - else: # rejected - deposit.status = status + elif ( + status == DEPOSIT_STATUS_VERIFIED + and not deposit.load_task_id + and self.config["checks"] + ): + # Deposit ok, then we schedule the deposit loading task (if not already done) + url = deposit.origin_url + task = create_oneshot_task( + "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 + ) + load_task_id = self.scheduler.create_tasks([task])[0].id + deposit.load_task_id = str(load_task_id) if "status_detail" in data: deposit.status_detail = data["status_detail"] diff --git a/swh/deposit/api/private/deposit_upload_urls.py b/swh/deposit/api/private/deposit_upload_urls.py new file mode 100644 index 0000000000000000000000000000000000000000..3fb744e19fa5e982c3d28f4f3058914be52de88a --- /dev/null +++ b/swh/deposit/api/private/deposit_upload_urls.py @@ -0,0 +1,59 @@ +# Copyright (C) 2025 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import List, Tuple + +from django.db.models import FileField +from django.http.request import HttpRequest +from rest_framework import status + +from swh.deposit.api.common import APIGet +from swh.deposit.api.private import APIPrivateView, DepositReadMixin +from swh.deposit.config import ARCHIVE_TYPE +from swh.deposit.models import Deposit + + +class APIUploadURLs(APIPrivateView, APIGet, DepositReadMixin): + """ + Private API endpoint returning a list of URLs for downloading + tarballs uploaded with a deposit request. + + Only GET is supported. + + """ + + @classmethod + def _get_archive_url(cls, archive: FileField, request: HttpRequest) -> str: + url = archive.storage.url(archive.name) + if url.startswith("/"): + url = request.build_absolute_uri(url) + return url + + def process_get( + self, request: HttpRequest, collection_name: str, deposit: Deposit + ) -> Tuple[int, List[str], str]: + """ + Returns list of URLs for downloading tarballs uploaded with + a deposit request. + + Args: + request: input HTTP request + collection_name: Collection owning the deposit + deposit: Deposit to get tarball download URLs + + Returns: + Tuple status, list of URLs, content-type + + """ + upload_urls = [ + self._get_archive_url(r.archive, request) + # ensure that tarball URLs are sorted in ascending order of their upload + # dates as tarball contents will be aggregated into a single tarball by the + # deposit loader and the files they contain can overlap + for r in reversed( + list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) + ) + ] + return status.HTTP_200_OK, upload_urls, "application/json" diff --git a/swh/deposit/api/private/urls.py b/swh/deposit/api/private/urls.py index d0af733aab2d5d4a3c0b3e41bb04e5f2479d0d58..7a6fd74816a92819315baf46aec3fe67a34f87ab 100644 --- a/swh/deposit/api/private/urls.py +++ b/swh/deposit/api/private/urls.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2023 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,16 +6,16 @@ from django.urls import path from django.urls import re_path as url -from swh.deposit.api.private.deposit_check import APIChecks from swh.deposit.api.private.deposit_list import APIList, deposit_list_datatables from swh.deposit.api.private.deposit_read import APIReadArchives, APIReadMetadata from swh.deposit.api.private.deposit_releases import APIReleases from swh.deposit.api.private.deposit_update_status import APIUpdateStatus +from swh.deposit.api.private.deposit_upload_urls import APIUploadURLs from swh.deposit.config import ( - PRIVATE_CHECK_DEPOSIT, PRIVATE_GET_DEPOSIT_METADATA, PRIVATE_GET_RAW_CONTENT, PRIVATE_GET_RELEASES, + PRIVATE_GET_UPLOAD_URLS, PRIVATE_LIST_DEPOSITS, PRIVATE_LIST_DEPOSITS_DATATABLES, PRIVATE_PUT_DEPOSIT, @@ -43,13 +43,6 @@ urlpatterns = [ APIReadMetadata.as_view(), name=PRIVATE_GET_DEPOSIT_METADATA, ), - # Check archive and metadata information on a specific deposit - # -> GET - url( - r"^(?P<collection_name>[^/]+)/(?P<deposit_id>[^/]+)/check/$", - APIChecks.as_view(), - name=PRIVATE_CHECK_DEPOSIT, - ), # Retrieve deposit's raw archives' content # -> GET url( @@ -71,13 +64,6 @@ urlpatterns = [ APIReadMetadata.as_view(), name=PRIVATE_GET_DEPOSIT_METADATA + "-nc", ), - # Check archive and metadata information on a specific deposit - # -> GET - url( - r"^(?P<deposit_id>[^/]+)/check/$", - APIChecks.as_view(), - name=PRIVATE_CHECK_DEPOSIT + "-nc", - ), url(r"^deposits/$", APIList.as_view(), name=PRIVATE_LIST_DEPOSITS), url( r"^deposits/datatables/$", @@ -91,4 +77,11 @@ urlpatterns = [ APIReleases.as_view(), name=PRIVATE_GET_RELEASES, ), + # Retrieve download URLs for the tarballs uploaded with a deposit + # -> GET + path( + "<int:deposit_id>/upload-urls/", + APIUploadURLs.as_view(), + name=PRIVATE_GET_UPLOAD_URLS, + ), ] diff --git a/swh/deposit/config.py b/swh/deposit/config.py index f3d51ad8455162ffca3c5ed7cd307cb99ce13bf4..bfb1f98da04999e2a77470cd97fae197ab5b432e 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -9,10 +9,6 @@ from typing import Any, Dict, Optional from swh.core import config from swh.deposit import __version__ from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher -from swh.scheduler import get_scheduler -from swh.scheduler.interface import SchedulerInterface -from swh.storage import get_storage -from swh.storage.interface import StorageInterface # IRIs (Internationalized Resource identifier) sword 2.0 specified EDIT_IRI = "edit_iri" @@ -23,12 +19,12 @@ SD_IRI = "servicedocument" COL_IRI = "upload" STATE_IRI = "state_iri" PRIVATE_GET_RAW_CONTENT = "private-download" -PRIVATE_CHECK_DEPOSIT = "check-deposit" PRIVATE_PUT_DEPOSIT = "private-update" PRIVATE_GET_DEPOSIT_METADATA = "private-read" PRIVATE_LIST_DEPOSITS = "private-deposit-list" PRIVATE_LIST_DEPOSITS_DATATABLES = "private-deposit-list-datatables" PRIVATE_GET_RELEASES = "private-releases" +PRIVATE_GET_UPLOAD_URLS = "private-upload-urls" ARCHIVE_KEY = "archive" RAW_METADATA_KEY = "raw-metadata" @@ -99,6 +95,11 @@ class APIConfig: """ def __init__(self): + from swh.scheduler import get_scheduler + from swh.scheduler.interface import SchedulerInterface + from swh.storage import get_storage + from swh.storage.interface import StorageInterface + self.config: Dict[str, Any] = config.load_from_envvar(DEFAULT_CONFIG) self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"]) self.tool = { diff --git a/swh/deposit/loader/checker.py b/swh/deposit/loader/checker.py index 8998a803ad8f270862091e671c30cc2b8d4c07e3..68439a421a005cc41587cb5291b87035cde7db29 100644 --- a/swh/deposit/loader/checker.py +++ b/swh/deposit/loader/checker.py @@ -1,18 +1,148 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from itertools import chain import logging -from typing import Any, Dict +import os +import re +from shutil import get_unpack_formats +import tarfile +import tempfile +from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import urlparse +from xml.etree import ElementTree +import zipfile +import requests import sentry_sdk from swh.core import config from swh.deposit.client import PrivateApiDepositClient +from swh.deposit.config import DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED +from swh.deposit.loader.checks import check_metadata logger = logging.getLogger(__name__) +MANDATORY_ARCHIVE_UNREADABLE = ( + "At least one of its associated archives is not readable" # noqa +) +MANDATORY_ARCHIVE_INVALID = ( + "Mandatory archive is invalid (i.e contains only one archive)" # noqa +) +MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" +MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" + +ARCHIVE_EXTENSIONS = [ + "zip", + "tar", + "tar.gz", + "xz", + "tar.xz", + "bz2", + "tar.bz2", + "Z", + "tar.Z", + "tgz", + "7z", +] + +PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) + + +def known_archive_format(filename): + return any( + filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) + ) + + +def _check_archive(archive_url: str) -> Tuple[bool, Optional[str]]: + """Check that a deposit associated archive is ok: + - readable + - supported archive format + - valid content: the archive does not contain a single archive file + + If any of those checks are not ok, return the corresponding + failing check. + + Args: + archive_path (DepositRequest): Archive to check + + Returns: + (True, None) if archive is check compliant, (False, + <detail-error>) otherwise. + + """ + parsed_archive_url = urlparse(archive_url) + archive_name = os.path.basename(parsed_archive_url.path) + + if not known_archive_format(archive_name): + return False, MANDATORY_ARCHIVE_UNSUPPORTED + + try: + response = requests.get(archive_url, stream=True) + with tempfile.TemporaryDirectory() as tmpdir: + archive_path = os.path.join(tmpdir, archive_name) + with open(archive_path, "wb") as archive_fp: + for chunk in response.iter_content(chunk_size=10 * 1024 * 1024): + archive_fp.write(chunk) + with open(archive_path, "rb") as archive_fp: + try: + with zipfile.ZipFile(archive_fp) as zip_fp: + files = zip_fp.namelist() + except Exception: + try: + # rewind since the first tryout reading may have moved the + # cursor + archive_fp.seek(0) + with tarfile.open(fileobj=archive_fp) as tar_fp: + files = tar_fp.getnames() + except Exception: + return False, MANDATORY_ARCHIVE_UNSUPPORTED + except Exception: + return False, MANDATORY_ARCHIVE_UNREADABLE + if len(files) > 1: + return True, None + element = files[0] + if PATTERN_ARCHIVE_EXTENSION.match(element): + # archive in archive! + return False, MANDATORY_ARCHIVE_INVALID + return True, None + + +def _check_deposit_archives( + archive_urls: List[str], +) -> Tuple[bool, Optional[Dict]]: + """Given a deposit, check each deposit request of type archive. + + Args: + The deposit to check archives for + + Returns + tuple (status, details): True, None if all archives + are ok, (False, <detailed-error>) otherwise. + + """ + if len(archive_urls) == 0: # no associated archive is refused + return False, { + "archive": [ + { + "summary": MANDATORY_ARCHIVE_MISSING, + } + ] + } + + errors = [] + for archive_url in archive_urls: + check, error_message = _check_archive(archive_url) + if not check: + errors.append({"summary": error_message}) + + if not errors: + return True, None + return False, {"archive": errors} + class DepositChecker: """Deposit checker implementation. @@ -25,17 +155,58 @@ class DepositChecker: self.config: Dict[str, Any] = config.load_from_envvar() self.client = PrivateApiDepositClient(config=self.config["deposit"]) - def check(self, collection: str, deposit_id: str) -> Dict[str, str]: + def check(self, collection: str, deposit_id: str) -> Dict[str, Any]: status = None - deposit_check_url = f"/{collection}/{deposit_id}/check/" - logger.debug("deposit-check-url: %s", deposit_check_url) + deposit_upload_urls = f"/{deposit_id}/upload-urls/" + logger.debug("deposit-upload-urls: %s", deposit_upload_urls) + details_dict: Dict = {} try: - r = self.client.check(deposit_check_url) - logger.debug("Check result: %s", r) - status = "eventful" if r == "verified" else "failed" - except Exception: - logger.exception("Failure during check on '%s'", deposit_check_url) + raw_metadata = self.client.metadata_get(f"/{deposit_id}/meta/").get( + "raw_metadata" + ) + + # will check each deposit's associated request (both of type + # archive and metadata) for errors + + archive_urls = self.client.do("GET", deposit_upload_urls).json() + logger.debug("deposit-upload-urls result: %s", archive_urls) + + archives_status_ok, details = _check_deposit_archives(archive_urls) + + if not archives_status_ok: + assert details is not None + details_dict.update(details) + + if raw_metadata is None: + metadata_status_ok = False + details_dict["metadata"] = [{"summary": "Missing Atom document"}] + else: + metadata_tree = ElementTree.fromstring(raw_metadata) + metadata_status_ok, details = check_metadata(metadata_tree) + # Ensure in case of error, we do have the rejection details + assert metadata_status_ok or ( + not metadata_status_ok and details is not None + ) + # we can have warnings even if checks are ok (e.g. missing suggested field) + details_dict.update(details or {}) + + deposit_status_ok = archives_status_ok and metadata_status_ok + # if any details_dict arose, the deposit is rejected + + status = ( + DEPOSIT_STATUS_VERIFIED + if deposit_status_ok + else DEPOSIT_STATUS_REJECTED + ) + + self.client.status_update( + f"/{deposit_id}/update/", status=status, status_detail=details_dict + ) + + status = "eventful" if status == DEPOSIT_STATUS_VERIFIED else "failed" + except Exception as e: sentry_sdk.capture_exception() status = "failed" + details_dict["exception"] = f"{e.__class__.__name__}: {str(e)}" logger.debug("Check status: %s", status) - return {"status": status} + return {"status": status, "status_detail": details_dict} diff --git a/swh/deposit/api/checks.py b/swh/deposit/loader/checks.py similarity index 97% rename from swh/deposit/api/checks.py rename to swh/deposit/loader/checks.py index dd24e65047eca7fb15bcd7de41fa74acd0833d2f..1fd227b67f7177cc8eb67332ff1c622344996d33 100644 --- a/swh/deposit/api/checks.py +++ b/swh/deposit/loader/checks.py @@ -24,7 +24,6 @@ from xml.etree import ElementTree import xmlschema -from swh.deposit.errors import FORBIDDEN, DepositError from swh.deposit.utils import NAMESPACES, parse_swh_metadata_provenance MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" @@ -404,17 +403,3 @@ def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]] return True, {"metadata": suggested_fields} return True, None - - -def check_url_match_provider(url: str, provider_url: str) -> None: - """Check url matches the provider url. - - Raises DepositError in case of mismatch - - """ - provider_url = provider_url.rstrip("/") + "/" - if not url.startswith(provider_url): - raise DepositError( - FORBIDDEN, - f"URL mismatch: {url} must start with {provider_url}", - ) diff --git a/swh/deposit/settings/common.py b/swh/deposit/settings/common.py index 5a85b99de2436da3aeb30ea4e145f27397fdd890..e244198832cff6c78f709fb3d3434264a424a781 100644 --- a/swh/deposit/settings/common.py +++ b/swh/deposit/settings/common.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -122,3 +122,5 @@ CACHES = { "BACKEND": "django.core.cache.backends.locmem.LocMemCache", } } + +MEDIA_URL = "uploads/" diff --git a/swh/deposit/settings/production.py b/swh/deposit/settings/production.py index 5008f85a620de25690db204f1c7e2ff568101f15..e9f549f449ceba2d834f32b67cc994cab6e9fe6f 100644 --- a/swh/deposit/settings/production.py +++ b/swh/deposit/settings/production.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2023 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -127,6 +127,8 @@ if cfg_azure: azure_container=cfg_azure["container_name"], connection_string=cfg_azure["connection_string"], timeout=cfg_azure.get("connection_timeout", 120), + # ensure to generate temporary download links with shared access signature + expiration_secs=cfg_azure.get("expiration_secs", 1800), ) # Which may be enhanced with some extra options, lookup "object_parameters" in diff --git a/swh/deposit/tests/api/conftest.py b/swh/deposit/tests/api/conftest.py index 234053a01e7ea679fc7ac850bf3c457e41fc8056..c9f24668574a42a79a339a66b30a673b16f25afb 100644 --- a/swh/deposit/tests/api/conftest.py +++ b/swh/deposit/tests/api/conftest.py @@ -1,24 +1,14 @@ -# Copyright (C) 2019-2023 The Software Heritage developers +# Copyright (C) 2019-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import hashlib import os -from django.urls import reverse_lazy as reverse from django.utils import timezone import pytest -from swh.deposit.api.private.deposit_check import APIChecks -from swh.deposit.config import ( - COL_IRI, - DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_VERIFIED, -) -from swh.deposit.models import Deposit -from swh.deposit.parsers import parse_xml -from swh.deposit.utils import NAMESPACES +from swh.deposit.config import DEPOSIT_STATUS_VERIFIED @pytest.fixture @@ -27,15 +17,6 @@ def datadir(request): return os.path.join(os.path.dirname(str(request.fspath)), "../data") -@pytest.fixture -def ready_deposit_ok(partial_deposit_with_metadata): - """Returns a deposit ready for checks (it will pass the checks).""" - deposit = partial_deposit_with_metadata - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - return deposit - - @pytest.fixture def ready_deposit_verified(partial_deposit_with_metadata): """Returns a verified deposit.""" @@ -44,49 +25,3 @@ def ready_deposit_verified(partial_deposit_with_metadata): deposit.complete_date = timezone.now() deposit.save() return deposit - - -@pytest.fixture -def ready_deposit_only_metadata(partial_deposit_only_metadata): - """Deposit in status ready that will fail the checks (because missing - archive). - - """ - deposit = partial_deposit_only_metadata - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - return deposit - - -@pytest.fixture -def ready_deposit_invalid_archive(authenticated_client, deposit_collection): - url = reverse(COL_IRI, args=[deposit_collection.name]) - - data = b"some data which is clearly not a zip file" - md5sum = hashlib.md5(data).hexdigest() - - # when - response = authenticated_client.post( - url, - content_type="application/zip", # as zip - data=data, - # + headers - CONTENT_LENGTH=len(data), - # other headers needs HTTP_ prefix to be taken into account - HTTP_SLUG="external-id-invalid", - HTTP_CONTENT_MD5=md5sum, - HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", - HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", - ) - - response_content = parse_xml(response.content) - deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) - deposit = Deposit.objects.get(pk=deposit_id) - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - return deposit - - -@pytest.fixture -def swh_checks_deposit(): - return APIChecks() diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py index 7a95846edb0eb3cc6ececfe023bc94cd6c7e8126..520a9ae188b47877a0a6837fb4735ffae3a44f1e 100644 --- a/swh/deposit/tests/api/test_checks.py +++ b/swh/deposit/tests/api/test_checks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2024 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -14,7 +14,7 @@ from xml.etree import ElementTree import pytest -from swh.deposit.api.checks import ( +from swh.deposit.loader.checks import ( METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING, check_metadata, @@ -396,7 +396,7 @@ _parameters1 = [ "metadata_ok", _parameters1, ) -def test_api_checks_check_metadata_ok(metadata_ok, swh_checks_deposit): +def test_api_checks_check_metadata_ok(metadata_ok): actual_check, detail = check_metadata(ElementTree.fromstring(metadata_ok)) assert actual_check is True, f"Unexpected result:\n{pprint.pformat(detail)}" if "swh:deposit" in metadata_ok: @@ -662,9 +662,7 @@ _parameters2 = [ @pytest.mark.parametrize("metadata_ko,expected_summary", _parameters2) -def test_api_checks_check_metadata_ko( - metadata_ko, expected_summary, swh_checks_deposit -): +def test_api_checks_check_metadata_ko(metadata_ko, expected_summary): actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert error_detail == {"metadata": [expected_summary]} @@ -1216,9 +1214,7 @@ _parameters3 = [ @pytest.mark.parametrize("metadata_ko,expected_summaries", _parameters3) -def test_api_checks_check_metadata_ko_schema( - metadata_ko, expected_summaries, swh_checks_deposit -): +def test_api_checks_check_metadata_ko_schema(metadata_ko, expected_summaries): actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert len(error_detail["metadata"]) == len(expected_summaries), error_detail[ diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py deleted file mode 100644 index f3cb00ce64314aee977901f5f5aeed62ddcc8a23..0000000000000000000000000000000000000000 --- a/swh/deposit/tests/api/test_deposit_private_check.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright (C) 2017-2023 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import random - -from django.urls import reverse_lazy as reverse -import pytest -from rest_framework import status - -from swh.deposit.api.checks import METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING -from swh.deposit.api.private.deposit_check import ( - MANDATORY_ARCHIVE_INVALID, - MANDATORY_ARCHIVE_MISSING, - MANDATORY_ARCHIVE_UNSUPPORTED, -) -from swh.deposit.config import ( - COL_IRI, - DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_PARTIAL, - DEPOSIT_STATUS_REJECTED, - DEPOSIT_STATUS_VERIFIED, - PRIVATE_CHECK_DEPOSIT, - SE_IRI, -) -from swh.deposit.models import Deposit -from swh.deposit.parsers import parse_xml -from swh.deposit.tests.common import ( - SUPPORTED_TARBALL_MODES, - create_arborescence_archive, - create_archive_with_archive, - post_archive, - post_atom, -) -from swh.deposit.utils import NAMESPACES - -PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + "-nc" - - -def private_check_url_endpoints(collection, deposit): - """There are 2 endpoints to check (one with collection, one without)""" - return [ - reverse(PRIVATE_CHECK_DEPOSIT, args=[collection.name, deposit.id]), - reverse(PRIVATE_CHECK_DEPOSIT_NC, args=[deposit.id]), - ] - - -@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) -def test_deposit_ok( - tmp_path, authenticated_client, deposit_collection, extension, atom_dataset -): - """Proper deposit should succeed the checks (-> status ready)""" - deposit = create_deposit_with_archive( - tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset - ) - - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - assert data["status"] == DEPOSIT_STATUS_VERIFIED, data - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_VERIFIED - - # Deposit is ok but it's missing suggested fields in its metadata detected by - # the checks - status_detail = deposit.status_detail["metadata"] - assert len(status_detail) == 1 - suggested = status_detail[0] - assert suggested["summary"] == SUGGESTED_FIELDS_MISSING - assert set(suggested["fields"]) == set([METADATA_PROVENANCE_KEY]) - - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - - -@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) -def test_deposit_invalid_tarball( - tmp_path, authenticated_client, deposit_collection, extension, atom_dataset -): - """Deposit with tarball (of 1 tarball) should fail the checks: rejected""" - deposit = create_deposit_archive_with_archive( - tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset - ) - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - assert response.status_code == status.HTTP_200_OK - data = response.json() - assert data["status"] == DEPOSIT_STATUS_REJECTED - details = data["details"] - # archive checks failure - assert len(details["archive"]) == 1 - assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_INVALID - - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_REJECTED - - -def test_deposit_ko_missing_tarball( - authenticated_client, deposit_collection, ready_deposit_only_metadata -): - """Deposit without archive should fail the checks: rejected""" - deposit = ready_deposit_only_metadata - assert deposit.status == DEPOSIT_STATUS_DEPOSITED - - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - assert data["status"] == DEPOSIT_STATUS_REJECTED - details = data["details"] - # archive checks failure - assert len(details["archive"]) == 1 - assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_MISSING - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_REJECTED - - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - - -def test_deposit_ko_unsupported_tarball( - tmp_path, authenticated_client, deposit_collection, ready_deposit_invalid_archive -): - """Deposit with unsupported tarball should fail checks and be rejected""" - deposit = ready_deposit_invalid_archive - assert DEPOSIT_STATUS_DEPOSITED == deposit.status - - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - assert data["status"] == DEPOSIT_STATUS_REJECTED - details = data["details"] - - # archive checks failure - assert len(details["archive"]) == 1 - assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED - # metadata check failure - assert len(details["metadata"]) == 1 - mandatory = details["metadata"][0] - assert mandatory["summary"] == "Missing Atom document" - - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_REJECTED - - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - - -def test_deposit_ko_unsupported_tarball_prebasic_check( - tmp_path, authenticated_client, deposit_collection, atom_dataset -): - """Deposit with unsupported tarball extension should fail checks and be rejected""" - - invalid_gz_mode = random.choice( - [f"{ext}-foobar" for ext in SUPPORTED_TARBALL_MODES] - ) - invalid_extension = f"tar.{invalid_gz_mode}" - - deposit = create_deposit_with_archive( - tmp_path, - invalid_extension, - authenticated_client, - deposit_collection.name, - atom_dataset, - ) - assert DEPOSIT_STATUS_DEPOSITED == deposit.status - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - - assert data["status"] == DEPOSIT_STATUS_REJECTED - details = data["details"] - # archive checks failure - assert len(details["archive"]) == 1 - assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED - - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_REJECTED - - -def test_check_deposit_metadata_ok( - authenticated_client, deposit_collection, ready_deposit_ok -): - """Proper deposit should succeed the checks (-> status ready) - with all **MUST** metadata - - using the codemeta metadata test set - """ - deposit = ready_deposit_ok - assert deposit.status == DEPOSIT_STATUS_DEPOSITED - - for url in private_check_url_endpoints(deposit_collection, deposit): - response = authenticated_client.get(url) - - assert response.status_code == status.HTTP_200_OK - data = response.json() - - assert data["status"] == DEPOSIT_STATUS_VERIFIED, data - deposit = Deposit.objects.get(pk=deposit.id) - assert deposit.status == DEPOSIT_STATUS_VERIFIED - - deposit.status = DEPOSIT_STATUS_DEPOSITED - deposit.save() - - -def create_deposit(archive, client, collection_name, atom_dataset): - """Create a deposit with archive (and metadata) for client in the collection name.""" - # we deposit it - response = post_archive( - client, - reverse(COL_IRI, args=[collection_name]), - archive, - content_type="application/x-tar", - slug="external-id", - in_progress=True, - ) - - # then - assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(response.content) - deposit_status = response_content.findtext( - "swh:deposit_status", namespaces=NAMESPACES - ) - assert deposit_status == DEPOSIT_STATUS_PARTIAL - deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) - - origin_url = client.deposit_client.provider_url - response = post_atom( - client, - reverse(SE_IRI, args=[collection_name, deposit_id]), - data=atom_dataset["entry-data0"] % origin_url, - in_progress=False, - ) - - assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(response.content) - deposit_status = response_content.findtext( - "swh:deposit_status", namespaces=NAMESPACES - ) - assert deposit_status == DEPOSIT_STATUS_DEPOSITED - - deposit = Deposit.objects.get(pk=deposit_id) - assert DEPOSIT_STATUS_DEPOSITED == deposit.status - return deposit - - -def create_deposit_with_archive( - root_path, archive_extension, client, collection_name, atom_dataset -): - """Create a deposit with a valid archive.""" - # we create the holding archive to a given extension - archive = create_arborescence_archive( - root_path, - "archive1", - "file1", - b"some content in file", - extension=archive_extension, - ) - - return create_deposit(archive, client, collection_name, atom_dataset) - - -def create_deposit_archive_with_archive( - root_path, archive_extension, client, collection_name, atom_dataset -): - """Create a deposit with an invalid archive (archive within archive)""" - - # we create the holding archive to a given extension - archive = create_arborescence_archive( - root_path, - "archive1", - "file1", - b"some content in file", - extension=archive_extension, - ) - - # now we create an archive holding the first created archive - invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) - - return create_deposit(invalid_archive, client, collection_name, atom_dataset) diff --git a/swh/deposit/tests/api/test_deposit_private_upload_urls.py b/swh/deposit/tests/api/test_deposit_private_upload_urls.py new file mode 100644 index 0000000000000000000000000000000000000000..4b8819fa608672fd6ccdb4d76d5c03aa6de54cc9 --- /dev/null +++ b/swh/deposit/tests/api/test_deposit_private_upload_urls.py @@ -0,0 +1,190 @@ +# Copyright (C) 2025 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from hashlib import sha1 +import os +import secrets +import shutil +import subprocess + +from azure.storage.blob import BlobServiceClient +from django.urls import reverse_lazy as reverse +import pytest +import requests +from rest_framework import status + +from swh.deposit.config import DEPOSIT_STATUS_PARTIAL, EM_IRI, PRIVATE_GET_UPLOAD_URLS +from swh.deposit.tests.common import create_arborescence_archive +from swh.deposit.tests.conftest import create_deposit + +AZURITE_EXE = shutil.which( + "azurite-blob", path=os.environ.get("AZURITE_PATH", os.environ.get("PATH")) +) + + +@pytest.fixture(scope="module") +def azure_container_name(): + return secrets.token_hex(10) + + +@pytest.fixture(scope="module") +def azurite_connection_string(tmpdir_factory): + host = "127.0.0.1" + + azurite_path = tmpdir_factory.mktemp("azurite") + + azurite_proc = subprocess.Popen( + [ + AZURITE_EXE, + "--blobHost", + host, + "--blobPort", + "0", + ], + stdout=subprocess.PIPE, + cwd=azurite_path, + ) + + prefix = b"Azurite Blob service successfully listens on " + for line in azurite_proc.stdout: + if line.startswith(prefix): + base_url = line[len(prefix) :].decode().strip() + break + else: + assert False, "Did not get Azurite Blob service port." + + # https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite#well-known-storage-account-and-key + account_name = "devstoreaccount1" + account_key = ( + "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq" + "/K1SZFPTOtr/KBHBeksoGMGw==" + ) + + container_url = f"{base_url}/{account_name}" + + # unset proxy set by the swh_proxy session fixture + del os.environ["http_proxy"] + del os.environ["https_proxy"] + + yield ( + f"DefaultEndpointsProtocol=https;" + f"AccountName={account_name};" + f"AccountKey={account_key};" + f"BlobEndpoint={container_url};" + ) + + # reset proxy + os.environ["http_proxy"] = "http://localhost:999" + os.environ["https_proxy"] = "http://localhost:999" + + azurite_proc.kill() + azurite_proc.wait(2) + + +def check_deposit_upload_urls( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, +): + # Add a first archive to deposit + partial_deposit = create_deposit( + authenticated_client, + collection_name=deposit_collection.name, + sample_archive=sample_archive, + external_id="external-id", + deposit_status=DEPOSIT_STATUS_PARTIAL, + ) + + # Add a second archive to deposit + archive2 = create_arborescence_archive( + tmp_path, "archive2", "file2", b"some other content in file" + ) + update_uri = reverse(EM_IRI, args=[deposit_collection.name, partial_deposit.id]) + response = authenticated_client.post( + update_uri, + content_type="application/zip", # as zip + data=archive2["data"], + # + headers + CONTENT_LENGTH=archive2["length"], + HTTP_SLUG=partial_deposit.external_id, + HTTP_CONTENT_MD5=archive2["md5sum"], + HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", + HTTP_IN_PROGRESS="false", + HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), + ) + assert response.status_code == status.HTTP_201_CREATED + + # check uploaded tarballs can be downloaded using URLs and + # compare download contents with originals + url = reverse(PRIVATE_GET_UPLOAD_URLS, args=[partial_deposit.id]) + response = authenticated_client.get(url) + upload_urls = response.json() + assert len(upload_urls) == 2 + assert "archive1.zip" in upload_urls[0] + assert "archive2.zip" in upload_urls[1] + tarball_shasums = set() + for upload_url in upload_urls: + response = ( + # when storage backend is local filesystem and served by django + authenticated_client.get(upload_url) + if upload_url.startswith("http://testserver/") + # when storage backend is azurite + else requests.get(upload_url) + ) + assert response.status_code == status.HTTP_200_OK + tarball_shasums.add( + sha1( + b"".join(response.streaming_content) + if hasattr(response, "streaming_content") + else response.content + ).hexdigest() + ) + + assert tarball_shasums == {sample_archive["sha1sum"], archive2["sha1sum"]} + + +def test_deposit_upload_urls_local_filesystem_storage_backend( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, +): + check_deposit_upload_urls( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, + ) + + +@pytest.mark.skipif(not AZURITE_EXE, reason="azurite not found in AZURITE_PATH or PATH") +def test_deposit_upload_urls_azure_storage_backend( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, + settings, + azurite_connection_string, + azure_container_name, +): + blob_client = BlobServiceClient.from_connection_string(azurite_connection_string) + blob_client.create_container(azure_container_name) + settings.STORAGES = { + "default": { + "BACKEND": "storages.backends.azure_storage.AzureStorage", + "OPTIONS": { + "connection_string": azurite_connection_string, + "azure_container": azure_container_name, + "expiration_secs": 1800, + }, + }, + } + check_deposit_upload_urls( + tmp_path, + authenticated_client, + deposit_collection, + sample_archive, + ) diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py index 9c4baed2d12874d69a521187e89381263ae56a3f..99e291f3103ce4f2954dd830537837aa5a54b009 100644 --- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2022 The Software Heritage developers +# Copyright (C) 2020-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -15,11 +15,6 @@ from xml.etree import ElementTree import pytest import yaml -from swh.deposit.api.checks import ( - METADATA_PROVENANCE_KEY, - SUGGESTED_FIELDS_MISSING, - check_metadata, -) from swh.deposit.cli import deposit as cli from swh.deposit.cli.client import InputError, _collection, _url, generate_metadata from swh.deposit.client import ( @@ -28,6 +23,11 @@ from swh.deposit.client import ( PublicApiDepositClient, ServiceDocumentDepositClient, ) +from swh.deposit.loader.checks import ( + METADATA_PROVENANCE_KEY, + SUGGESTED_FIELDS_MISSING, + check_metadata, +) from swh.deposit.parsers import parse_xml from swh.deposit.tests.conftest import TEST_USER from swh.deposit.utils import NAMESPACES diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index 2c545d99cd704de22a31db555a098db6cdbd7434..a59791799d70e7e104f6ba05f64d5a08d533cde0 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2024 The Software Heritage developers +# Copyright (C) 2019-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,6 +6,7 @@ import base64 from copy import deepcopy from functools import partial +import hashlib import os import re from typing import TYPE_CHECKING, Dict, Mapping @@ -35,6 +36,8 @@ from swh.deposit.config import ( SE_IRI, setup_django_for, ) +from swh.deposit.models import Deposit +from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( create_arborescence_archive, post_archive, @@ -46,7 +49,7 @@ from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.scheduler import get_scheduler if TYPE_CHECKING: - from swh.deposit.models import Deposit, DepositClient, DepositCollection + from swh.deposit.models import DepositClient, DepositCollection # mypy is asked to ignore the import statement above because setup_databases @@ -594,6 +597,56 @@ def complete_deposit(sample_archive, deposit_collection, authenticated_client): return deposit +@pytest.fixture +def ready_deposit_invalid_archive(authenticated_client, deposit_collection): + url = reverse(COL_IRI, args=[deposit_collection.name]) + + data = b"some data which is clearly not a zip file" + md5sum = hashlib.md5(data).hexdigest() + + # when + response = authenticated_client.post( + url, + content_type="application/zip", # as zip + data=data, + # + headers + CONTENT_LENGTH=len(data), + # other headers needs HTTP_ prefix to be taken into account + HTTP_SLUG="external-id-invalid", + HTTP_CONTENT_MD5=md5sum, + HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", + HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", + ) + + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) + deposit = Deposit.objects.get(pk=deposit_id) + deposit.status = DEPOSIT_STATUS_DEPOSITED + deposit.save() + return deposit + + +@pytest.fixture +def ready_deposit_ok(partial_deposit_with_metadata): + """Returns a deposit ready for checks (it will pass the checks).""" + deposit = partial_deposit_with_metadata + deposit.status = DEPOSIT_STATUS_DEPOSITED + deposit.save() + return deposit + + +@pytest.fixture +def ready_deposit_only_metadata(partial_deposit_only_metadata): + """Deposit in status ready that will fail the checks (because missing + archive). + + """ + deposit = partial_deposit_only_metadata + deposit.status = DEPOSIT_STATUS_DEPOSITED + deposit.save() + return deposit + + @pytest.fixture() def tmp_path(tmp_path): return str(tmp_path) # issue with oldstable's pytest version diff --git a/swh/deposit/tests/data/atom/entry-data0.xml b/swh/deposit/tests/data/atom/entry-data0.xml index 647dbe9f5dce21f9b93129076a2d55501d992deb..7ea58e4439666c5b792523aebde2141e36413730 100644 --- a/swh/deposit/tests/data/atom/entry-data0.xml +++ b/swh/deposit/tests/data/atom/entry-data0.xml @@ -1,9 +1,10 @@ <?xml version="1.0"?> <entry xmlns="http://www.w3.org/2005/Atom" + xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit"> <title>Awesome Compiler</title> <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> - <updated>2017-10-07T15:17:08Z</updated> + <codemeta:datePublished>2017-10-07T15:17:08Z</codemeta:datePublished> <author>some awesome author</author> <name>awesome-compiler</name> diff --git a/swh/deposit/tests/data/atom/entry-data1.xml b/swh/deposit/tests/data/atom/entry-data1.xml index 8a75ef12017942516b9fb755ccbc21b750448f72..c573500f17705d583d7f2e3c92611b5126f12fbd 100644 --- a/swh/deposit/tests/data/atom/entry-data1.xml +++ b/swh/deposit/tests/data/atom/entry-data1.xml @@ -1,7 +1,8 @@ <?xml version="1.0"?> -<entry xmlns="http://www.w3.org/2005/Atom"> +<entry xmlns="http://www.w3.org/2005/Atom" xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"> <id>urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a</id> <updated>2017-10-07T15:17:08Z</updated> + <codemeta:datePublished>2017-10-07T15:17:08Z</codemeta:datePublished> <author>some awesome author</author> <name>awesome-compiler</name> </entry> diff --git a/swh/deposit/tests/loader/conftest.py b/swh/deposit/tests/loader/conftest.py index 2f8f9e6bc2bf95365c6db1c1aade8eda312ee984..74eae66c118e28245ccc3424fe120b26c847b558 100644 --- a/swh/deposit/tests/loader/conftest.py +++ b/swh/deposit/tests/loader/conftest.py @@ -1,23 +1,17 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest -from swh.deposit.loader.checker import DepositChecker - @pytest.fixture -def deposit_config(tmp_path): +def deposit_config(deposit_config): return { + **deposit_config, "deposit": { "url": "https://deposit.softwareheritage.org/1/private/", "auth": {}, - } + }, } - - -@pytest.fixture -def deposit_checker(deposit_config_path): - return DepositChecker() diff --git a/swh/deposit/tests/loader/test_checker.py b/swh/deposit/tests/loader/test_checker.py index 56189517035cd0d633e5537519d3c717ec1d2a69..6ac589787e1c6fd44dcd26ca31e0e3708bcb88df 100644 --- a/swh/deposit/tests/loader/test_checker.py +++ b/swh/deposit/tests/loader/test_checker.py @@ -1,26 +1,512 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from unittest.mock import patch +import os +import random +from django.urls import reverse_lazy as reverse +import pytest +from rest_framework import status -def test_checker_deposit_ready(requests_mock_datadir, deposit_checker): - """Check on a valid 'deposited' deposit should result in 'verified'""" - actual_result = deposit_checker.check(collection="test", deposit_id=1) - assert actual_result == {"status": "eventful"} +from swh.deposit.config import ( + COL_IRI, + DEPOSIT_STATUS_DEPOSITED, + DEPOSIT_STATUS_PARTIAL, + PRIVATE_GET_DEPOSIT_METADATA, + PRIVATE_GET_UPLOAD_URLS, + PRIVATE_PUT_DEPOSIT, + SE_IRI, +) +from swh.deposit.loader.checker import ( + MANDATORY_ARCHIVE_INVALID, + MANDATORY_ARCHIVE_MISSING, + MANDATORY_ARCHIVE_UNSUPPORTED, + DepositChecker, +) +from swh.deposit.loader.checks import METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING +from swh.deposit.models import Deposit +from swh.deposit.parsers import parse_xml +from swh.deposit.tests.common import ( + SUPPORTED_TARBALL_MODES, + create_arborescence_archive, + create_archive_with_archive, + post_archive, + post_atom, +) +from swh.deposit.utils import NAMESPACES +PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc" +PRIVATE_PUT_DEPOSIT_NC = PRIVATE_PUT_DEPOSIT + "-nc" -def test_checker_deposit_rejected(requests_mock_datadir, deposit_checker): - """Check on invalid 'deposited' deposit should result in 'rejected'""" - actual_result = deposit_checker.check(collection="test", deposit_id=2) - assert actual_result == {"status": "failed"} +BASE_URL = "https://deposit.softwareheritage.org" -@patch("swh.deposit.client.requests.get") -def test_checker_deposit_rejected_exception(mock_requests, deposit_checker): - """Check on invalid 'deposited' deposit should result in 'rejected'""" - mock_requests.side_effect = ValueError("simulated problem when checking") - actual_result = deposit_checker.check(collection="test", deposit_id=3) - assert actual_result == {"status": "failed"} +def create_deposit(archive, client, collection_name, atom_dataset): + """Create a deposit with archive (and metadata) for client in the collection name.""" + # we deposit it + response = post_archive( + client, + reverse(COL_IRI, args=[collection_name]), + archive, + content_type="application/x-tar", + slug="external-id", + in_progress=True, + ) + + # then + assert response.status_code == status.HTTP_201_CREATED + response_content = parse_xml(response.content) + deposit_status = response_content.findtext( + "swh:deposit_status", namespaces=NAMESPACES + ) + assert deposit_status == DEPOSIT_STATUS_PARTIAL + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) + + origin_url = client.deposit_client.provider_url + response = post_atom( + client, + reverse(SE_IRI, args=[collection_name, deposit_id]), + data=atom_dataset["entry-data0"] % origin_url, + in_progress=False, + ) + + assert response.status_code == status.HTTP_201_CREATED + response_content = parse_xml(response.content) + deposit_status = response_content.findtext( + "swh:deposit_status", namespaces=NAMESPACES + ) + assert deposit_status == DEPOSIT_STATUS_DEPOSITED + + deposit = Deposit.objects.get(pk=deposit_id) + assert DEPOSIT_STATUS_DEPOSITED == deposit.status + return deposit + + +def create_deposit_with_archive( + root_path, archive_extension, client, collection_name, atom_dataset +): + """Create a deposit with a valid archive.""" + # we create the holding archive to a given extension + archive = create_arborescence_archive( + root_path, + "archive1", + "file1", + b"some content in file", + extension=archive_extension, + ) + + return create_deposit(archive, client, collection_name, atom_dataset) + + +def create_deposit_archive_with_archive( + root_path, archive_extension, client, collection_name, atom_dataset +): + """Create a deposit with an invalid archive (archive within archive)""" + + # we create the holding archive to a given extension + archive = create_arborescence_archive( + root_path, + "archive1", + "file1", + b"some content in file", + extension=archive_extension, + ) + + # now we create an archive holding the first created archive + invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) + + return create_deposit(invalid_archive, client, collection_name, atom_dataset) + + +@pytest.fixture +def deposit_checker(): + return DepositChecker() + + +@pytest.fixture +def datadir(): + """Override default datadir to target main test datadir""" + return os.path.join(os.path.dirname(__file__), "../data") + + +@pytest.fixture +def deposited_deposit_valid_metadata(partial_deposit_with_metadata): + partial_deposit_with_metadata.status = DEPOSIT_STATUS_DEPOSITED + partial_deposit_with_metadata.save() + return partial_deposit_with_metadata + + +@pytest.fixture() +def deposited_deposit_only_metadata(partial_deposit_only_metadata): + partial_deposit_only_metadata.status = DEPOSIT_STATUS_DEPOSITED + partial_deposit_only_metadata.save() + return partial_deposit_only_metadata + + +def mock_http_requests(deposit, authenticated_client, requests_mock): + """Mock HTTP requests performed by deposit checker with responses + of django test client.""" + metadata_url = reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit.id]) + upload_urls_url = reverse(PRIVATE_GET_UPLOAD_URLS, args=[deposit.id]) + archive_urls = authenticated_client.get(upload_urls_url).json() + + if archive_urls: + archive_response = authenticated_client.get(archive_urls[0]) + # mock archive download + requests_mock.get( + archive_urls[0], content=b"".join(archive_response.streaming_content) + ) + + # mock requests to private deposit API by forwarding authenticated_client responses + for url in (metadata_url, upload_urls_url): + requests_mock.get(BASE_URL + url, json=authenticated_client.get(url).json()) + + def status_update(request, context): + authenticated_client.put( + put_deposit_url, content_type="application/json", data=request.body + ) + return None + + put_deposit_url = reverse(PRIVATE_PUT_DEPOSIT_NC, args=[deposit.id]) + requests_mock.put( + BASE_URL + put_deposit_url, + content=status_update, + ) + + +def test_checker_deposit_missing_metadata( + deposit_checker, + deposited_deposit, + authenticated_client, + requests_mock, +): + mock_http_requests(deposited_deposit, authenticated_client, requests_mock) + actual_result = deposit_checker.check( + collection="test", deposit_id=deposited_deposit.id + ) + assert actual_result == { + "status": "failed", + "status_detail": {"metadata": [{"summary": "Missing Atom document"}]}, + } + + +def test_checker_deposit_valid_metadata( + deposit_checker, + deposited_deposit_valid_metadata, + authenticated_client, + requests_mock, +): + mock_http_requests( + deposited_deposit_valid_metadata, + authenticated_client, + requests_mock, + ) + actual_result = deposit_checker.check( + collection="test", deposit_id=deposited_deposit_valid_metadata.id + ) + assert actual_result == { + "status": "eventful", + "status_detail": { + "metadata": [ + { + "fields": [METADATA_PROVENANCE_KEY], + "summary": SUGGESTED_FIELDS_MISSING, + }, + ] + }, + } + + +def test_checker_deposit_only_metadata( + deposit_checker, + deposited_deposit_only_metadata, + authenticated_client, + requests_mock, +): + mock_http_requests( + deposited_deposit_only_metadata, + authenticated_client, + requests_mock, + ) + actual_result = deposit_checker.check( + collection="test", deposit_id=deposited_deposit_only_metadata.id + ) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_MISSING}], + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ], + }, + } + + +def test_checker_deposit_exception_raised( + deposit_checker, + deposited_deposit_valid_metadata, + authenticated_client, + requests_mock, + mocker, +): + mock_http_requests( + deposited_deposit_valid_metadata, + authenticated_client, + requests_mock, + ) + mocker.patch("swh.deposit.loader.checker.check_metadata").side_effect = ValueError( + "Error when checking metadata" + ) + actual_result = deposit_checker.check( + collection="test", deposit_id=deposited_deposit_valid_metadata.id + ) + assert actual_result == { + "status": "failed", + "status_detail": { + "exception": "ValueError: Error when checking metadata", + }, + } + + +def test_checker_deposit_invalid_archive( + deposit_checker, + ready_deposit_invalid_archive, + authenticated_client, + requests_mock, +): + mock_http_requests( + ready_deposit_invalid_archive, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check( + collection="test", deposit_id=ready_deposit_invalid_archive.id + ) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_UNSUPPORTED}], + "metadata": [{"summary": "Missing Atom document"}], + }, + } + + +@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) +def test_deposit_ok( + tmp_path, + authenticated_client, + deposit_collection, + extension, + atom_dataset, + deposit_checker, + requests_mock, +): + """Proper deposit should succeed the checks (-> status ready)""" + deposit = create_deposit_with_archive( + tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset + ) + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "eventful", + "status_detail": { + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ] + }, + } + + +@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) +def test_deposit_invalid_tarball( + tmp_path, + authenticated_client, + deposit_collection, + extension, + atom_dataset, + requests_mock, + deposit_checker, +): + """Deposit with tarball (of 1 tarball) should fail the checks: rejected""" + deposit = create_deposit_archive_with_archive( + tmp_path, extension, authenticated_client, deposit_collection.name, atom_dataset + ) + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_INVALID}], + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ], + }, + } + + +def test_deposit_ko_missing_tarball( + authenticated_client, + ready_deposit_only_metadata, + requests_mock, + deposit_checker, +): + """Deposit without archive should fail the checks: rejected""" + deposit = ready_deposit_only_metadata + assert deposit.status == DEPOSIT_STATUS_DEPOSITED + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_MISSING}], + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ], + }, + } + + +def test_deposit_ko_unsupported_tarball( + authenticated_client, + ready_deposit_invalid_archive, + requests_mock, + deposit_checker, +): + """Deposit with unsupported tarball should fail checks and be rejected""" + deposit = ready_deposit_invalid_archive + assert DEPOSIT_STATUS_DEPOSITED == deposit.status + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_UNSUPPORTED}], + "metadata": [{"summary": "Missing Atom document"}], + }, + } + + +def test_deposit_ko_unsupported_tarball_prebasic_check( + tmp_path, + authenticated_client, + deposit_collection, + atom_dataset, + requests_mock, + deposit_checker, +): + """Deposit with unsupported tarball extension should fail checks and be rejected""" + + invalid_gz_mode = random.choice( + [f"{ext}-foobar" for ext in SUPPORTED_TARBALL_MODES] + ) + invalid_extension = f"tar.{invalid_gz_mode}" + + deposit = create_deposit_with_archive( + tmp_path, + invalid_extension, + authenticated_client, + deposit_collection.name, + atom_dataset, + ) + assert DEPOSIT_STATUS_DEPOSITED == deposit.status + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "failed", + "status_detail": { + "archive": [{"summary": MANDATORY_ARCHIVE_UNSUPPORTED}], + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ], + }, + } + + +def test_check_deposit_metadata_ok( + authenticated_client, + ready_deposit_ok, + requests_mock, + deposit_checker, +): + """Proper deposit should succeed the checks (-> status ready) + with all **MUST** metadata + + using the codemeta metadata test set + """ + deposit = ready_deposit_ok + assert deposit.status == DEPOSIT_STATUS_DEPOSITED + + mock_http_requests( + deposit, + authenticated_client, + requests_mock, + ) + + actual_result = deposit_checker.check(collection="test", deposit_id=deposit.id) + + assert actual_result == { + "status": "eventful", + "status_detail": { + "metadata": [ + { + "summary": SUGGESTED_FIELDS_MISSING, + "fields": [METADATA_PROVENANCE_KEY], + } + ] + }, + } diff --git a/swh/deposit/tests/loader/test_tasks.py b/swh/deposit/tests/loader/test_tasks.py index fe885f5c79c31a387aa775fa5c3831784b5f92df..5fbaa0f79ab81a33b3bc0c34bcb885a023d085f8 100644 --- a/swh/deposit/tests/loader/test_tasks.py +++ b/swh/deposit/tests/loader/test_tasks.py @@ -8,8 +8,8 @@ def test_task_check_eventful( mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker ): """Successful check should make the check succeed""" - client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") - client.return_value = "verified" + check = mocker.patch("swh.deposit.loader.checker.DepositChecker.check") + check.return_value = {"status": "eventful"} collection = "collection" deposit_id = 42 @@ -21,15 +21,14 @@ def test_task_check_eventful( assert res.successful() assert res.result == {"status": "eventful"} - client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") def test_task_check_failure( mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker ): """Unverified check status should make the check fail""" - client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") - client.return_value = "not-verified" # will make the status "failed" + check = mocker.patch("swh.deposit.loader.checker.DepositChecker.check") + check.return_value = {"status": "failed"} collection = "collec" deposit_id = 666 @@ -41,24 +40,3 @@ def test_task_check_failure( assert res.successful() assert res.result == {"status": "failed"} - client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") - - -def test_task_check_3( - mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker -): - """Unexpected failures should fail the check""" - client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") - client.side_effect = ValueError("unexpected failure will make it fail") - - collection = "another-collection" - deposit_id = 999 - res = swh_scheduler_celery_app.send_task( - "swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id] - ) - assert res - res.wait() - assert res.successful() - - assert res.result == {"status": "failed"} - client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") diff --git a/swh/deposit/urls.py b/swh/deposit/urls.py index 6127d448fcd74d475b3f36c51fc81055346d5de2..6860e80d0e3b441fe0141cfae6444f2592f3c011 100644 --- a/swh/deposit/urls.py +++ b/swh/deposit/urls.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2023 The Software Heritage developers +# Copyright (C) 2017-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,10 +8,12 @@ from __future__ import annotations from typing import Sequence, Union +from django.conf import settings from django.conf.urls import include from django.shortcuts import render from django.urls import re_path as url from django.views.generic.base import RedirectView +from django.views.static import serve from rest_framework.urlpatterns import format_suffix_patterns try: @@ -41,3 +43,14 @@ urlpatterns = format_suffix_patterns( url(r"^$", default_view, name="home"), ] ) + +if "AzureStorage" not in settings.STORAGES["default"]["BACKEND"]: + # to serve uploaded tarballs when no azure storage backend is configured, + # typically in docker or with development/test settings + urlpatterns.append( + url( + rf"^{settings.MEDIA_URL.rstrip('/').split('/')[-1]}/(?P<path>.*)$", + serve, + {"document_root": settings.MEDIA_ROOT}, + ) + )