From 0a10ac37146d1670364cfd752d73ce2b20619cec Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Fri, 24 Nov 2023 12:12:48 +0100
Subject: [PATCH] azure blobstorage: Reset blob's content-encoding to allow check

This ensures the deposit checks in the elastic infra (using azure
objstorage) are working.

There is currently no way to prevent the content-encoding from being
detected and stored in the blobstorage (various tryouts have been tested
to no avail). That content-encoding then prevents the checking/reading
from working (failing with [2]).

Following the documentation, a workaround has been implemented to reset
the content-encoding a posteriori [1]. It's not proper but, from repl
experimentation, it's working, which is better than the current failure.

[1] https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blob-properties-metadata-python

[2]
```
HttpResponseError: ('Received response with content-encoding: gzip, but failed to decode it.', error('Error -3 while decompressing data: incorrect header check'))
```

Refs. swh/infra/sysadm-environment#5129
---
 mypy.ini                                 |  6 ++++
 swh/deposit/api/private/deposit_check.py | 37 ++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/mypy.ini b/mypy.ini
index cbe939b3..36098343 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -44,3 +44,9 @@ ignore_missing_imports = True
 
 [mypy-swh.storage.*]
 ignore_missing_imports = True
+
+[mypy-storages.*]
+ignore_missing_imports = True
+
+[mypy-azure.*]
+ignore_missing_imports = True
diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py
index d803173e..e24310aa 100644
--- a/swh/deposit/api/private/deposit_check.py
+++ b/swh/deposit/api/private/deposit_check.py
@@ -125,6 +125,11 @@ class APIChecks(APIPrivateView, APIGet, DepositReadMixin):
             # Use python's File api which is consistent across different types of
             # storage backends (e.g. file, azure, ...)
 
+            # No other workaround was found for azure blobstorage use; this is a noop
+            # otherwise
+            reset_content_settings_if_needed(archive)
+            # FIXME: ^ Implement a better way, after digging into django-storages[azure]
+
             with archive.open("rb") as archive_fp:
                 try:
                     with zipfile.ZipFile(archive_fp) as zip_fp:
@@ -214,3 +219,35 @@ class APIChecks(APIPrivateView, APIGet, DepositReadMixin):
         deposit.save()
 
         return status.HTTP_200_OK, response, "application/json"
+
+
+def reset_content_settings_if_needed(archive) -> None:
+    """Clear the content-encoding of the archive's blob on Azure blob storage.
+
+    A content-encoding (e.g. gzip) stored on the blob makes reading the archive
+    fail while decoding, so it is reset to "" before the archive gets opened. The
+    other content settings are kept. No-op when django-storages' azure backend is
+    not installed, the archive uses another storage backend, or nothing is set.
+
+    """
+    try:
+        from storages.backends.azure_storage import AzureStorage
+    except ImportError:
+        return None
+
+    if not isinstance(archive.storage, AzureStorage):
+        return None
+
+    from azure.storage.blob import ContentSettings
+
+    blob_client = archive.storage.client.get_blob_client(archive.name)
+    content_settings = dict(blob_client.get_blob_properties().content_settings)
+    if not content_settings.get("content_encoding"):
+        # Nothing to reset: avoid a needless write on the blob
+        return None
+
+    # Keep the existing settings as is, only blank out the content encoding
+    content_settings["content_encoding"] = ""
+    blob_headers = ContentSettings(**content_settings)
+
+    blob_client.set_http_headers(blob_headers)
-- 
GitLab