From 0a10ac37146d1670364cfd752d73ce2b20619cec Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Fri, 24 Nov 2023 12:12:48 +0100
Subject: [PATCH] azure blobstorage: Reset blob's content-encoding to allow
 check

This ensures the deposit checks in the elastic infra (using azure objstorage) are
working. There is currently no way to prevent the content-encoding from being detected
and stored in the blobstorage (various approaches have been tried to no avail). That
content-encoding then prevents the checking/reading from working (it fails with [2]).

Following the documentation, a workaround has been implemented to reset a posteriori the
content-encoding [1].

It's not a proper fix but, from repl experimentation, it works, which is better than
the current failure.

[1] https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blob-properties-metadata-python

[2]
```
HttpResponseError: ('Received response with content-encoding: gzip, but failed to decode it.', error('Error -3 while decompressing data: incorrect header check'))
```

Refs. swh/infra/sysadm-environment#5129
---
 mypy.ini                                 |  6 ++++
 swh/deposit/api/private/deposit_check.py | 37 ++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/mypy.ini b/mypy.ini
index cbe939b3..36098343 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -44,3 +44,9 @@ ignore_missing_imports = True
 
 [mypy-swh.storage.*]
 ignore_missing_imports = True
+
+[mypy-storages.*]
+ignore_missing_imports = True
+
+[mypy-azure.*]
+ignore_missing_imports = True
diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py
index d803173e..e24310aa 100644
--- a/swh/deposit/api/private/deposit_check.py
+++ b/swh/deposit/api/private/deposit_check.py
@@ -125,6 +125,11 @@ class APIChecks(APIPrivateView, APIGet, DepositReadMixin):
             # Use python's File api which is consistent across different types of
             # storage backends (e.g. file, azure, ...)
 
+            # I did not find any other workaround for azure blobstorage use; this is
+            # a noop for other storage backends
+            reset_content_settings_if_needed(archive)
+            # FIXME: ^ Implement a better way after digging into django-storages[azure]
+
             with archive.open("rb") as archive_fp:
                 try:
                     with zipfile.ZipFile(archive_fp) as zip_fp:
@@ -214,3 +219,35 @@ class APIChecks(APIPrivateView, APIGet, DepositReadMixin):
         deposit.save()
 
         return status.HTTP_200_OK, response, "application/json"
+
+
+def reset_content_settings_if_needed(archive) -> None:
+    """Reset the content-encoding of the azure blob backing ``archive``. A stored
+    content-encoding prevents the archive from being read back, which makes the
+    checks fail for no good reason. No-op for any other storage backend.
+
+    """
+    try:
+        from storages.backends.azure_storage import AzureStorage
+    except ImportError:
+        return None
+
+    if not isinstance(archive.storage, AzureStorage):
+        return None
+
+    from azure.storage.blob import ContentSettings
+
+    blob_client = archive.storage.client.get_blob_client(archive.name)
+
+    # Get the existing blob properties
+    properties = blob_client.get_blob_properties()
+
+    # Copy the existing content settings, blanking only the content encoding
+    content_settings = dict(properties.content_settings)
+    content_settings["content_encoding"] = ""
+
+    # Rebuild the headers from the copied settings so that every other property
+    # (e.g. content type) is sent back with its existing value
+    blob_headers = ContentSettings(**content_settings)
+
+    blob_client.set_http_headers(blob_headers)
-- 
GitLab