From 84d685ca715fb9701b469d703c76acd936c1749b Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Fri, 12 Apr 2024 11:00:58 +0200
Subject: [PATCH] browse/utils: Fix error when charset_normalizer.detect fails

The charset_normalizer.detect function can return a dict filled with
None values when it fails to detect an encoding, so add a None check
to avoid a runtime error when a content encoding cannot be
detected.
---
 swh/web/browse/tests/views/test_content.py | 17 +++++++++++++++++
 swh/web/browse/utils.py                    |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/swh/web/browse/tests/views/test_content.py b/swh/web/browse/tests/views/test_content.py
index 2dedbc3d0..690a0b355 100644
--- a/swh/web/browse/tests/views/test_content.py
+++ b/swh/web/browse/tests/views/test_content.py
@@ -1290,3 +1290,20 @@ def test_browse_content_rate_limit(client, content_text, view_name):
 
     check_http_get_response(client, url, status_code=200)
     check_http_get_response(client, url, status_code=429)
+
+
+def test_browse_content_failed_encoding_detection(
+    client, content_text_non_utf8, mocker
+):
+    # simulate a detection failure: charset_normalizer.detect returns None values
+    detect = mocker.patch("charset_normalizer.detect")
+    detect.return_value = {"confidence": None, "encoding": None, "language": ""}
+
+    url = reverse(
+        "browse-content",
+        url_args={"query_string": f"sha1_git:{content_text_non_utf8['sha1_git']}"},
+    )
+
+    # the content view must still answer with HTTP 200 when detection failed
+    check_http_get_response(client, url, status_code=200)
+    detect.assert_called()  # ensure the patched detector was actually exercised
diff --git a/swh/web/browse/utils.py b/swh/web/browse/utils.py
index 6b85c057c..b5032d5a3 100644
--- a/swh/web/browse/utils.py
+++ b/swh/web/browse/utils.py
@@ -109,7 +109,7 @@ def re_encode_content(
     if mimetype.startswith("text/") and encoding not in ("us-ascii", "utf-8"):
         # first check if charset_normalizer detects an encoding with confidence
         result = charset_normalizer.detect(content_data)
-        if cast(float, result["confidence"]) >= 0.9:
+        if result.get("confidence") and cast(float, result["confidence"]) >= 0.9:
             encoding = cast(str, result["encoding"])
             content_data = content_data.decode(encoding, "replace").encode("utf-8")
         elif encoding == "unknown-8bit":
-- 
GitLab