From 84d685ca715fb9701b469d703c76acd936c1749b Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Fri, 12 Apr 2024 11:00:58 +0200 Subject: [PATCH] browse/utils: Fix error when charset_normalizer.detect fails The charset_normalizer.detect function can return a dict filled with None values when it fails to detect an encoding, so add a None check to avoid a runtime error when a content encoding cannot be detected. --- swh/web/browse/tests/views/test_content.py | 17 +++++++++++++++++ swh/web/browse/utils.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/swh/web/browse/tests/views/test_content.py b/swh/web/browse/tests/views/test_content.py index 2dedbc3d0..690a0b355 100644 --- a/swh/web/browse/tests/views/test_content.py +++ b/swh/web/browse/tests/views/test_content.py @@ -1290,3 +1290,20 @@ def test_browse_content_rate_limit(client, content_text, view_name): check_http_get_response(client, url, status_code=200) check_http_get_response(client, url, status_code=429) + + +def test_browse_content_failed_encoding_detection( + client, content_text_non_utf8, mocker +): + # simulate charset_normalizer.detect failure + detect = mocker.patch("charset_normalizer.detect") + detect.return_value = {"confidence": None, "encoding": None, "language": ""} + + url = reverse( + "browse-content", + url_args={"query_string": f"sha1_git:{content_text_non_utf8['sha1_git']}"}, + ) + + # content should be rendered even if encoding detection failed + check_http_get_response(client, url, status_code=200) + detect.assert_called() diff --git a/swh/web/browse/utils.py b/swh/web/browse/utils.py index 6b85c057c..b5032d5a3 100644 --- a/swh/web/browse/utils.py +++ b/swh/web/browse/utils.py @@ -109,7 +109,7 @@ def re_encode_content( if mimetype.startswith("text/") and encoding not in ("us-ascii", "utf-8"): # first check if charset_normalizer detects an encoding with confidence result = charset_normalizer.detect(content_data) - if 
cast(float, result["confidence"]) >= 0.9: + if result.get("confidence") and cast(float, result["confidence"]) >= 0.9: encoding = cast(str, result["encoding"]) content_data = content_data.decode(encoding, "replace").encode("utf-8") elif encoding == "unknown-8bit": -- GitLab