From 458ad28026b92db640f80dc4c75431c81016cb74 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Wed, 10 Apr 2024 11:59:05 +0200
Subject: [PATCH] browse/utils: Replace chardet use by charset-normalizer

Chardet does not seem to be actively maintained and fails to properly
detect some encodings, resulting in 500 errors when attempting to browse
some contents.

So use the charset-normalizer package instead, which is better at
detecting encodings while also being faster than chardet.
---
 requirements-test.txt              |  1 -
 requirements.txt                   |  2 +-
 swh/web/browse/tests/test_utils.py |  2 +-
 swh/web/browse/utils.py            | 15 ++++++++-------
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 7f6f1aceb..0b40c6497 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -12,7 +12,6 @@ swh.core[http] >= 3.0.0
 swh.loader.git >= 0.8.0
 swh-scheduler[testing] >= 2.0.0
 swh.storage >= 0.1.1
-types-chardet
 types-cryptography
 types-docutils
 types-psycopg2
diff --git a/requirements.txt b/requirements.txt
index 5947d21b2..295d55fbc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
 
 beautifulsoup4
-chardet
+charset-normalizer
 cryptography
 django
 django-cors-headers
diff --git a/swh/web/browse/tests/test_utils.py b/swh/web/browse/tests/test_utils.py
index 84dc14a36..28e50e3e4 100644
--- a/swh/web/browse/tests/test_utils.py
+++ b/swh/web/browse/tests/test_utils.py
@@ -104,7 +104,7 @@ def test_re_encode_content_for_shift_jis_encoding():
 
     _, encoding, re_encoded_data = re_encode_content(mime_type, encoding, data)
 
-    assert encoding == "SHIFT_JIS"
+    assert encoding == "CP932"
     assert data.decode(encoding) == re_encoded_data.decode("utf-8")
     assert re_encoded_data.decode("utf-8") == "/* 関連の文字コード変換 */"
 
diff --git a/swh/web/browse/utils.py b/swh/web/browse/utils.py
index ce1b62d87..4289da613 100644
--- a/swh/web/browse/utils.py
+++ b/swh/web/browse/utils.py
@@ -6,9 +6,9 @@
 import base64
 import stat
 import textwrap
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
 
-import chardet
+import charset_normalizer
 import magic
 
 from django.utils.html import escape, format_html
@@ -107,11 +107,12 @@ def re_encode_content(
         after processing), content raw bytes (possibly reencoded to UTF-8)
     """
     if mimetype.startswith("text/") and encoding not in ("us-ascii", "utf-8"):
-        # first check if chardet detects an encoding with confidence
-        result = chardet.detect(content_data)
-        if result["confidence"] >= 0.9:
-            encoding = result["encoding"]
-            content_data = content_data.decode(encoding).encode("utf-8")
+        # first check if charset_normalizer detects an encoding with confidence
+        result = charset_normalizer.detect(content_data)
+        assert isinstance(result["confidence"], float)
+        if cast(float, result["confidence"]) >= 0.9:
+            encoding = cast(str, result["encoding"])
+            content_data = content_data.decode(encoding, "replace").encode("utf-8")
         elif encoding == "unknown-8bit":
             # probably a malformed UTF-8 content, re-encode it
             # by replacing invalid chars with a substitution one
-- 
GitLab