browse/utils: Replace chardet use with charset-normalizer

Chardet does not seem to be actively maintained and fails to properly detect some encodings, resulting in 500 errors when attempting to browse some contents. Use the charset-normalizer package instead, which is better at detecting encodings while also being faster than chardet.

browse/utils: Replace chardet use with charset-normalizer
Chardet does not seem to be actively maintained and fails to properly detect some encodings, resulting in 500 errors when attempting to browse some contents. Use the charset-normalizer package instead, which is better at detecting encodings while also being faster than chardet.
458ad280 · Antoine Lambert · 5d126911 · 458ad280 · 458ad280 · 458ad280
Commit 458ad280 authored 11 months ago by Antoine Lambert
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -12,7 +12,6 @@ swh.core[http] >= 3.0.0
 swh.loader.git >= 0.8.0
 swh-scheduler[testing] >= 2.0.0
 swh.storage >= 0.1.1
-types-chardet
 types-cryptography
 types-docutils
 types-psycopg2

--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html

 beautifulsoup4
-chardet
+charset-normalizer
 cryptography
 django
 django-cors-headers

--- a/swh/web/browse/tests/test_utils.py
+++ b/swh/web/browse/tests/test_utils.py
@@ -104,7 +104,7 @@ def test_re_encode_content_for_shift_jis_encoding():

    _, encoding, re_encoded_data = re_encode_content(mime_type, encoding, data)

-    assert encoding == "SHIFT_JIS"
+    assert encoding == "CP932"
    assert data.decode(encoding) == re_encoded_data.decode("utf-8")
    assert re_encoded_data.decode("utf-8") == "/* 関連の文字コード変換 */"


--- a/swh/web/browse/utils.py
+++ b/swh/web/browse/utils.py
@@ -6,9 +6,9 @@
 import base64
 import stat
 import textwrap
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast

-import chardet
+import charset_normalizer
 import magic

 from django.utils.html import escape, format_html
@@ -107,11 +107,12 @@ def re_encode_content(
        after processing), content raw bytes (possibly reencoded to UTF-8)
    """
    if mimetype.startswith("text/") and encoding not in ("us-ascii", "utf-8"):
-        # first check if chardet detects an encoding with confidence
-        result = chardet.detect(content_data)
-        if result["confidence"] >= 0.9:
-            encoding = result["encoding"]
-            content_data = content_data.decode(encoding).encode("utf-8")
+        # first check if charset_normalizer detects an encoding with confidence
+        result = charset_normalizer.detect(content_data)
+        assert isinstance(result["confidence"], float)
+        if cast(float, result["confidence"]) >= 0.9:
+            encoding = cast(str, result["encoding"])
+            content_data = content_data.decode(encoding, "replace").encode("utf-8")
        elif encoding == "unknown-8bit":
            # probably a malformed UTF-8 content, re-encode it
            # by replacing invalid chars with a substitution one