browse/utils: Reinstate chardet use but only as a fallback

While charset_normalizer usually performs better than chardet, there remain some edge cases where it fails to detect the content encoding while chardet succeeds. So try to detect the content encoding with chardet as a fallback when charset_normalizer fails.

browse/utils: Reinstate chardet use but only as a fallback
While charset_normalizer usually performs better than chardet, there remain some edge cases where it fails to detect the content encoding while chardet succeeds. So try to detect the content encoding with chardet as a fallback when charset_normalizer fails.
38204c22 · Antoine Lambert · 2d123083 · 38204c22 · 38204c22 · 38204c22
Commit 38204c22 authored 11 months ago by Antoine Lambert
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,7 +37,8 @@ repos:
          (?x)^(
              cypress/integration/directory.spec.js|
              yarn.lock|
-              package.json
+              package.json|
+              swh/web/browse/tests/data/content_iso-8859-7_encoded
          )$
        args: [-L edn, -L crate]
        stages: [commit]

--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@
 # should match https://pypi.python.org/pypi names. For the full spec or
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html

+chardet
 charset-normalizer
 cryptography
 django

--- a/swh/web/browse/tests/data/content_iso-8859-7_encoded
+++ b/swh/web/browse/tests/data/content_iso-8859-7_encoded
--- a/swh/web/browse/tests/test_utils.py
+++ b/swh/web/browse/tests/test_utils.py
@@ -3,6 +3,7 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information

+import os
 import re

 import pytest
@@ -109,6 +110,14 @@ def test_re_encode_content_for_shift_jis_encoding():
    assert re_encoded_data.decode("utf-8") == "/* 関連の文字コード変換 */"


+def test_re_encode_content_chardet_fallback(datadir):
+    with open(os.path.join(datadir, "content_iso-8859-7_encoded"), "rb") as f:
+        content = f.read()
+        mime_type, encoding = get_mimetype_and_encoding_for_content(content)
+        _, encoding, _ = re_encode_content(mime_type, encoding, content)
+        assert encoding == "ISO-8859-7"
+
+
 @pytest.mark.parametrize(
    "input_,expected_output",
    [

--- a/swh/web/browse/utils.py
+++ b/swh/web/browse/utils.py
@@ -8,6 +8,7 @@ import stat
 import textwrap
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast

+import chardet
 import charset_normalizer
 import magic

@@ -112,6 +113,10 @@ def re_encode_content(
        if result.get("confidence") and cast(float, result["confidence"]) >= 0.9:
            encoding = cast(str, result["encoding"])
            content_data = content_data.decode(encoding, "replace").encode("utf-8")
+        # then try to detect encoding with chardet if the above failed
+        elif (cresult := chardet.detect(content_data)).get("confidence", 0) >= 0.9:
+            encoding = cast(str, cresult["encoding"])
+            content_data = content_data.decode(encoding, "replace").encode("utf-8")
        elif encoding == "unknown-8bit":
            # probably a malformed UTF-8 content, re-encode it
            # by replacing invalid chars with a substitution one