From 458ad28026b92db640f80dc4c75431c81016cb74 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Wed, 10 Apr 2024 11:59:05 +0200 Subject: [PATCH] browse/utils: Replace chardet use by charset-normalizer Chardet does not seem to be actively maintained and fails to properly detect some encodings, resulting in 500 errors when attempting to browse some contents. Use the charset-normalizer package instead, which is better at detecting encodings and is faster than chardet. --- requirements-test.txt | 1 - requirements.txt | 2 +- swh/web/browse/tests/test_utils.py | 2 +- swh/web/browse/utils.py | 15 ++++++++------- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index 7f6f1aceb..0b40c6497 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -12,7 +12,6 @@ swh.core[http] >= 3.0.0 swh.loader.git >= 0.8.0 swh-scheduler[testing] >= 2.0.0 swh.storage >= 0.1.1 -types-chardet types-cryptography types-docutils types-psycopg2 diff --git a/requirements.txt b/requirements.txt index 5947d21b2..295d55fbc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html beautifulsoup4 -chardet +charset-normalizer cryptography django django-cors-headers diff --git a/swh/web/browse/tests/test_utils.py b/swh/web/browse/tests/test_utils.py index 84dc14a36..28e50e3e4 100644 --- a/swh/web/browse/tests/test_utils.py +++ b/swh/web/browse/tests/test_utils.py @@ -104,7 +104,7 @@ def test_re_encode_content_for_shift_jis_encoding(): _, encoding, re_encoded_data = re_encode_content(mime_type, encoding, data) - assert encoding == "SHIFT_JIS" + assert encoding == "CP932" assert data.decode(encoding) == re_encoded_data.decode("utf-8") assert re_encoded_data.decode("utf-8") == "/* 関連の文字コード変換 */" diff --git a/swh/web/browse/utils.py b/swh/web/browse/utils.py index ce1b62d87..4289da613 100644 
--- a/swh/web/browse/utils.py +++ b/swh/web/browse/utils.py @@ -6,9 +6,9 @@ import base64 import stat import textwrap -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast -import chardet +import charset_normalizer import magic from django.utils.html import escape, format_html @@ -107,11 +107,12 @@ def re_encode_content( after processing), content raw bytes (possibly reencoded to UTF-8) """ if mimetype.startswith("text/") and encoding not in ("us-ascii", "utf-8"): - # first check if chardet detects an encoding with confidence - result = chardet.detect(content_data) - if result["confidence"] >= 0.9: - encoding = result["encoding"] - content_data = content_data.decode(encoding).encode("utf-8") + # first check if charset_normalizer detects an encoding with confidence + result = charset_normalizer.detect(content_data) + assert isinstance(result["confidence"], float) + if cast(float, result["confidence"]) >= 0.9: + encoding = cast(str, result["encoding"]) + content_data = content_data.decode(encoding, "replace").encode("utf-8") elif encoding == "unknown-8bit": # probably a malformed UTF-8 content, re-encode it # by replacing invalid chars with a substitution one -- GitLab