Skip to content
Snippets Groups Projects
Commit 458ad280 authored by Antoine Lambert
Browse files

browse/utils: Replace chardet with charset-normalizer

Chardet does not seem to be actively maintained and fails to properly detect
some encodings, resulting in 500 errors when attempting to browse some contents.

So use the charset-normalizer package instead, which is better at detecting
encodings and is also faster than chardet.
parent 5d126911
No related branches found
No related tags found
No related merge requests found
......@@ -12,7 +12,6 @@ swh.core[http] >= 3.0.0
swh.loader.git >= 0.8.0
swh-scheduler[testing] >= 2.0.0
swh.storage >= 0.1.1
types-chardet
types-cryptography
types-docutils
types-psycopg2
......
......@@ -3,7 +3,7 @@
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
beautifulsoup4
chardet
charset-normalizer
cryptography
django
django-cors-headers
......
......@@ -104,7 +104,7 @@ def test_re_encode_content_for_shift_jis_encoding():
_, encoding, re_encoded_data = re_encode_content(mime_type, encoding, data)
assert encoding == "SHIFT_JIS"
assert encoding == "CP932"
assert data.decode(encoding) == re_encoded_data.decode("utf-8")
assert re_encoded_data.decode("utf-8") == "/* 関連の文字コード変換 */"
......
......@@ -6,9 +6,9 @@
import base64
import stat
import textwrap
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
import chardet
import charset_normalizer
import magic
from django.utils.html import escape, format_html
......@@ -107,11 +107,12 @@ def re_encode_content(
after processing), content raw bytes (possibly reencoded to UTF-8)
"""
if mimetype.startswith("text/") and encoding not in ("us-ascii", "utf-8"):
# first check if chardet detects an encoding with confidence
result = chardet.detect(content_data)
if result["confidence"] >= 0.9:
encoding = result["encoding"]
content_data = content_data.decode(encoding).encode("utf-8")
# first check if charset_normalizer detects an encoding with confidence
result = charset_normalizer.detect(content_data)
assert isinstance(result["confidence"], float)
if cast(float, result["confidence"]) >= 0.9:
encoding = cast(str, result["encoding"])
content_data = content_data.decode(encoding, "replace").encode("utf-8")
elif encoding == "unknown-8bit":
# probably a malformed UTF-8 content, re-encode it
# by replacing invalid chars with a substitution one
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment