Skip to content
Snippets Groups Projects
Commit 38204c22 authored by Antoine Lambert's avatar Antoine Lambert
Browse files

browse/utils: Reinstate chardet use but only as a fallback

While charset_normalizer usually performs better than chardet, there
remain some edge cases where it fails to detect the content encoding
while chardet succeeds.

So try to detect the content encoding with chardet as a fallback when
charset_normalizer fails.
parent 2d123083
No related branches found
No related tags found
No related merge requests found
......@@ -37,7 +37,8 @@ repos:
(?x)^(
cypress/integration/directory.spec.js|
yarn.lock|
package.json
package.json|
swh/web/browse/tests/data/content_iso-8859-7_encoded
)$
args: [-L edn, -L crate]
stages: [commit]
......
......@@ -2,6 +2,7 @@
# should match https://pypi.python.org/pypi names. For the full spec or
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
chardet
charset-normalizer
cryptography
django
......
This diff is collapsed.
......@@ -3,6 +3,7 @@
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import re
import pytest
......@@ -109,6 +110,14 @@ def test_re_encode_content_for_shift_jis_encoding():
assert re_encoded_data.decode("utf-8") == "/* 関連の文字コード変換 */"
def test_re_encode_content_chardet_fallback(datadir):
    """Re-encoding the ISO-8859-7 fixture must report ISO-8859-7 as encoding.

    The fixture file is read as raw bytes from ``datadir``, its mime type and
    encoding are detected, and the encoding returned by ``re_encode_content``
    is checked. Per the test name, this is meant to exercise the chardet
    fallback path of ``re_encode_content``.
    """
    fixture_path = os.path.join(datadir, "content_iso-8859-7_encoded")
    with open(fixture_path, "rb") as fixture_file:
        raw_bytes = fixture_file.read()

    detected_mime_type, detected_encoding = get_mimetype_and_encoding_for_content(
        raw_bytes
    )
    _, final_encoding, _ = re_encode_content(
        detected_mime_type, detected_encoding, raw_bytes
    )

    assert final_encoding == "ISO-8859-7"
@pytest.mark.parametrize(
"input_,expected_output",
[
......
......@@ -8,6 +8,7 @@ import stat
import textwrap
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
import chardet
import charset_normalizer
import magic
......@@ -112,6 +113,10 @@ def re_encode_content(
if result.get("confidence") and cast(float, result["confidence"]) >= 0.9:
encoding = cast(str, result["encoding"])
content_data = content_data.decode(encoding, "replace").encode("utf-8")
# then try to detect encoding with chardet if the above failed
elif (cresult := chardet.detect(content_data)).get("confidence", 0) >= 0.9:
encoding = cast(str, cresult["encoding"])
content_data = content_data.decode(encoding, "replace").encode("utf-8")
elif encoding == "unknown-8bit":
# probably a malformed UTF-8 content, re-encode it
# by replacing invalid chars with a substitution one
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment