diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -12,6 +12,7 @@ swh.loader.git >= 0.8.0 swh-scheduler[testing] >= 0.5.0 swh.storage >= 0.1.1 +types-chardet types-docutils types-psycopg2 types-pyyaml diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html beautifulsoup4 +chardet cryptography django < 3 django-cors-headers diff --git a/swh/web/browse/utils.py b/swh/web/browse/utils.py --- a/swh/web/browse/utils.py +++ b/swh/web/browse/utils.py @@ -6,7 +6,9 @@ import base64 import stat import textwrap +from typing import Tuple +import chardet import magic import sentry_sdk @@ -91,12 +93,30 @@ content_display_max_size = get_config()["content_display_max_size"] -def _re_encode_content(mimetype, encoding, content_data): - # encode textual content to utf-8 if needed - if mimetype.startswith("text/"): - # probably a malformed UTF-8 content, re-encode it - # by replacing invalid chars with a substitution one - if encoding == "unknown-8bit": +def re_encode_content( + mimetype: str, encoding: str, content_data: bytes +) -> Tuple[str, str, bytes]: + """Try to re-encode textual content if it is not encoded to UTF-8 + for proper display in the browse Web UI. 
+ + Args: + mimetype: content mimetype as detected by python-magic + encoding: content encoding as detected by python-magic + content_data: raw content bytes + + Returns: + A tuple with 3 members: content mimetype, content encoding (possibly updated + after processing), content raw bytes (possibly re-encoded to UTF-8) + """ + if mimetype.startswith("text/") and encoding not in ("us-ascii", "utf-8"): + # first check if chardet detects an encoding with confidence + result = chardet.detect(content_data) + if result["confidence"] >= 0.9: + encoding = result["encoding"] + content_data = content_data.decode(encoding).encode("utf-8") + elif encoding == "unknown-8bit": + # probably malformed UTF-8 content, re-encode it + # by replacing invalid chars with a substitution one content_data = content_data.decode("utf-8", "replace").encode("utf-8") elif encoding not in ["utf-8", "binary"]: content_data = content_data.decode(encoding, "replace").encode("utf-8") @@ -176,7 +196,7 @@ ) if re_encode: - mimetype, encoding, raw_data = _re_encode_content( + mimetype, encoding, raw_data = re_encode_content( mimetype, encoding, content_data["raw_data"] ) content_data["raw_data"] = raw_data diff --git a/swh/web/tests/browse/test_utils.py b/swh/web/tests/browse/test_utils.py --- a/swh/web/tests/browse/test_utils.py +++ b/swh/web/tests/browse/test_utils.py @@ -11,6 +11,7 @@ gen_revision_link, get_mimetype_and_encoding_for_content, prepare_content_for_display, + re_encode_content, ) from swh.web.common.utils import reverse @@ -87,3 +88,14 @@ content_data=b"", mime_type="", path=path ) assert content_display["language"] == expected_language + + +def test_re_encode_content_for_shift_jis_encoding(): + data = b"/* \x8a\xd6\x98A\x82\xcc\x95\xb6\x8e\x9a\x83R\x81[\x83h\x95\xcf\x8a\xb7 */" + mime_type, encoding = get_mimetype_and_encoding_for_content(data) + + _, encoding, re_encoded_data = re_encode_content(mime_type, encoding, data) + + assert encoding == "SHIFT_JIS" + assert data.decode(encoding) 
== re_encoded_data.decode("utf-8") + assert re_encoded_data.decode("utf-8") == "/* 関連の文字コード変換 */" diff --git a/swh/web/tests/browse/views/test_content.py b/swh/web/tests/browse/views/test_content.py --- a/swh/web/tests/browse/views/test_content.py +++ b/swh/web/tests/browse/views/test_content.py @@ -16,9 +16,9 @@ from swh.model.swhids import ObjectType from swh.web.browse.snapshot_context import process_snapshot_branches from swh.web.browse.utils import ( - _re_encode_content, get_mimetype_and_encoding_for_content, prepare_content_for_display, + re_encode_content, ) from swh.web.common.exc import NotFoundExc from swh.web.common.identifiers import gen_swhid @@ -632,7 +632,7 @@ mime_type, encoding = get_mimetype_and_encoding_for_content(content_data["data"]) - mime_type, encoding, content_data = _re_encode_content( + mime_type, encoding, content_data = re_encode_content( mime_type, encoding, content_data["data"] ) diff --git a/swh/web/tests/data.py b/swh/web/tests/data.py --- a/swh/web/tests/data.py +++ b/swh/web/tests/data.py @@ -37,9 +37,9 @@ from swh.storage.utils import now from swh.web import config from swh.web.browse.utils import ( - _re_encode_content, get_mimetype_and_encoding_for_content, prepare_content_for_display, + re_encode_content, ) from swh.web.common import archive @@ -407,7 +407,7 @@ cnt_data = storage.content_get_data(content.sha1) assert cnt_data is not None mimetype, encoding = get_mimetype_and_encoding_for_content(cnt_data) - _, _, cnt_data = _re_encode_content(mimetype, encoding, cnt_data) + _, _, cnt_data = re_encode_content(mimetype, encoding, cnt_data) content_display_data = prepare_content_for_display(cnt_data, mimetype, path)