diff --git a/swh/web/browse/utils.py b/swh/web/browse/utils.py --- a/swh/web/browse/utils.py +++ b/swh/web/browse/utils.py @@ -129,19 +129,19 @@ elif mimetype.startswith('application/octet-stream'): # file may detect a text content as binary # so try to decode it for display - encodings = ['us-ascii'] + encodings = ['us-ascii', 'utf-8'] encodings += ['iso-8859-%s' % i for i in range(1, 17)] - for encoding in encodings: + for enc in encodings: try: - content_data = content_data.decode(encoding)\ - .encode('utf-8') + content_data = content_data.decode(enc).encode('utf-8') except Exception: pass else: # ensure display in content view + encoding = enc mimetype = 'text/plain' break - return mimetype, content_data + return mimetype, encoding, content_data def request_content(query_string, max_size=content_display_max_size, @@ -215,7 +215,7 @@ get_mimetype_and_encoding_for_content(content_data['raw_data']) # noqa if re_encode: - mimetype, raw_data = _re_encode_content( + mimetype, encoding, raw_data = _re_encode_content( mimetype, encoding, content_data['raw_data']) content_data['raw_data'] = raw_data diff --git a/swh/web/templates/includes/content-display.html b/swh/web/templates/includes/content-display.html --- a/swh/web/templates/includes/content-display.html +++ b/swh/web/templates/includes/content-display.html @@ -40,7 +40,8 @@ {% elif content %} - Content with mime type {{ swh_object_metadata.mimetype }} can not be displayed. + Content with mime type {{ swh_object_metadata.mimetype }} and encoding + {{ swh_object_metadata.encoding }} can not be displayed. {% else %} {% include "includes/http-error.html" %} {% endif %} diff --git a/swh/web/tests/browse/views/test_content.py b/swh/web/tests/browse/views/test_content.py --- a/swh/web/tests/browse/views/test_content.py +++ b/swh/web/tests/browse/views/test_content.py @@ -19,7 +19,8 @@ ) from swh.web.tests.strategies import ( content, content_text_non_utf8, content_text_no_highlight, - content_image_type, content_text, invalid_sha1, unknown_content + content_image_type, content_text, invalid_sha1, unknown_content, + content_utf8_detected_as_binary ) @@ -356,14 +357,26 @@ assert resp['location'] == redirect_url +@given(content_utf8_detected_as_binary()) +def test_content_utf8_detected_as_binary_display(client, archive_data, + content): + url = reverse('browse-content', + url_args={'query_string': content['sha1']}) + resp = client.get(url) + + content_display = _process_content_for_display(archive_data, content) + + assert_contains(resp, escape(content_display['content_data'])) + + def _process_content_for_display(archive_data, content): content_data = archive_data.content_get(content['sha1']) mime_type, encoding = get_mimetype_and_encoding_for_content( content_data['data']) - mime_type, content_data = _re_encode_content(mime_type, encoding, - content_data['data']) + mime_type, encoding, content_data = _re_encode_content( + mime_type, encoding, content_data['data']) return prepare_content_for_display(content_data, mime_type, content['path']) diff --git a/swh/web/tests/strategies.py b/swh/web/tests/strategies.py --- a/swh/web/tests/strategies.py +++ b/swh/web/tests/strategies.py @@ -123,6 +123,24 @@ return content().filter(lambda c: c['mimetype'].startswith('image/')) +def content_utf8_detected_as_binary(): + """ + Hypothesis strategy returning random textual contents detected as binary + by libmagic while they are valid UTF-8 encoded files. + """ + def utf8_binary_detected(content): + if content['encoding'] != 'binary': + return False + try: + content['data'].decode('utf-8') + except Exception: + return False + else: + return True + + return content().filter(utf8_binary_detected) + + @composite def new_content(draw): blake2s256_hex = draw(sha256())