Page MenuHomeSoftware Heritage

D43.diff
No OneTemporary

D43.diff

diff --git a/swh/core/tests/test_utils.py b/swh/core/tests/test_utils.py
--- a/swh/core/tests/test_utils.py
+++ b/swh/core/tests/test_utils.py
@@ -31,3 +31,37 @@
out.append(list(d)) # force generator resolution for checks
self.assertEqual(out, [[9, 8, 7, 6], [5, 4, 3, 2], [1]])
+
+    @istest
+    def backslashescape_errors(self):
+        # Exercise the 'backslashescape' codec error handler in three
+        # situations: invalid bytes on decode, valid bytes on decode, and an
+        # unencodable character on encode.
+        raw_data_err = b'abcd\x80'
+        # Sanity check: the input is rejected by a strict utf-8 decode.
+        with self.assertRaises(UnicodeDecodeError):
+            raw_data_err.decode('utf-8', 'strict')
+
+        # assertEqual is used instead of the deprecated assertEquals alias,
+        # which no longer exists in unittest on Python 3.12+.
+        self.assertEqual(
+            raw_data_err.decode('utf-8', 'backslashescape'),
+            'abcd\\x80',
+        )
+
+        # Valid utf-8 must decode exactly as it does in strict mode.
+        raw_data_ok = b'abcd\xc3\xa9'
+        self.assertEqual(
+            raw_data_ok.decode('utf-8', 'backslashescape'),
+            raw_data_ok.decode('utf-8', 'strict'),
+        )
+
+        # On encode, the unencodable character is written as an escape.
+        unicode_data = 'abcdef\u00a3'
+        self.assertEqual(
+            unicode_data.encode('ascii', 'backslashescape'),
+            b'abcdef\\xa3',
+        )
+
+    @istest
+    def decode_invalid(self):
+        # The input holds two 0xff bytes, which are not valid utf-8; each one
+        # is expected back as a literal backslash escape.
+        raw = b'my invalid \xff \xff string'
+
+        decoded = utils.decode_with_escape(raw)
+
+        self.assertEqual(decoded, 'my invalid \\xff \\xff string')
diff --git a/swh/core/utils.py b/swh/core/utils.py
--- a/swh/core/utils.py
+++ b/swh/core/utils.py
@@ -5,6 +5,7 @@
import itertools
+import codecs
def grouper(iterable, n):
@@ -22,3 +23,25 @@
args = [iter(iterable)] * n
for _data in itertools.zip_longest(*args, fillvalue=None):
yield (d for d in _data if d is not None)
+
+
+def backslashescape_errors(exception):
+    """Codec error handler that writes undecodable bytes as ``\\xNN`` text.
+
+    For a UnicodeDecodeError, each byte in the offending range
+    (``exception.start`` to ``exception.end``) is replaced by a backslash,
+    an ``x`` and two lowercase hex digits, and processing resumes at
+    ``exception.end``. Any other exception is handed to
+    ``codecs.backslashreplace_errors``.
+
+    Returns a ``(replacement, resume_position)`` tuple.
+    """
+    if not isinstance(exception, UnicodeDecodeError):
+        return codecs.backslashreplace_errors(exception)
+
+    offending = exception.object[exception.start:exception.end]
+    pieces = [r'\x%02x' % byte for byte in offending]
+    return ''.join(pieces), exception.end
+
+
+# Expose the handler to encode()/decode() calls under the name
+# 'backslashescape'.
+codecs.register_error('backslashescape', backslashescape_errors)
+
+
+def decode_with_escape(value):
+    """Decode a bytestring as utf-8, escaping what cannot be kept verbatim.
+
+    Bytes of invalid utf-8 sequences are passed to the 'backslashescape'
+    error handler, i.e. written as \\x<hex value>. NUL bytes are written as
+    \\x00 as well, because they are invalid in JSON strings. Backslashes
+    already present in the input are doubled first.
+    """
+    # Order matters: double the existing backslashes before adding the
+    # backslash that belongs to the NUL escape.
+    escaped = value.replace(b'\\', b'\\\\').replace(b'\x00', b'\\x00')
+    return escaped.decode('utf-8', 'backslashescape')

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:21 PM (5 d, 22 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219026

Event Timeline