diff --git a/swh/objstorage/backends/libcloud.py b/swh/objstorage/backends/libcloud.py
--- a/swh/objstorage/backends/libcloud.py
+++ b/swh/objstorage/backends/libcloud.py
@@ -142,7 +142,17 @@
 
     def get(self, obj_id):
         obj = b''.join(self._get_object(obj_id).as_stream())
-        return decompressors[self.compression](obj)
+        d = decompressors[self.compression]()
+        ret = d.decompress(obj)
+        if not d.eof:
+            # Streaming decompressors return partial output on a cut-short
+            # stream instead of raising like the one-shot functions did.
+            hex_obj_id = hashutil.hash_to_hex(obj_id)
+            raise Error('Corrupt object %s: truncated data' % hex_obj_id)
+        if d.unused_data:
+            hex_obj_id = hashutil.hash_to_hex(obj_id)
+            raise Error('Corrupt object %s: trailing data found' % hex_obj_id)
+        return ret
 
     def check(self, obj_id):
         # Check that the file exists, as _get_object raises ObjNotFoundError
diff --git a/swh/objstorage/backends/seaweed.py b/swh/objstorage/backends/seaweed.py
--- a/swh/objstorage/backends/seaweed.py
+++ b/swh/objstorage/backends/seaweed.py
@@ -149,10 +149,21 @@
     def get(self, obj_id):
         try:
             obj = self.wf.get(self._path(obj_id))
-            return decompressors[self.compression](obj)
         except Exception:
             raise ObjNotFoundError(obj_id)
 
+        d = decompressors[self.compression]()
+        ret = d.decompress(obj)
+        if not d.eof:
+            # Streaming decompressors return partial output on a cut-short
+            # stream instead of raising like the one-shot functions did.
+            hex_obj_id = hashutil.hash_to_hex(obj_id)
+            raise Error('Corrupt object %s: truncated data' % hex_obj_id)
+        if d.unused_data:
+            hex_obj_id = hashutil.hash_to_hex(obj_id)
+            raise Error('Corrupt object %s: trailing data found' % hex_obj_id)
+        return ret
+
     def check(self, obj_id):
         # Check the content integrity
         obj_content = self.get(obj_id)
diff --git a/swh/objstorage/objstorage.py b/swh/objstorage/objstorage.py
--- a/swh/objstorage/objstorage.py
+++ b/swh/objstorage/objstorage.py
@@ -6,7 +6,6 @@
 import abc
 from itertools import dropwhile, islice
 import bz2
-import gzip
 import lzma
 import zlib
 
@@ -46,13 +45,33 @@
         return b''
 
 
+class NullDecompressor:
+    """No-op decompressor used when objects are stored uncompressed.
+
+    Exposes the attributes callers read from the stdlib decompressor
+    objects so that all entries of `decompressors` are interchangeable.
+    """
+
+    def decompress(self, data):
+        return data
+
+    @property
+    def unused_data(self):
+        return b''
+
+    @property
+    def eof(self):
+        # Raw data has no end-of-stream marker, so it is never truncated.
+        return True
+
+
 decompressors = {
-    'bz2': bz2.decompress,
-    'lzma': lzma.decompress,
-    'gzip': gzip.decompress,
-    'zlib': zlib.decompress,
-    None: lambda x: x,
-    }
+    'bz2': bz2.BZ2Decompressor,
+    'lzma': lzma.LZMADecompressor,
+    'gzip': lambda: zlib.decompressobj(wbits=31),
+    'zlib': zlib.decompressobj,
+    None: NullDecompressor,
+}
 
 compressors = {
     'bz2': bz2.BZ2Compressor,
diff --git a/swh/objstorage/tests/test_objstorage_cloud.py b/swh/objstorage/tests/test_objstorage_cloud.py
--- a/swh/objstorage/tests/test_objstorage_cloud.py
+++ b/swh/objstorage/tests/test_objstorage_cloud.py
@@ -111,7 +111,10 @@
         obj_id = hashutil.hash_to_hex(obj_id)
         raw_content = b''.join(data[obj_id].content)
 
-        assert decompressors[self.compression](raw_content) == content
+        d = decompressors[self.compression]()
+        assert d.decompress(raw_content) == content
+        assert d.unused_data == b''
+        assert d.eof
 
 
 class TestCloudObjStorageBz2(TestCloudObjStorage):