diff --git a/swh/objstorage/__init__.py b/swh/objstorage/__init__.py
--- a/swh/objstorage/__init__.py
+++ b/swh/objstorage/__init__.py
@@ -11,6 +11,8 @@
     MultiplexerObjStorage, StripingObjStorage)
 from swh.objstorage.multiplexer.filter import add_filters
 from swh.objstorage.backends.seaweed import WeedObjStorage
+from swh.objstorage.backends.generator import RandomGeneratorObjStorage
+
 
 __all__ = ['get_objstorage', 'ObjStorage']
 
@@ -20,6 +22,7 @@
     'remote': RemoteObjStorage,
     'memory': InMemoryObjStorage,
     'weed': WeedObjStorage,
+    'random': RandomGeneratorObjStorage,
 }
 
 _STORAGE_CLASSES_MISSING = {
diff --git a/swh/objstorage/backends/generator.py b/swh/objstorage/backends/generator.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/backends/generator.py
@@ -0,0 +1,126 @@
+from itertools import count, islice
+import random
+import io
+import functools
+import logging
+
+from swh.objstorage.objstorage import (
+    ObjStorage, DEFAULT_CHUNK_SIZE, DEFAULT_LIMIT)
+
+logger = logging.getLogger(__name__)
+
+
+class Randomizer:
+    def __init__(self):
+        self.size = 0
+        self.data = None
+
+    def read(self, size):
+        if size > self.size:
+            with open('/dev/urandom', 'rb') as fobj:
+                self.data = fobj.read(2*size)
+                self.size = len(self.data)
+        idx = random.randint(0, self.size - size - 1)
+        return self.data[idx:idx+size]
+
+
+def gen_filesize(total=None, filesize=None):
+    '''generates numbers according to the rough distribution of file sizes in
+    the SWH archive
+    '''
+    # these are the histogram bounds of the pg content.length column
+    bounds = [0, 2, 72, 119, 165, 208, 256, 300, 345, 383, 429, 474, 521, 572,
+              618, 676, 726, 779, 830, 879, 931, 992, 1054, 1119, 1183, 1244,
+              1302, 1370, 1437, 1504, 1576, 1652, 1725, 1806, 1883, 1968, 2045,
+              2133, 2236, 2338, 2433, 2552, 2659, 2774, 2905, 3049, 3190, 3322,
+              3489, 3667, 3834, 4013, 4217, 4361, 4562, 4779, 5008, 5233, 5502,
+              5788, 6088, 6396, 6728, 7094, 7457, 7835, 8244, 8758, 9233, 9757,
+              10313, 10981, 11693, 12391, 13237, 14048, 14932, 15846, 16842,
+              18051, 19487, 20949, 22595, 24337, 26590, 28840, 31604, 34653,
+              37982, 41964, 46260, 51808, 58561, 66584, 78645, 95743, 122883,
+              167016, 236108, 421057, 1047367, 55056238]
+
+    nbounds = len(bounds)
+    for i in count():
+        if total and i > total:
+            break
+        if filesize is not None:
+            yield filesize
+        else:
+            idx = random.randint(1, nbounds-1)
+            lower = bounds[idx-1]
+            upper = bounds[idx]
+            yield random.randint(lower, upper-1)
+
+
+def gen_random_file(total=None, filesize=None):
+    '''generates random file contents whose sizes roughly follow the SWH
+    archive file size distribution
+
+    '''
+    randomizer = Randomizer()
+    for objsize in gen_filesize(total, filesize):
+        yield randomizer.read(objsize)
+
+
+class RandomGeneratorObjStorage(ObjStorage):
+    '''A stupid read-only storage that generates blobs for testing purposes.
+    '''
+
+    def __init__(self, filesize=None, total=None, **kwargs):
+        super().__init__()
+        if filesize:
+            filesize = int(filesize)
+        self.filesize = filesize
+        if total:
+            total = int(total)
+        self.total = total
+        self._gen = None
+
+    @property
+    def gen(self):
+        if self._gen is None:
+            self._gen = gen_random_file(self.total, self.filesize)
+        return self._gen
+
+    def check_config(self, *, check_write):
+        return True
+
+    def __contains__(self, obj_id, *args, **kwargs):
+        return False
+
+    def __iter__(self):
+        i = 0
+        while True:
+            j = yield (b'%d' % i)
+            if self.total and i > self.total:
+                logger.debug('DONE')
+                break
+            if j is not None:
+                i = j
+            else:
+                i += 1
+
+    def get(self, obj_id, *args, **kwargs):
+        return next(self.gen)
+
+    def add(self, content, obj_id=None, check_presence=True, *args, **kwargs):
+        pass
+
+    def check(self, obj_id, *args, **kwargs):
+        return True
+
+    def delete(self, obj_id, *args, **kwargs):
+        return True
+
+    def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE):
+        data = io.BytesIO(next(self.gen))
+        reader = functools.partial(data.read, chunk_size)
+        yield from iter(reader, b'')
+
+    def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT):
+        it = iter(self)
+        if last_obj_id:
+            next(it)
+            it.send(int(last_obj_id))
+        return islice(it, limit)
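Usage sketch (reviewer note, not part of the patch, untested): once the backend is registered under the 'random' key it can also be reached through get_objstorage, but the snippet below instantiates it directly to keep the example self-contained. It assumes a POSIX system, since Randomizer pulls its bytes from /dev/urandom. Object ids are ignored by design: every get() returns a freshly generated blob and membership tests always fail.

    from itertools import islice

    from swh.objstorage.backends.generator import (
        RandomGeneratorObjStorage, gen_filesize,
    )

    # Fixed-size mode: every generated blob is exactly 1024 bytes.
    storage = RandomGeneratorObjStorage(filesize=1024, total=10)

    blob = storage.get(b'any-id-will-do')    # obj_id is ignored
    assert len(blob) == 1024
    assert b'any-id-will-do' not in storage  # __contains__ is always False

    # Without filesize, sizes are drawn from the archive-like histogram above.
    sizes = list(islice(gen_filesize(), 5))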
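A second sketch (also not part of the patch, untested) exercises the streaming and listing paths: get_stream() splits one generated blob into chunk_size pieces, and list_content() drives the send()-based __iter__ protocol to resume the synthetic id counter after last_obj_id.

    from swh.objstorage.backends.generator import RandomGeneratorObjStorage

    storage = RandomGeneratorObjStorage(filesize=4096, total=100)

    # One 4096-byte blob streamed as four 1024-byte chunks.
    chunks = list(storage.get_stream(b'whatever', chunk_size=1024))
    assert len(b''.join(chunks)) == 4096

    # Synthetic ids are b'0', b'1', ...; last_obj_id fast-forwards the counter.
    ids = list(storage.list_content(limit=3))                      # [b'0', b'1', b'2']
    more = list(storage.list_content(last_obj_id=b'41', limit=3))  # [b'42', b'43', b'44']

Passing last_obj_id through send() keeps the storage object itself stateless between calls: each list_content() builds a fresh generator and fast-forwards its counter rather than keeping a cursor on the instance.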