diff --git a/swh/objstorage/__init__.py b/swh/objstorage/__init__.py
--- a/swh/objstorage/__init__.py
+++ b/swh/objstorage/__init__.py
@@ -11,6 +11,8 @@
     MultiplexerObjStorage, StripingObjStorage)
 from swh.objstorage.multiplexer.filter import add_filters
 from swh.objstorage.backends.seaweed import WeedObjStorage
+from swh.objstorage.backends.generator import RandomGeneratorObjStorage
+
 
 __all__ = ['get_objstorage', 'ObjStorage']
 
@@ -20,6 +22,7 @@
     'remote': RemoteObjStorage,
     'memory': InMemoryObjStorage,
     'weed': WeedObjStorage,
+    'random': RandomGeneratorObjStorage,
 }
 
 _STORAGE_CLASSES_MISSING = {
diff --git a/swh/objstorage/backends/generator.py b/swh/objstorage/backends/generator.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/backends/generator.py
@@ -0,0 +1,163 @@
+from itertools import islice, repeat
+import random
+import io
+import functools
+import logging
+import os
+
+from swh.objstorage.objstorage import (
+    ObjStorage, DEFAULT_CHUNK_SIZE, DEFAULT_LIMIT)
+
+logger = logging.getLogger(__name__)
+
+
+class Randomizer:
+    '''Cheap source of random bytes backed by a reusable in-memory buffer.
+
+    The buffer is filled from the operating system's random source and is
+    only refilled when a request is larger than what it currently holds;
+    every read returns a randomly positioned slice of that buffer.
+    '''
+
+    def __init__(self):
+        self.size = 0
+        self.read(1024)  # create a not-so-small initial buffer
+
+    def read(self, size):
+        '''Return `size` random bytes.
+
+        Args:
+            size (int): number of bytes to return.
+
+        Returns:
+            bytes: a slice of length `size` taken from the internal buffer.
+
+        '''
+        if size > self.size:
+            # grow the buffer to twice the request so that later reads of a
+            # similar size can be served from different offsets
+            self.data = os.urandom(2*size)
+            self.size = len(self.data)
+        # pick a random subset of our existing buffer; the last valid start
+        # offset is (self.size - size), which is 0 when the request is
+        # exactly as large as the buffer
+        idx = random.randint(0, self.size - size)
+        return self.data[idx:idx+size]
+
+
+def gen_sizes():
+    '''generates numbers according to the rough distribution of file size in
+    the SWH archive
+    '''
+    # these are the histogram bounds of the pg content.length column
+    bounds = [0, 2, 72, 119, 165, 208, 256, 300, 345, 383, 429, 474, 521, 572,
+              618, 676, 726, 779, 830, 879, 931, 992, 1054, 1119, 1183, 1244,
+              1302, 1370, 1437, 1504, 1576, 1652, 1725, 1806, 1883, 1968, 2045,
+              2133, 2236, 2338, 2433, 2552, 2659, 2774, 2905, 3049, 3190, 3322,
+              3489, 3667, 3834, 4013, 4217, 4361, 4562, 4779, 5008, 5233, 5502,
+              5788, 6088, 6396, 6728, 7094, 7457, 7835, 8244, 8758, 9233, 9757,
+              10313, 10981, 11693, 12391, 13237, 14048, 14932, 15846, 16842,
+              18051, 19487, 20949, 22595, 24337, 26590, 28840, 31604, 34653,
+              37982, 41964, 46260, 51808, 58561, 66584, 78645, 95743, 122883,
+              167016, 236108, 421057, 1047367, 55056238]
+
+    nbounds = len(bounds)
+    while True:
+        # pick one histogram bucket uniformly, then a size within it
+        idx = random.randint(1, nbounds-1)
+        lower = bounds[idx-1]
+        upper = bounds[idx]
+        yield random.randint(lower, upper-1)
+
+
+def gen_random_content(total=None, filesize=None):
+    '''generates random (file) content which sizes roughly follows the SWH
+    archive file size distribution (by default).
+
+    Args:
+        total (int): the total number of objects to generate. Infinite if
+            unset.
+        filesize (int): generate objects with fixed size instead of random
+            ones.
+
+    '''
+    randomizer = Randomizer()
+    if filesize:
+        gen = repeat(filesize)
+    else:
+        gen = gen_sizes()
+    if total:
+        gen = islice(gen, total)
+    for objsize in gen:
+        yield randomizer.read(objsize)
+
+
+class RandomGeneratorObjStorage(ObjStorage):
+    '''A stupid read-only storage that generates blobs for testing purpose.
+    '''
+
+    def __init__(self, filesize=None, total=None, **kwargs):
+        super().__init__()
+        if filesize:
+            filesize = int(filesize)
+        self.filesize = filesize
+        if total:
+            total = int(total)
+        self.total = total
+        self._content_generator = None
+
+    @property
+    def content_generator(self):
+        if self._content_generator is None:
+            self._content_generator = gen_random_content(
+                self.total, self.filesize)
+        return self._content_generator
+
+    def check_config(self, *, check_write):
+        return True
+
+    def __contains__(self, obj_id, *args, **kwargs):
+        return False
+
+    def __iter__(self):
+        # object ids are the decimal representation of a counter starting
+        # at 1; a caller may send() an integer to reposition the counter
+        i = 1
+        while True:
+            j = yield (b'%d' % i)
+            if self.total and i >= self.total:
+                logger.debug('DONE')
+                break
+            if j is not None:
+                i = j
+            else:
+                i += 1
+
+    def get(self, obj_id, *args, **kwargs):
+        return next(self.content_generator)
+
+    def add(self, content, obj_id=None, check_presence=True, *args, **kwargs):
+        pass
+
+    def check(self, obj_id, *args, **kwargs):
+        return True
+
+    def delete(self, obj_id, *args, **kwargs):
+        return True
+
+    def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE):
+        data = io.BytesIO(next(self.content_generator))
+        reader = functools.partial(data.read, chunk_size)
+        yield from iter(reader, b'')
+
+    def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT):
+        it = iter(self)
+        if last_obj_id:
+            try:
+                next(it)
+                it.send(int(last_obj_id))
+            except StopIteration:
+                # the id generator is already exhausted (``total`` reached):
+                # there is nothing left to list after ``last_obj_id``
+                return iter(())
+        return islice(it, limit)
diff --git a/swh/objstorage/tests/test_objstorage_random_generator.py b/swh/objstorage/tests/test_objstorage_random_generator.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/tests/test_objstorage_random_generator.py
@@ -0,0 +1,62 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections.abc import Iterator
+
+from swh.objstorage import get_objstorage
+from swh.objstorage.backends.generator import Randomizer
+
+
+def test_randomizer_read_whole_buffer():
+    randomizer = Randomizer()
+    # asking for exactly the buffered amount must not fail
+    assert len(randomizer.read(randomizer.size)) == randomizer.size
+
+
+def test_random_generator_objstorage():
+    sto = get_objstorage('random', {})
+    assert sto
+
+    blobs = [sto.get(None) for i in range(100)]
+    lengths = [len(x) for x in blobs]
+    assert max(lengths) <= 55056238
+
+
+def test_random_generator_objstorage_get_stream():
+    sto = get_objstorage('random', {})
+    gen = sto.get_stream(None)
+    assert isinstance(gen, Iterator)
+    # ensure the iterator can be consumed; a generated blob may legitimately
+    # be empty, in which case the stream yields no chunk at all
+    chunks = list(gen)
+    assert all(isinstance(chunk, bytes) for chunk in chunks)
+
+
+def test_random_generator_objstorage_list_content():
+    sto = get_objstorage('random', {'total': 100})
+    assert isinstance(sto.list_content(), Iterator)
+
+    assert list(sto.list_content()) == \
+        [b'%d' % i for i in range(1, 101)]
+    assert list(sto.list_content(limit=10)) == \
+        [b'%d' % i for i in range(1, 11)]
+    assert list(sto.list_content(last_obj_id=b'10', limit=10)) == \
+        [b'%d' % i for i in range(11, 21)]
+
+
+def test_random_generator_objstorage_list_content_exhausted():
+    sto = get_objstorage('random', {'total': 1})
+    assert list(sto.list_content(last_obj_id=b'1')) == []
+
+
+def test_random_generator_objstorage_total():
+    sto = get_objstorage('random', {'total': 5})
+    assert len([x for x in sto]) == 5
+
+
+def test_random_generator_objstorage_size():
+    sto = get_objstorage('random', {'filesize': 10})
+    for i in range(10):
+        assert len(sto.get(None)) == 10