# ===========================================================================
# File: swh/objstorage/__init__.py
# ===========================================================================
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.objstorage.objstorage import ObjStorage, ID_HASH_LENGTH  # noqa
from swh.objstorage.backends.pathslicing import PathSlicingObjStorage
from swh.objstorage.backends.in_memory import InMemoryObjStorage
from swh.objstorage.api.client import RemoteObjStorage
from swh.objstorage.multiplexer import (
    MultiplexerObjStorage, StripingObjStorage)
from swh.objstorage.multiplexer.filter import add_filters
from swh.objstorage.backends.seaweed import WeedObjStorage

from swh.objstorage.backends.generator import RandomGeneratorObjStorage

__all__ = ['get_objstorage', 'ObjStorage']

# Maps the configuration key of a backend to the callable building it.
_STORAGE_CLASSES = {
    'pathslicing': PathSlicingObjStorage,
    'remote': RemoteObjStorage,
    'memory': InMemoryObjStorage,
    'weed': WeedObjStorage,
    'random': RandomGeneratorObjStorage,
}

# Maps the key of a backend that failed to import to the import error
# message, so get_objstorage() can explain why the backend is unavailable.
_STORAGE_CLASSES_MISSING = {
}

try:
    from swh.objstorage.backends.azure import (
        AzureCloudObjStorage,
        PrefixedAzureCloudObjStorage,
    )
    _STORAGE_CLASSES['azure'] = AzureCloudObjStorage
    _STORAGE_CLASSES['azure-prefixed'] = PrefixedAzureCloudObjStorage
except ImportError as e:
    _STORAGE_CLASSES_MISSING['azure'] = e.args[0]
    _STORAGE_CLASSES_MISSING['azure-prefixed'] = e.args[0]

try:
    from swh.objstorage.backends.rados import RADOSObjStorage
    _STORAGE_CLASSES['rados'] = RADOSObjStorage
except ImportError as e:
    _STORAGE_CLASSES_MISSING['rados'] = e.args[0]

try:
    from swh.objstorage.backends.libcloud import (
        AwsCloudObjStorage,
        OpenStackCloudObjStorage,
    )
    _STORAGE_CLASSES['s3'] = AwsCloudObjStorage
    _STORAGE_CLASSES['swift'] = OpenStackCloudObjStorage
except ImportError as e:
    _STORAGE_CLASSES_MISSING['s3'] = e.args[0]
    _STORAGE_CLASSES_MISSING['swift'] = e.args[0]


def get_objstorage(cls, args):
    """ Create an ObjStorage using the given implementation class.

    Args:
        cls (str): objstorage class unique key contained in the
            _STORAGE_CLASSES dict.
        args (dict): arguments for the required class of objstorage
            that must match exactly the one in the `__init__` method of the
            class.
    Returns:
        the object built by the factory registered under `cls`.
    Raises:
        ValueError: if the given storage class is not a valid objstorage
            key; the message carries the import error when the backend is
            known but could not be imported.
    """
    if cls not in _STORAGE_CLASSES:
        raise ValueError('Storage class {} is not available: {}'.format(
            cls, _STORAGE_CLASSES_MISSING.get(cls, 'unknown name')))
    return _STORAGE_CLASSES[cls](**args)


def _construct_filtered_objstorage(storage_conf, filters_conf):
    # Build the wrapped storage from its own configuration, then apply the
    # configured filters on top of it.
    return add_filters(
        get_objstorage(**storage_conf),
        filters_conf
    )


_STORAGE_CLASSES['filtered'] = _construct_filtered_objstorage


def _construct_multiplexer_objstorage(objstorages):
    # Each entry of `objstorages` is a get_objstorage() keyword mapping.
    storages = [get_objstorage(**conf)
                for conf in objstorages]
    return MultiplexerObjStorage(storages)


_STORAGE_CLASSES['multiplexer'] = _construct_multiplexer_objstorage


def _construct_striping_objstorage(objstorages):
    # Each entry of `objstorages` is a get_objstorage() keyword mapping.
    storages = [get_objstorage(**conf)
                for conf in objstorages]
    return StripingObjStorage(storages)


_STORAGE_CLASSES['striping'] = _construct_striping_objstorage


# ===========================================================================
# File: swh/objstorage/backends/generator.py (new file)
# ===========================================================================
from itertools import count, islice, repeat
import random
import io
import functools
import logging

from swh.objstorage.objstorage import (
    ObjStorage, DEFAULT_CHUNK_SIZE, DEFAULT_LIMIT)

logger = logging.getLogger(__name__)


class Randomizer:
    '''Cheap provider of random byte strings.

    A buffer is filled from /dev/urandom and every read returns a randomly
    positioned slice of that buffer. The buffer is only refilled (with twice
    the requested amount) when a request is larger than the current buffer.
    '''
    def __init__(self):
        self.size = 0
        self.data = b''
        self.read(1024)  # create a not-so-small initial buffer

    def read(self, size):
        '''Return `size` bytes taken at a random offset of the buffer.

        Args:
            size (int): number of bytes wanted.

        Returns:
            bytes: a slice of the internal buffer, `size` bytes long.
        '''
        if size > self.size:
            with open('/dev/urandom', 'rb') as fobj:
                self.data = fobj.read(2*size)
            self.size = len(self.data)
        # pick a random subset of our existing buffer; randint() includes
        # both bounds, so the last valid start offset is `self.size - size`.
        # (Subtracting one more made a request of exactly the buffer size
        # ask for the empty range [0, -1] and raise ValueError.)
        idx = random.randint(0, self.size - size)
        return self.data[idx:idx+size]


def gen_sizes():
    '''generates numbers according to the rough distribution of file size in
    the SWH archive

    This is an endless generator: a histogram bucket is picked uniformly,
    then a size is picked uniformly inside that bucket (upper bound
    excluded).
    '''
    # these are the histogram bounds of the pg content.length column
    bounds = [0, 2, 72, 119, 165, 208, 256, 300, 345, 383, 429, 474, 521, 572,
              618, 676, 726, 779, 830, 879, 931, 992, 1054, 1119, 1183, 1244,
              1302, 1370, 1437, 1504, 1576, 1652, 1725, 1806, 1883, 1968, 2045,
              2133, 2236, 2338, 2433, 2552, 2659, 2774, 2905, 3049, 3190, 3322,
              3489, 3667, 3834, 4013, 4217, 4361, 4562, 4779, 5008, 5233, 5502,
              5788, 6088, 6396, 6728, 7094, 7457, 7835, 8244, 8758, 9233, 9757,
              10313, 10981, 11693, 12391, 13237, 14048, 14932, 15846, 16842,
              18051, 19487, 20949, 22595, 24337, 26590, 28840, 31604, 34653,
              37982, 41964, 46260, 51808, 58561, 66584, 78645, 95743, 122883,
              167016, 236108, 421057, 1047367, 55056238]

    nbounds = len(bounds)
    for _ in count():
        idx = random.randint(1, nbounds-1)
        lower = bounds[idx-1]
        upper = bounds[idx]
        yield random.randint(lower, upper-1)


def gen_random_content(total=None, filesize=None):
    '''generates random (file) content which sizes roughly follows the SWH
    archive file size distribution (by default).

    Args:
        total (int): the total number of objects to generate. Infinite if
            unset.
        filesize (int): generate objects with fixed size instead of random
            ones.

    Yields:
        bytes: one blob per generated object.
    '''
    randomizer = Randomizer()
    if filesize:
        gen = repeat(filesize)
    else:
        gen = gen_sizes()
    if total:
        gen = islice(gen, total)
    for objsize in gen:
        yield randomizer.read(objsize)


class RandomGeneratorObjStorage(ObjStorage):
    '''A stupid read-only storage that generates blobs for testing purpose.

    Object ids are ignored by the read methods: every get() / get_stream()
    call consumes the next blob of one shared content generator.

    Args:
        filesize: if set, every generated blob has this size (converted
            with int()).
        total: if set, the number of objects the storage exposes (converted
            with int()).
    '''

    def __init__(self, filesize=None, total=None, **kwargs):
        super().__init__()
        if filesize:
            filesize = int(filesize)
        self.filesize = filesize
        if total:
            total = int(total)
        self.total = total
        self._content_generator = None

    @property
    def content_generator(self):
        '''The blob generator, created on first access and then reused.'''
        if self._content_generator is None:
            self._content_generator = gen_random_content(
                self.total, self.filesize)
        return self._content_generator

    def check_config(self, *, check_write):
        return True

    def __contains__(self, obj_id, *args, **kwargs):
        return False

    def __iter__(self):
        '''Yield the object ids b'1', b'2', ... as decimal byte strings.

        When `total` is set, iteration stops once an id greater than or
        equal to it has been yielded. An integer passed with send() becomes
        the next id yielded (it is the value send() returns).
        '''
        i = 1
        while True:
            j = yield (b'%d' % i)
            if self.total and i >= self.total:
                logger.debug('DONE')
                break
            if j is not None:
                i = j
            else:
                i += 1

    def get(self, obj_id, *args, **kwargs):
        return next(self.content_generator)

    def add(self, content, obj_id=None, check_presence=True, *args, **kwargs):
        # read-only storage: additions are silently discarded
        pass

    def check(self, obj_id, *args, **kwargs):
        return True

    def delete(self, obj_id, *args, **kwargs):
        return True

    def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE):
        '''Yield the next generated blob in chunks of at most `chunk_size`
        bytes. A zero-length blob yields no chunk at all.
        '''
        data = io.BytesIO(next(self.content_generator))
        reader = functools.partial(data.read, chunk_size)
        yield from iter(reader, b'')

    def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT):
        '''Return an iterator over at most `limit` object ids.

        Args:
            last_obj_id: if set, listing starts right after this id (it is
                converted with int()).
            limit: maximum number of ids produced.

        Returns:
            an iterator of ids; empty when `last_obj_id` is given and the
            id sequence is already finished.
        '''
        it = iter(self)
        if last_obj_id:
            next(it)  # prime the generator so it can receive a value
            try:
                # reposition the generator; the id it returns here is
                # `last_obj_id` itself, which must not be listed again
                it.send(int(last_obj_id))
            except StopIteration:
                # `total` was already reached: do not leak StopIteration to
                # the caller, the islice below is simply empty
                pass
        return islice(it, limit)


# ===========================================================================
# File: swh/objstorage/tests/test_objstorage_random_generator.py (new file)
# ===========================================================================
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# The ABC aliases in `collections` were removed in Python 3.10; the
# supported location is `collections.abc`.
from collections.abc import Iterator
from swh.objstorage import get_objstorage
+ +def test_random_generator_objstorage(): + sto = get_objstorage('random', {}) + assert sto + + blobs = [sto.get(None) for i in range(100)] + lengths = [len(x) for x in blobs] + assert max(lengths) <= 55056238 + + +def test_random_generator_objstorage_get_stream(): + sto = get_objstorage('random', {}) + gen = sto.get_stream(None) + assert isinstance(gen, Iterator) + assert list(gen) # ensure the iterator can be consumed + + +def test_random_generator_objstorage_list_content(): + sto = get_objstorage('random', {'total': 100}) + assert isinstance(sto.list_content(), Iterator) + + assert list(sto.list_content()) == \ + [b'%d' % i for i in range(1, 101)] + assert list(sto.list_content(limit=10)) == \ + [b'%d' % i for i in range(1, 11)] + assert list(sto.list_content(last_obj_id=b'10', limit=10)) == \ + [b'%d' % i for i in range(11, 21)] + + +def test_random_generator_objstorage_total(): + sto = get_objstorage('random', {'total': 5}) + assert len([x for x in sto]) == 5 + + +def test_random_generator_objstorage_size(): + sto = get_objstorage('random', {'filesize': 10}) + for i in range(10): + assert len(sto.get(None)) == 10