Page MenuHomeSoftware Heritage

D1303.id4143.diff
No OneTemporary

D1303.id4143.diff

diff --git a/swh/objstorage/__init__.py b/swh/objstorage/__init__.py
--- a/swh/objstorage/__init__.py
+++ b/swh/objstorage/__init__.py
@@ -11,6 +11,8 @@
MultiplexerObjStorage, StripingObjStorage)
from swh.objstorage.multiplexer.filter import add_filters
from swh.objstorage.backends.seaweed import WeedObjStorage
+from swh.objstorage.backends.generator import RandomGeneratorObjStorage
+
__all__ = ['get_objstorage', 'ObjStorage']
@@ -20,6 +22,7 @@
'remote': RemoteObjStorage,
'memory': InMemoryObjStorage,
'weed': WeedObjStorage,
+ 'random': RandomGeneratorObjStorage,
}
_STORAGE_CLASSES_MISSING = {
diff --git a/swh/objstorage/backends/generator.py b/swh/objstorage/backends/generator.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/backends/generator.py
@@ -0,0 +1,135 @@
+from itertools import count, islice, repeat
+import random
+import io
+import functools
+import logging
+
+from swh.objstorage.objstorage import (
+ ObjStorage, DEFAULT_CHUNK_SIZE, DEFAULT_LIMIT)
+
+logger = logging.getLogger(__name__)
+
+
+class Randomizer:
+    '''Cheap source of random byte strings.
+
+    A buffer of random bytes is read from ``/dev/urandom`` and kept in
+    ``self.data``; each call to ``read`` returns a randomly positioned slice
+    of that buffer, so successive results may overlap. The buffer is only
+    re-read when a request is larger than what is currently buffered.
+    '''
+
+    def __init__(self):
+        self.size = 0
+        self.read(1024)  # create a not-so-small initial buffer
+
+    def read(self, size):
+        '''Return ``size`` bytes sliced out of the random buffer.
+
+        Args:
+            size (int): number of bytes to return.
+
+        Returns:
+            bytes: a slice of the internal buffer starting at a random
+            offset.
+        '''
+        if size > self.size:
+            # the request does not fit: refill with twice the requested size
+            with open('/dev/urandom', 'rb') as fobj:
+                self.data = fobj.read(2*size)
+            self.size = len(self.data)
+        # pick a random subset of our existing buffer; randint() includes
+        # both ends, so the last valid start offset is self.size - size
+        # (a bound of self.size - size - 1 raises ValueError as soon as
+        # size == self.size and never returns the tail of the buffer)
+        idx = random.randint(0, self.size - size)
+        return self.data[idx:idx+size]
+
+
+def gen_sizes():
+    '''Endlessly yield sizes following the rough distribution of file sizes
+    in the SWH archive.
+
+    Each value is obtained by picking one histogram bucket uniformly at
+    random, then a size uniformly within that bucket (the bucket's upper
+    bound is excluded).
+    '''
+    # these are the histogram bounds of the pg content.length column
+    bounds = [0, 2, 72, 119, 165, 208, 256, 300, 345, 383, 429, 474, 521, 572,
+              618, 676, 726, 779, 830, 879, 931, 992, 1054, 1119, 1183, 1244,
+              1302, 1370, 1437, 1504, 1576, 1652, 1725, 1806, 1883, 1968, 2045,
+              2133, 2236, 2338, 2433, 2552, 2659, 2774, 2905, 3049, 3190, 3322,
+              3489, 3667, 3834, 4013, 4217, 4361, 4562, 4779, 5008, 5233, 5502,
+              5788, 6088, 6396, 6728, 7094, 7457, 7835, 8244, 8758, 9233, 9757,
+              10313, 10981, 11693, 12391, 13237, 14048, 14932, 15846, 16842,
+              18051, 19487, 20949, 22595, 24337, 26590, 28840, 31604, 34653,
+              37982, 41964, 46260, 51808, 58561, 66584, 78645, 95743, 122883,
+              167016, 236108, 421057, 1047367, 55056238]
+
+    last_bucket = len(bounds) - 1
+    while True:
+        bucket = random.randint(1, last_bucket)
+        low, high = bounds[bucket-1], bounds[bucket]
+        yield random.randint(low, high-1)
+
+
+def gen_random_content(total=None, filesize=None):
+    '''Yield random (file) contents whose sizes, by default, roughly follow
+    the SWH archive file size distribution.
+
+    Args:
+        total (int): the total number of objects to generate. Infinite if
+            unset.
+        filesize (int): generate objects with fixed size instead of random
+            ones.
+
+    Yields:
+        bytes: one random blob per requested size.
+
+    '''
+    source = Randomizer()
+    # fixed size when requested, archive-like random sizes otherwise
+    sizes = repeat(filesize) if filesize else gen_sizes()
+    if total:
+        # cap the (otherwise endless) stream of sizes
+        sizes = islice(sizes, total)
+    yield from map(source.read, sizes)
+
+
+class RandomGeneratorObjStorage(ObjStorage):
+    '''A stupid read-only storage that generates blobs for testing purpose.
+
+    Object ids are ignored on reads: every ``get``/``get_stream`` call
+    returns the next blob produced by ``gen_random_content``. Writes are
+    discarded, and membership tests always answer False.
+
+    Args:
+        filesize (int): if set, every generated blob has this size;
+            otherwise sizes are chosen by the content generator.
+        total (int): if set, the number of ids produced by iteration and the
+            number of blobs the content generator yields; unlimited
+            otherwise.
+    '''
+
+    def __init__(self, filesize=None, total=None, **kwargs):
+        super().__init__()
+        # values may come from a configuration file as strings
+        if filesize:
+            filesize = int(filesize)
+        self.filesize = filesize
+        if total:
+            total = int(total)
+        self.total = total
+        self._content_generator = None
+
+    @property
+    def content_generator(self):
+        '''The blob generator, created on first access and then reused.'''
+        if self._content_generator is None:
+            self._content_generator = gen_random_content(
+                self.total, self.filesize)
+        return self._content_generator
+
+    def check_config(self, *, check_write):
+        '''Always report a valid configuration.'''
+        return True
+
+    def __contains__(self, obj_id, *args, **kwargs):
+        '''No object is ever reported as present.'''
+        return False
+
+    def __iter__(self):
+        '''Yield the object ids b'1', b'2', ... (up to ``total`` if set).
+
+        A value passed with ``send()`` replaces the current counter and is
+        yielded back as the result of that ``send()`` call, so iteration
+        then resumes right after it.
+        '''
+        i = 1
+        while True:
+            j = yield (b'%d' % i)
+            if self.total and i >= self.total:
+                logger.debug('DONE')
+                break
+            if j is not None:
+                i = j
+            else:
+                i += 1
+
+    def get(self, obj_id, *args, **kwargs):
+        '''Return the next generated blob; ``obj_id`` is ignored.
+
+        Raises:
+            StopIteration: once the content generator is exhausted.
+        '''
+        return next(self.content_generator)
+
+    def add(self, content, obj_id=None, check_presence=True, *args, **kwargs):
+        '''Discard the content: nothing is stored.'''
+        pass
+
+    def check(self, obj_id, *args, **kwargs):
+        '''Always report the object as valid.'''
+        return True
+
+    def delete(self, obj_id, *args, **kwargs):
+        '''Pretend the deletion succeeded.'''
+        return True
+
+    def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE):
+        '''Yield the next generated blob in pieces of at most ``chunk_size``
+        bytes; ``obj_id`` is ignored.
+
+        The stream is empty when the blob is empty or when the content
+        generator is exhausted.
+        '''
+        try:
+            content = next(self.content_generator)
+        except StopIteration:
+            # Letting StopIteration escape from a generator body is turned
+            # into RuntimeError on Python >= 3.7 (PEP 479); end the stream
+            # cleanly instead.
+            return
+        data = io.BytesIO(content)
+        reader = functools.partial(data.read, chunk_size)
+        yield from iter(reader, b'')
+
+    def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT):
+        '''Return an iterator over at most ``limit`` object ids.
+
+        Args:
+            last_obj_id: if set, listing starts right after this id.
+            limit (int): maximum number of ids to return.
+        '''
+        it = iter(self)
+        if last_obj_id:
+            try:
+                # a generator must be started before it accepts send()
+                next(it)
+                # reposition the counter; the id echoed back is dropped
+                it.send(int(last_obj_id))
+            except StopIteration:
+                # the id generator ended while repositioning: nothing is
+                # left to list
+                return iter(())
+        return islice(it, limit)
diff --git a/swh/objstorage/tests/test_objstorage_random_generator.py b/swh/objstorage/tests/test_objstorage_random_generator.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/tests/test_objstorage_random_generator.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections.abc import Iterator
+from swh.objstorage import get_objstorage
+
+
+def test_random_generator_objstorage():
+    '''Blobs served by a default 'random' objstorage stay within the
+    largest size bound.'''
+    storage = get_objstorage('random', {})
+    assert storage
+
+    sizes = []
+    for _ in range(100):
+        sizes.append(len(storage.get(None)))
+    assert max(sizes) <= 55056238
+
+
+def test_random_generator_objstorage_get_stream():
+    '''get_stream() returns an iterator of bytes chunks that can be fully
+    consumed.'''
+    sto = get_objstorage('random', {})
+    gen = sto.get_stream(None)
+    assert isinstance(gen, Iterator)
+    # ensure the iterator can be consumed; the generated blob may
+    # legitimately be empty (size 0 can be drawn), so do not require at
+    # least one chunk
+    chunks = list(gen)
+    assert all(isinstance(chunk, bytes) for chunk in chunks)
+
+
+def test_random_generator_objstorage_list_content():
+    '''list_content() honours the total, limit and last_obj_id settings.'''
+    storage = get_objstorage('random', {'total': 100})
+    assert isinstance(storage.list_content(), Iterator)
+
+    def ids(start, stop):
+        # expected ids for the half-open range [start, stop)
+        return [b'%d' % i for i in range(start, stop)]
+
+    everything = list(storage.list_content())
+    assert everything == ids(1, 101)
+
+    first_ten = list(storage.list_content(limit=10))
+    assert first_ten == ids(1, 11)
+
+    next_ten = list(storage.list_content(last_obj_id=b'10', limit=10))
+    assert next_ten == ids(11, 21)
+
+
+def test_random_generator_objstorage_total():
+    '''Iterating the storage yields exactly `total` ids.'''
+    storage = get_objstorage('random', {'total': 5})
+    listed = 0
+    for _ in storage:
+        listed += 1
+    assert listed == 5
+
+
+def test_random_generator_objstorage_size():
+    '''With a fixed `filesize`, every blob has exactly that length.'''
+    storage = get_objstorage('random', {'filesize': 10})
+    remaining = 10
+    while remaining:
+        blob = storage.get(None)
+        assert len(blob) == 10
+        remaining -= 1

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 10:16 AM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222750

Event Timeline