Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163555
D1303.id4143.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
D1303.id4143.diff
View Options
diff --git a/swh/objstorage/__init__.py b/swh/objstorage/__init__.py
--- a/swh/objstorage/__init__.py
+++ b/swh/objstorage/__init__.py
@@ -11,6 +11,8 @@
MultiplexerObjStorage, StripingObjStorage)
from swh.objstorage.multiplexer.filter import add_filters
from swh.objstorage.backends.seaweed import WeedObjStorage
+from swh.objstorage.backends.generator import RandomGeneratorObjStorage
+
__all__ = ['get_objstorage', 'ObjStorage']
@@ -20,6 +22,7 @@
'remote': RemoteObjStorage,
'memory': InMemoryObjStorage,
'weed': WeedObjStorage,
+ 'random': RandomGeneratorObjStorage,
}
_STORAGE_CLASSES_MISSING = {
diff --git a/swh/objstorage/backends/generator.py b/swh/objstorage/backends/generator.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/backends/generator.py
@@ -0,0 +1,135 @@
+from itertools import count, islice, repeat
+import random
+import io
+import functools
+import logging
+
+from swh.objstorage.objstorage import (
+ ObjStorage, DEFAULT_CHUNK_SIZE, DEFAULT_LIMIT)
+
+logger = logging.getLogger(__name__)
+
+
+class Randomizer:
+    '''Cheap source of pseudo-random byte strings.
+
+    A buffer of random bytes is read from /dev/urandom and kept in memory.
+    Each call to read() returns a randomly positioned slice of that buffer
+    instead of fresh random bytes; the buffer is only re-read (at twice the
+    requested size) when a request is larger than the current buffer.
+    '''
+
+    def __init__(self):
+        self.data = b''
+        self.size = 0
+        self.read(1024)  # create a not-so-small initial buffer
+
+    def read(self, size):
+        '''Return `size` bytes sliced from the in-memory random buffer.
+
+        Args:
+            size (int): number of bytes wanted.
+
+        Returns:
+            bytes: a randomly positioned slice of the buffer.
+        '''
+        if size > self.size:
+            with open('/dev/urandom', 'rb') as fobj:
+                self.data = fobj.read(2*size)
+            self.size = len(self.data)
+        # pick a random subset of our existing buffer. randint() includes
+        # both ends, so the last valid start offset is self.size - size;
+        # subtracting one more made a request for the whole buffer raise
+        # ValueError (empty range) and never selected the final window.
+        idx = random.randint(0, self.size - size)
+        return self.data[idx:idx+size]
+
+
+def gen_sizes():
+    '''Endlessly yield random sizes following the rough distribution of
+    file sizes in the SWH archive.
+
+    A histogram bucket is chosen uniformly at random, then a size is drawn
+    uniformly from within that bucket (upper bound excluded).
+    '''
+    # these are the histogram bounds of the pg content.length column
+    bounds = [0, 2, 72, 119, 165, 208, 256, 300, 345, 383, 429, 474, 521, 572,
+              618, 676, 726, 779, 830, 879, 931, 992, 1054, 1119, 1183, 1244,
+              1302, 1370, 1437, 1504, 1576, 1652, 1725, 1806, 1883, 1968, 2045,
+              2133, 2236, 2338, 2433, 2552, 2659, 2774, 2905, 3049, 3190, 3322,
+              3489, 3667, 3834, 4013, 4217, 4361, 4562, 4779, 5008, 5233, 5502,
+              5788, 6088, 6396, 6728, 7094, 7457, 7835, 8244, 8758, 9233, 9757,
+              10313, 10981, 11693, 12391, 13237, 14048, 14932, 15846, 16842,
+              18051, 19487, 20949, 22595, 24337, 26590, 28840, 31604, 34653,
+              37982, 41964, 46260, 51808, 58561, 66584, 78645, 95743, 122883,
+              167016, 236108, 421057, 1047367, 55056238]
+
+    last_bucket = len(bounds) - 1
+    while True:
+        bucket = random.randint(1, last_bucket)
+        low, high = bounds[bucket-1], bounds[bucket]
+        yield random.randint(low, high-1)
+
+
+def gen_random_content(total=None, filesize=None):
+    '''Generate random (file) contents whose sizes roughly follow the SWH
+    archive file size distribution (by default).
+
+    Args:
+        total (int): the total number of objects to generate. Infinite if
+            unset.
+        filesize (int): generate objects with fixed size instead of random
+            ones.
+
+    Yields:
+        bytes: the content of one generated object.
+    '''
+    source = Randomizer()
+    sizes = repeat(filesize) if filesize else gen_sizes()
+    if total:
+        # bound the otherwise endless stream of sizes
+        sizes = islice(sizes, total)
+    for size in sizes:
+        yield source.read(size)
+
+
+class RandomGeneratorObjStorage(ObjStorage):
+ '''A stupid read-only storage that generates blobs for testing purposes.
+
+ Object ids are ignored when reading: get() and get_stream() each consume
+ the next blob of one shared random content generator. add() does nothing
+ and __contains__() always answers False.
+ '''
+
+ # filesize and total are converted with int() when truthy and stored
+ # unchanged otherwise; extra keyword arguments are accepted and ignored.
+ def __init__(self, filesize=None, total=None, **kwargs):
+ super().__init__()
+ if filesize:
+ filesize = int(filesize)
+ self.filesize = filesize
+ if total:
+ total = int(total)
+ self.total = total
+ # the content generator is created lazily, on first access
+ self._content_generator = None
+
+ # Build the generator from gen_random_content(total, filesize) on first
+ # use, then keep returning that same generator object.
+ @property
+ def content_generator(self):
+ if self._content_generator is None:
+ self._content_generator = gen_random_content(
+ self.total, self.filesize)
+ return self._content_generator
+
+ # Nothing is verified here: always reports success.
+ def check_config(self, *, check_write):
+ return True
+
+ # No object id is ever reported as present.
+ def __contains__(self, obj_id, *args, **kwargs):
+ return False
+
+ # Generator of object ids b'1', b'2', ... A caller may send() an integer
+ # to make the counter jump to that value; the id built from that value is
+ # what send() returns. When self.total is set, generation stops once the
+ # counter (as it was before any sent value is applied) has reached it.
+ def __iter__(self):
+ i = 1
+ while True:
+ j = yield (b'%d' % i)
+ if self.total and i >= self.total:
+ logger.debug('DONE')
+ break
+ if j is not None:
+ i = j
+ else:
+ i += 1
+
+ # obj_id is ignored; returns the next generated blob.
+ # NOTE(review): next() raises StopIteration once the content generator is
+ # exhausted (i.e. after `total` blobs) -- confirm callers expect that.
+ def get(self, obj_id, *args, **kwargs):
+ return next(self.content_generator)
+
+ # Writes are discarded.
+ def add(self, content, obj_id=None, check_presence=True, *args, **kwargs):
+ pass
+
+ # Always reports success, whatever obj_id is.
+ def check(self, obj_id, *args, **kwargs):
+ return True
+
+ # Nothing is deleted; always reports success.
+ def delete(self, obj_id, *args, **kwargs):
+ return True
+
+ # obj_id is ignored; yields the next generated blob in pieces of at most
+ # chunk_size bytes, stopping at the first empty read.
+ # NOTE(review): if the content generator is exhausted, the StopIteration
+ # from next() is raised inside this generator, which Python 3.7+ turns
+ # into RuntimeError (PEP 479) -- confirm this is acceptable.
+ def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE):
+ data = io.BytesIO(next(self.content_generator))
+ reader = functools.partial(data.read, chunk_size)
+ yield from iter(reader, b'')
+
+ # Return an iterator over at most `limit` object ids. When last_obj_id is
+ # given, the id generator is primed with next() and then told through
+ # send() to jump to int(last_obj_id), so listing resumes with the id that
+ # follows it.
+ def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT):
+ it = iter(self)
+ if last_obj_id:
+ next(it)
+ it.send(int(last_obj_id))
+ return islice(it, limit)
diff --git a/swh/objstorage/tests/test_objstorage_random_generator.py b/swh/objstorage/tests/test_objstorage_random_generator.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/tests/test_objstorage_random_generator.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# the ABC aliases in `collections` were removed in Python 3.10
+from collections.abc import Iterator
+from swh.objstorage import get_objstorage
+
+
+def test_random_generator_objstorage():
+    '''Generated blobs never exceed the largest known file size.'''
+    storage = get_objstorage('random', {})
+    assert storage
+
+    contents = []
+    for _ in range(100):
+        contents.append(storage.get(None))
+    sizes = list(map(len, contents))
+    assert max(sizes) <= 55056238
+
+
+def test_random_generator_objstorage_get_stream():
+    '''get_stream() returns an iterator that can be fully consumed.'''
+    sto = get_objstorage('random', {})
+    gen = sto.get_stream(None)
+    assert isinstance(gen, Iterator)
+    chunks = list(gen)  # ensure the iterator can be consumed
+    # A generated blob may legitimately be empty (the size distribution
+    # includes 0), in which case no chunk is produced at all; asserting a
+    # non-empty list made this test fail at random. Check the chunks that
+    # were produced instead.
+    assert all(isinstance(chunk, bytes) and chunk for chunk in chunks)
+
+
+def test_random_generator_objstorage_list_content():
+    '''list_content() honours both `limit` and `last_obj_id`.'''
+    storage = get_objstorage('random', {'total': 100})
+    assert isinstance(storage.list_content(), Iterator)
+
+    def ids(first, last):
+        # expected object ids, both ends included
+        return [b'%d' % n for n in range(first, last + 1)]
+
+    assert list(storage.list_content()) == ids(1, 100)
+    assert list(storage.list_content(limit=10)) == ids(1, 10)
+    resumed = storage.list_content(last_obj_id=b'10', limit=10)
+    assert list(resumed) == ids(11, 20)
+
+
+def test_random_generator_objstorage_total():
+    '''Iterating a storage configured with total=5 yields five ids.'''
+    storage = get_objstorage('random', {'total': 5})
+    object_ids = list(storage)
+    assert len(object_ids) == 5
+
+
+def test_random_generator_objstorage_size():
+    '''With a fixed filesize, every blob has exactly that length.'''
+    storage = get_objstorage('random', {'filesize': 10})
+    remaining = 10
+    while remaining:
+        blob = storage.get(None)
+        assert len(blob) == 10
+        remaining -= 1
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 10:16 AM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222750
Attached To
D1303: Add a random content generator backend
Event Timeline
Log In to Comment