diff --git a/swh/journal/replay.py b/swh/journal/replay.py
--- a/swh/journal/replay.py
+++ b/swh/journal/replay.py
@@ -15,6 +15,9 @@
 logger = logging.getLogger(__name__)
 
+SHA1_SIZE = 20
+
+
 def process_replay_objects(all_objects, *, storage):
     for (object_type, objects) in all_objects.items():
         _insert_objects(object_type, objects, storage)
 
@@ -55,6 +58,54 @@
                        object_type)
 
 
+def is_hash_in_bytearray(hash_, array, nb_hashes, hash_size=SHA1_SIZE):
+    """
+    Checks if the given hash is in the provided `array`. The array must be
+    a *sorted* list of sha1 hashes, and contain `nb_hashes` hashes
+    (so its size must be `nb_hashes*hash_size` bytes).
+
+    Args:
+        hash_ (bytes): the hash to look for
+        array (bytes): a sorted concatenated array of hashes (may be of
+            any type supporting slice indexing, e.g. :py:class:`mmap.mmap`)
+        nb_hashes (int): number of hashes in the array
+        hash_size (int): size of a hash (defaults to 20, for SHA1)
+
+    Example:
+
+    >>> import os
+    >>> hash1 = os.urandom(20)
+    >>> hash2 = os.urandom(20)
+    >>> hash3 = os.urandom(20)
+    >>> array = b''.join(sorted([hash1, hash2]))
+    >>> is_hash_in_bytearray(hash1, array, 2)
+    True
+    >>> is_hash_in_bytearray(hash2, array, 2)
+    True
+    >>> is_hash_in_bytearray(hash3, array, 2)
+    False
+    """
+    if len(hash_) != hash_size:
+        raise ValueError('hash_ does not match the provided hash_size.')
+
+    def get_hash(position):
+        return array[position*hash_size:(position+1)*hash_size]
+
+    # Standard binary search over the fixed-size records:
+    left = 0
+    right = nb_hashes
+    while left < right-1:
+        middle = (left + right) // 2
+        pivot = get_hash(middle)
+        if pivot == hash_:
+            return True
+        elif pivot < hash_:
+            left = middle
+        else:
+            right = middle
+    return get_hash(left) == hash_
+
+
 def copy_object(obj_id, src, dst):
     statsd_name = 'swh_journal_content_replayer_%s_duration_seconds'
     try:
diff --git a/swh/journal/tests/test_replay.py b/swh/journal/tests/test_replay.py
--- a/swh/journal/tests/test_replay.py
+++ b/swh/journal/tests/test_replay.py
@@ -11,13 +11,14 @@
 import dateutil
 from kafka import KafkaProducer
+from hypothesis import strategies, given, settings
 
 from swh.storage import get_storage
 from swh.storage.in_memory import ENABLE_ORIGIN_IDS
 
 from swh.journal.client import JournalClient
 from swh.journal.serializers import key_to_kafka, value_to_kafka
-from swh.journal.replay import process_replay_objects
+from swh.journal.replay import process_replay_objects, is_hash_in_bytearray
 
 from .conftest import OBJECT_TYPE_KEYS
 from .utils import MockedJournalClient, MockedKafkaWriter
 
@@ -199,3 +200,17 @@
         'date': now,
         'type': 'git',
     }]
+
+
+hash_strategy = strategies.binary(min_size=20, max_size=20)
+
+
+@settings(max_examples=500)
+@given(strategies.sets(hash_strategy, min_size=0, max_size=500),
+       strategies.sets(hash_strategy, min_size=10))
+def test_is_hash_in_bytearray(haystack, needles):
+    array = b''.join(sorted(haystack))
+    needles |= haystack  # Exhaustively test for all objects in the array
+    for needle in needles:
+        assert is_hash_in_bytearray(needle, array, len(haystack)) == (
+            needle in haystack)
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -7,7 +7,7 @@
   .[testing]
   pytest-cov
 commands =
-  pytest --cov=swh --cov-branch {posargs}
+  pytest --cov=swh --cov-branch --doctest-modules {posargs}
 
 [testenv:py3-no-origin-ids]
 passenv=SWH_KAFKA_ROOT
@@ -17,7 +17,7 @@
 setenv =
   SWH_STORAGE_IN_MEMORY_ENABLE_ORIGIN_IDS=false
 commands =
-  pytest --cov=swh --cov-branch {posargs}
+  pytest --cov=swh --cov-branch --doctest-modules {posargs}
 
 [testenv:flake8]
 skip_install = true
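
Note (illustrative, not part of the patch): the docstring advertises that `array` only needs to support slice indexing, e.g. an :py:class:`mmap.mmap`, which is what makes the helper useful for very large hash sets: a pre-sorted file of concatenated sha1s can be probed in O(log n) slices without loading it into memory. A minimal sketch of that usage, assuming a hypothetical file `sorted_sha1s.bin` holding nothing but sorted, concatenated 20-byte hashes (the file name, its layout, and the `sha1_in_file` helper are assumptions for the example, not part of swh.journal):

    import mmap
    import os

    from swh.journal.replay import SHA1_SIZE, is_hash_in_bytearray


    def sha1_in_file(path, hash_):
        """Probe a file of sorted, concatenated sha1 hashes for hash_."""
        size = os.path.getsize(path)
        if size == 0:
            # mmap rejects empty files, and an empty set has no members.
            return False
        assert size % SHA1_SIZE == 0, 'file must be whole 20-byte records'
        with open(path, 'rb') as f:
            # Map the file read-only; is_hash_in_bytearray only slices it,
            # so the kernel pages in just the few records the search touches.
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as array:
                return is_hash_in_bytearray(hash_, array, size // SHA1_SIZE)

    # e.g., look up the well-known sha1 of the empty string:
    found = sha1_in_file('sorted_sha1s.bin', bytes.fromhex(
        'da39a3ee5e6b4b0d3255bfef95601890afd80709'))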
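
Two details of the search are worth noting for reviewers: the loop keeps any possible match inside positions [left, right) and stops when a single candidate remains, which the final `get_hash(left) == hash_` comparison then checks; this also makes the `nb_hashes == 0` case fall out correctly, since `get_hash(0)` is then empty and never equals a full-size hash. And at 20 bytes per record, even a billion-hash set is a ~20 GB file that never has to fit in RAM, which is presumably the point of supporting mmap'd arrays here.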