diff --git a/swh/journal/replay.py b/swh/journal/replay.py
--- a/swh/journal/replay.py
+++ b/swh/journal/replay.py
@@ -15,6 +15,9 @@
 logger = logging.getLogger(__name__)
 
+SHA1_SIZE = 20
+
+
 def process_replay_objects(all_objects, *, storage):
     for (object_type, objects) in all_objects.items():
         _insert_objects(object_type, objects, storage)
 
@@ -55,6 +58,54 @@
                        object_type)
 
 
+def is_hash_in_bytearray(hash_, array, nb_hashes, hash_size=SHA1_SIZE):
+    """
+    Checks if the given hash is in the provided `array`. The array must be
+    a *sorted* list of sha1 hashes, and contain `nb_hashes` hashes
+    (so its size must be `nb_hashes*hash_size` bytes).
+
+    Args:
+        hash_ (bytes): the hash to look for
+        array (bytes): a sorted concatenated array of hashes (may be of
+            any type supporting slice indexing, e.g. :py:class:`mmap.mmap`)
+        nb_hashes (int): number of hashes in the array
+        hash_size (int): size of a hash (defaults to 20, for SHA1)
+
+    Example:
+
+    >>> import os
+    >>> hash1 = os.urandom(20)
+    >>> hash2 = os.urandom(20)
+    >>> hash3 = os.urandom(20)
+    >>> array = b''.join(sorted([hash1, hash2]))
+    >>> is_hash_in_bytearray(hash1, array, 2)
+    True
+    >>> is_hash_in_bytearray(hash2, array, 2)
+    True
+    >>> is_hash_in_bytearray(hash3, array, 2)
+    False
+    """
+    if len(hash_) != hash_size:
+        raise ValueError('hash_ does not match the provided hash_size.')
+
+    def get_hash(position):
+        return array[position*hash_size:(position+1)*hash_size]
+
+    # Standard binary search over the fixed-size records:
+    left = 0
+    right = nb_hashes
+    while left < right-1:
+        middle = (left + right) // 2
+        pivot = get_hash(middle)
+        if pivot == hash_:
+            return True
+        elif pivot < hash_:
+            left = middle
+        else:
+            right = middle
+    return get_hash(left) == hash_
+
+
 def copy_object(obj_id, src, dst):
     statsd_name = 'swh_journal_content_replayer_%s_duration_seconds'
     try:
diff --git a/swh/journal/tests/test_replay.py b/swh/journal/tests/test_replay.py
--- a/swh/journal/tests/test_replay.py
+++ b/swh/journal/tests/test_replay.py
@@ -11,13 +11,14 @@
 import dateutil
 from kafka import KafkaProducer
+from hypothesis import strategies, given, settings
 
 from swh.storage import get_storage
 from swh.storage.in_memory import ENABLE_ORIGIN_IDS
 
 from swh.journal.client import JournalClient
 from swh.journal.serializers import key_to_kafka, value_to_kafka
-from swh.journal.replay import process_replay_objects
+from swh.journal.replay import process_replay_objects, is_hash_in_bytearray
 
 from .conftest import OBJECT_TYPE_KEYS
 from .utils import MockedJournalClient, MockedKafkaWriter
 
@@ -199,3 +200,17 @@
         'date': now,
         'type': 'git',
     }]
+
+
+hash_strategy = strategies.binary(min_size=20, max_size=20)
+
+
+@settings(max_examples=500)
+@given(strategies.sets(hash_strategy, min_size=0, max_size=500),
+       strategies.sets(hash_strategy, min_size=10))
+def test_is_hash_in_bytearray(haystack, needles):
+    array = b''.join(sorted(haystack))
+    needles |= haystack  # Exhaustively test for all objects in the array
+    for needle in needles:
+        assert is_hash_in_bytearray(needle, array, len(haystack)) == (
+            needle in haystack)
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -7,7 +7,7 @@
   .[testing]
   pytest-cov
 commands =
-  pytest --cov=swh --cov-branch {posargs}
+  pytest --cov=swh --cov-branch --doctest-modules {posargs}
 
 [testenv:py3-no-origin-ids]
 passenv=SWH_KAFKA_ROOT
@@ -17,7 +17,7 @@
 setenv =
   SWH_STORAGE_IN_MEMORY_ENABLE_ORIGIN_IDS=false
 commands =
-  pytest --cov=swh --cov-branch {posargs}
+  pytest --cov=swh --cov-branch --doctest-modules {posargs}
 
 [testenv:flake8]
 skip_install = true
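
Note (illustrative, not part of the patch): the docstring advertises that `array` only needs to support slice indexing, e.g. an :py:class:`mmap.mmap`, which is what makes the helper useful for very large hash sets: a pre-sorted file of concatenated sha1s can be probed in O(log n) slices without loading it into memory. A minimal sketch of that usage, assuming a hypothetical file `sorted_sha1s.bin` holding nothing but sorted, concatenated 20-byte hashes (the file name, its layout, and the `sha1_in_file` helper are assumptions for the example, not part of swh.journal):

    import mmap
    import os

    from swh.journal.replay import SHA1_SIZE, is_hash_in_bytearray


    def sha1_in_file(path, hash_):
        """Probe a file of sorted, concatenated sha1 hashes for hash_."""
        size = os.path.getsize(path)
        if size == 0:
            # mmap rejects empty files, and an empty set has no members.
            return False
        assert size % SHA1_SIZE == 0, 'file must be whole 20-byte records'
        with open(path, 'rb') as f:
            # Map the file read-only; is_hash_in_bytearray only slices it,
            # so the kernel pages in just the few records the search touches.
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as array:
                return is_hash_in_bytearray(hash_, array, size // SHA1_SIZE)

    # e.g., look up the well-known sha1 of the empty string:
    found = sha1_in_file('sorted_sha1s.bin', bytes.fromhex(
        'da39a3ee5e6b4b0d3255bfef95601890afd80709'))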
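
Two details of the search are worth noting for reviewers: the loop keeps any possible match inside positions [left, right) and stops when a single candidate remains, which the final `get_hash(left) == hash_` comparison then checks; this also makes the `nb_hashes == 0` case fall out correctly, since `get_hash(0)` is then empty and never equals a full-size hash. And at 20 bytes per record, even a billion-hash set is a ~20 GB file that never has to fit in RAM, which is presumably the point of supporting mmap'd arrays here.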