diff --git a/requirements-swh.txt b/requirements-swh.txt index 62d97ee..0b5f629 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ swh.core[db,http] >= 0.0.60 swh.model >= 0.0.60 -swh.storage >= 0.0.172 +swh.storage >= 0.0.177 diff --git a/swh/journal/replay.py b/swh/journal/replay.py index 7743533..e1cc83d 100644 --- a/swh/journal/replay.py +++ b/swh/journal/replay.py @@ -1,519 +1,559 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import logging from time import time -from typing import Callable, Dict, List, Optional +from typing import Any, Callable, Dict, Iterable, List, Optional from sentry_sdk import capture_exception, push_scope try: from systemd.daemon import notify except ImportError: notify = None from tenacity import ( retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential, ) from swh.core.statsd import statsd from swh.model.identifiers import normalize_timestamp from swh.model.hashutil import hash_to_hex -from swh.model.model import SHA1_SIZE +from swh.model.model import BaseContent, SkippedContent, SHA1_SIZE from swh.objstorage.objstorage import ( ID_HASH_ALGO, ObjNotFoundError, ObjStorage, ) from swh.storage import HashCollision logger = logging.getLogger(__name__) GRAPH_OPERATIONS_METRIC = "swh_graph_replayer_operations_total" GRAPH_DURATION_METRIC = "swh_graph_replayer_duration_seconds" CONTENT_OPERATIONS_METRIC = "swh_content_replayer_operations_total" CONTENT_RETRY_METRIC = "swh_content_replayer_retries_total" CONTENT_BYTES_METRIC = "swh_content_replayer_bytes" CONTENT_DURATION_METRIC = "swh_content_replayer_duration_seconds" def process_replay_objects(all_objects, *, storage): for (object_type, objects) in all_objects.items(): logger.debug("Inserting %s %s objects", len(objects), object_type) with statsd.timed(GRAPH_DURATION_METRIC, tags={'object_type': object_type}): _insert_objects(object_type, objects, storage) statsd.increment(GRAPH_OPERATIONS_METRIC, len(objects), tags={'object_type': object_type}) if notify: notify('WATCHDOG=1') def _fix_revision_pypi_empty_string(rev): """PyPI loader failed to encode empty strings as bytes, see: swh:1:rev:8f0095ee0664867055d03de9bcc8f95b91d8a2b9 or https://forge.softwareheritage.org/D1772 """ rev = { **rev, 'author': rev['author'].copy(), 'committer': rev['committer'].copy(), } if rev['author'].get('email') == '': rev['author']['email'] = b'' if rev['author'].get('name') == '': rev['author']['name'] = b'' if rev['committer'].get('email') == '': rev['committer']['email'] = b'' if rev['committer'].get('name') == '': rev['committer']['name'] = b'' return rev def _fix_revision_transplant_source(rev): if rev.get('metadata') and rev['metadata'].get('extra_headers'): rev = copy.deepcopy(rev) rev['metadata']['extra_headers'] = [ [key, value.encode('ascii')] if key == 'transplant_source' and isinstance(value, str) else [key, value] for (key, value) in rev['metadata']['extra_headers']] return rev def _check_date(date): """Returns whether the date can be represented in backends with sane limits on timestamps and timezones (resp. signed 64-bits and signed 16 bits), and that microseconds is valid (ie. between 0 and 10^6). 
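    A rough illustration, with arbitrary values spelled out in full so that
    `normalize_timestamp` has nothing to fill in:

    >>> _check_date({'timestamp': {'seconds': 1565096932, 'microseconds': 0},
    ...              'offset': 0, 'negative_utc': False})
    True
    >>> _check_date({'timestamp': {'seconds': 1565096932,
    ...                            'microseconds': 10**6},
    ...              'offset': 0, 'negative_utc': False})
    False
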
""" if date is None: return True date = normalize_timestamp(date) return (-2**63 <= date['timestamp']['seconds'] < 2**63) \ and (0 <= date['timestamp']['microseconds'] < 10**6) \ and (-2**15 <= date['offset'] < 2**15) def _check_revision_date(rev): """Exclude revisions with invalid dates. See https://forge.softwareheritage.org/T1339""" return _check_date(rev['date']) and _check_date(rev['committer_date']) def _fix_revisions(revisions): good_revisions = [] for rev in revisions: rev = _fix_revision_pypi_empty_string(rev) rev = _fix_revision_transplant_source(rev) if not _check_revision_date(rev): logging.warning('Excluding revision (invalid date): %r', rev) continue if rev not in good_revisions: good_revisions.append(rev) return good_revisions def _fix_origin_visits(visits): good_visits = [] for visit in visits: visit = visit.copy() if 'type' not in visit: if isinstance(visit['origin'], dict) and 'type' in visit['origin']: # Very old version of the schema: visits did not have a type, # but their 'origin' field was a dict with a 'type' key. visit['type'] = visit['origin']['type'] else: # Very very old version of the schema: 'type' is missing, # so there is nothing we can do to fix it. raise ValueError('Got an origin_visit too old to be replayed.') if isinstance(visit['origin'], dict): # Old version of the schema: visit['origin'] was a dict. visit['origin'] = visit['origin']['url'] good_visits.append(visit) return good_visits def fix_objects(object_type, objects): """Converts a possibly old object from the journal to its current expected format. List of conversions: Empty author name/email in PyPI releases: >>> from pprint import pprint >>> date = { ... 'timestamp': { ... 'seconds': 1565096932, ... 'microseconds': 0, ... }, ... 'offset': 0, ... } >>> pprint(fix_objects('revision', [{ ... 'author': {'email': '', 'fullname': b'', 'name': ''}, ... 'committer': {'email': '', 'fullname': b'', 'name': ''}, ... 'date': date, ... 'committer_date': date, ... }])) [{'author': {'email': b'', 'fullname': b'', 'name': b''}, 'committer': {'email': b'', 'fullname': b'', 'name': b''}, 'committer_date': {'offset': 0, 'timestamp': {'microseconds': 0, 'seconds': 1565096932}}, 'date': {'offset': 0, 'timestamp': {'microseconds': 0, 'seconds': 1565096932}}}] Fix type of 'transplant_source' extra headers: >>> revs = fix_objects('revision', [{ ... 'author': {'email': '', 'fullname': b'', 'name': ''}, ... 'committer': {'email': '', 'fullname': b'', 'name': ''}, ... 'date': date, ... 'committer_date': date, ... 'metadata': { ... 'extra_headers': [ ... ['time_offset_seconds', b'-3600'], ... ['transplant_source', '29c154a012a70f49df983625090434587622b39e'] ... ]} ... }]) >>> pprint(revs[0]['metadata']['extra_headers']) [['time_offset_seconds', b'-3600'], ['transplant_source', b'29c154a012a70f49df983625090434587622b39e']] Filter out revisions with invalid dates: >>> from copy import deepcopy >>> invalid_date1 = deepcopy(date) >>> invalid_date1['timestamp']['microseconds'] = 1000000000 # > 10^6 >>> fix_objects('revision', [{ ... 'author': {'email': '', 'fullname': b'', 'name': b''}, ... 'committer': {'email': '', 'fullname': b'', 'name': b''}, ... 'date': invalid_date1, ... 'committer_date': date, ... }]) [] >>> invalid_date2 = deepcopy(date) >>> invalid_date2['timestamp']['seconds'] = 2**70 # > 10^63 >>> fix_objects('revision', [{ ... 'author': {'email': '', 'fullname': b'', 'name': b''}, ... 'committer': {'email': '', 'fullname': b'', 'name': b''}, ... 'date': invalid_date2, ... 'committer_date': date, ... 
}]) [] >>> invalid_date3 = deepcopy(date) >>> invalid_date3['offset'] = 2**20 # > 10^15 >>> fix_objects('revision', [{ ... 'author': {'email': '', 'fullname': b'', 'name': b''}, ... 'committer': {'email': '', 'fullname': b'', 'name': b''}, ... 'date': date, ... 'committer_date': invalid_date3, ... }]) [] `visit['origin']` is a dict instead of an URL: >>> pprint(fix_objects('origin_visit', [{ ... 'origin': {'url': 'http://foo'}, ... 'type': 'git', ... }])) [{'origin': 'http://foo', 'type': 'git'}] `visit['type']` is missing , but `origin['visit']['type']` exists: >>> pprint(fix_objects('origin_visit', [ ... {'origin': {'type': 'hg', 'url': 'http://foo'} ... }])) [{'origin': 'http://foo', 'type': 'hg'}] """ # noqa if object_type == 'revision': objects = _fix_revisions(objects) elif object_type == 'origin_visit': objects = _fix_origin_visits(objects) return objects +def collision_aware_content_add( + content_add_fn: Callable[[Iterable[Any]], None], + contents: List[BaseContent]) -> None: + """Add contents to storage. If a hash collision is detected, an error is + logged. Then this adds the other non colliding contents to the storage. + + Args: + content_add_fn: Storage content callable + contents: List of contents or skipped contents to add to storage + + """ + if not contents: + return + colliding_content_hashes: List[Dict[str, Any]] = [] + while True: + try: + content_add_fn(c.to_dict() for c in contents) + except HashCollision as e: + algo, hash_id, colliding_hashes = e.args + hash_id = hash_to_hex(hash_id) + colliding_content_hashes.append({ + 'algo': algo, + 'hash': hash_to_hex(hash_id), + 'objects': [{k: hash_to_hex(v) for k, v in collision.items()} + for collision in colliding_hashes] + }) + # Drop the colliding contents from the transaction + contents = [c for c in contents + if c.hashes() not in colliding_hashes] + else: + # Successfully added contents, we are done + break + if colliding_content_hashes: + for collision in colliding_content_hashes: + logger.error('Collision detected: %(collision)s', { + 'collision': collision + }) + + def _insert_objects(object_type, objects, storage): objects = fix_objects(object_type, objects) if object_type == 'content': - try: - storage.skipped_content_add( - (obj for obj in objects if obj.get('status') == 'absent')) - except HashCollision as e: - logger.error('(SkippedContent) Hash collision: %s', e.args) + contents, skipped_contents = [], [] + for content in objects: + c = BaseContent.from_dict(content) + if isinstance(c, SkippedContent): + skipped_contents.append(c) + else: + contents.append(c) - try: - storage.content_add_metadata( - (obj for obj in objects if obj.get('status') != 'absent')) - except HashCollision as e: - logger.error('(Content) Hash collision: %s', e.args) + collision_aware_content_add( + storage.skipped_content_add, skipped_contents) + collision_aware_content_add( + storage.content_add_metadata, contents) elif object_type in ('directory', 'revision', 'release', 'snapshot', 'origin'): # TODO: split batches that are too large for the storage # to handle? method = getattr(storage, object_type + '_add') method(objects) elif object_type == 'origin_visit': for visit in objects: storage.origin_add_one({'url': visit['origin']}) if 'metadata' not in visit: visit['metadata'] = None storage.origin_visit_upsert(objects) else: logger.warning('Received a series of %s, this should not happen', object_type) def is_hash_in_bytearray(hash_, array, nb_hashes, hash_size=SHA1_SIZE): """ Checks if the given hash is in the provided `array`. 
The array must be a *sorted* list of sha1 hashes, and contain `nb_hashes` hashes (so its size must by `nb_hashes*hash_size` bytes). Args: hash_ (bytes): the hash to look for array (bytes): a sorted concatenated array of hashes (may be of any type supporting slice indexing, eg. :class:`mmap.mmap`) nb_hashes (int): number of hashes in the array hash_size (int): size of a hash (defaults to 20, for SHA1) Example: >>> import os >>> hash1 = os.urandom(20) >>> hash2 = os.urandom(20) >>> hash3 = os.urandom(20) >>> array = b''.join(sorted([hash1, hash2])) >>> is_hash_in_bytearray(hash1, array, 2) True >>> is_hash_in_bytearray(hash2, array, 2) True >>> is_hash_in_bytearray(hash3, array, 2) False """ if len(hash_) != hash_size: raise ValueError('hash_ does not match the provided hash_size.') def get_hash(position): return array[position*hash_size:(position+1)*hash_size] # Regular dichotomy: left = 0 right = nb_hashes while left < right-1: middle = int((right+left)/2) pivot = get_hash(middle) if pivot == hash_: return True elif pivot < hash_: left = middle else: right = middle return get_hash(left) == hash_ class ReplayError(Exception): """An error occurred during the replay of an object""" def __init__(self, operation, *, obj_id, exc): self.operation = operation self.obj_id = hash_to_hex(obj_id) self.exc = exc def __str__(self): return "ReplayError(doing %s, %s, %s)" % ( self.operation, self.obj_id, self.exc ) def log_replay_retry(retry_obj, sleep, last_result): """Log a retry of the content replayer""" exc = last_result.exception() logger.debug('Retry operation %(operation)s on %(obj_id)s: %(exc)s', {'operation': exc.operation, 'obj_id': exc.obj_id, 'exc': str(exc.exc)}) statsd.increment(CONTENT_RETRY_METRIC, tags={ 'operation': exc.operation, 'attempt': str(retry_obj.statistics['attempt_number']), }) def log_replay_error(last_attempt): """Log a replay error to sentry""" exc = last_attempt.exception() with push_scope() as scope: scope.set_tag('operation', exc.operation) scope.set_extra('obj_id', exc.obj_id) capture_exception(exc.exc) logger.error( 'Failed operation %(operation)s on %(obj_id)s after %(retries)s' ' retries: %(exc)s', { 'obj_id': exc.obj_id, 'operation': exc.operation, 'exc': str(exc.exc), 'retries': last_attempt.attempt_number, }) return None CONTENT_REPLAY_RETRIES = 3 content_replay_retry = retry( retry=retry_if_exception_type(ReplayError), stop=stop_after_attempt(CONTENT_REPLAY_RETRIES), wait=wait_random_exponential(multiplier=1, max=60), before_sleep=log_replay_retry, retry_error_callback=log_replay_error, ) @content_replay_retry def copy_object(obj_id, src, dst): hex_obj_id = hash_to_hex(obj_id) obj = '' try: with statsd.timed(CONTENT_DURATION_METRIC, tags={'request': 'get'}): obj = src.get(obj_id) logger.debug('retrieved %(obj_id)s', {'obj_id': hex_obj_id}) with statsd.timed(CONTENT_DURATION_METRIC, tags={'request': 'put'}): dst.add(obj, obj_id=obj_id, check_presence=False) logger.debug('copied %(obj_id)s', {'obj_id': hex_obj_id}) statsd.increment(CONTENT_BYTES_METRIC, len(obj)) except ObjNotFoundError: logger.error('Failed to copy %(obj_id)s: object not found', {'obj_id': hex_obj_id}) raise except Exception as exc: raise ReplayError('copy', obj_id=obj_id, exc=exc) from None return len(obj) @content_replay_retry def obj_in_objstorage(obj_id, dst): """Check if an object is already in an objstorage, tenaciously""" try: return obj_id in dst except Exception as exc: raise ReplayError('in_dst', obj_id=obj_id, exc=exc) from None def process_replay_objects_content( all_objects: 
Dict[str, List[dict]], *, src: ObjStorage, dst: ObjStorage, exclude_fn: Optional[Callable[[dict], bool]] = None, check_dst: bool = True, ): """ Takes a list of records from Kafka (see :py:func:`swh.journal.client.JournalClient.process`) and copies them from the `src` objstorage to the `dst` objstorage, if: * `obj['status']` is `'visible'` * `exclude_fn(obj)` is `False` (if `exclude_fn` is provided) * `obj['sha1'] not in dst` (if `check_dst` is True) Args: all_objects: Objects passed by the Kafka client. Most importantly, `all_objects['content'][*]['sha1']` is the sha1 hash of each content. src: An object storage (see :py:func:`swh.objstorage.get_objstorage`) dst: An object storage (see :py:func:`swh.objstorage.get_objstorage`) exclude_fn: Determines whether an object should be copied. check_dst: Determines whether we should check the destination objstorage before copying. Example: >>> from swh.objstorage import get_objstorage >>> src = get_objstorage('memory', {}) >>> dst = get_objstorage('memory', {}) >>> id1 = src.add(b'foo bar') >>> id2 = src.add(b'baz qux') >>> kafka_partitions = { ... 'content': [ ... { ... 'sha1': id1, ... 'status': 'visible', ... }, ... { ... 'sha1': id2, ... 'status': 'visible', ... }, ... ] ... } >>> process_replay_objects_content( ... kafka_partitions, src=src, dst=dst, ... exclude_fn=lambda obj: obj['sha1'] == id1) >>> id1 in dst False >>> id2 in dst True """ vol = [] nb_skipped = 0 nb_failures = 0 t0 = time() for (object_type, objects) in all_objects.items(): if object_type != 'content': logger.warning( 'Received a series of %s, this should not happen', object_type) continue for obj in objects: obj_id = obj[ID_HASH_ALGO] if obj['status'] != 'visible': nb_skipped += 1 logger.debug('skipped %s (status=%s)', hash_to_hex(obj_id), obj['status']) statsd.increment(CONTENT_OPERATIONS_METRIC, tags={"decision": "skipped", "status": obj["status"]}) elif exclude_fn and exclude_fn(obj): nb_skipped += 1 logger.debug('skipped %s (manually excluded)', hash_to_hex(obj_id)) statsd.increment(CONTENT_OPERATIONS_METRIC, tags={"decision": "excluded"}) elif check_dst and obj_in_objstorage(obj_id, dst): nb_skipped += 1 logger.debug('skipped %s (in dst)', hash_to_hex(obj_id)) statsd.increment(CONTENT_OPERATIONS_METRIC, tags={"decision": "in_dst"}) else: try: copied = copy_object(obj_id, src, dst) except ObjNotFoundError: nb_skipped += 1 statsd.increment(CONTENT_OPERATIONS_METRIC, tags={"decision": "not_in_src"}) else: if copied is None: nb_failures += 1 statsd.increment(CONTENT_OPERATIONS_METRIC, tags={"decision": "failed"}) else: vol.append(copied) statsd.increment(CONTENT_OPERATIONS_METRIC, tags={"decision": "copied"}) dt = time() - t0 logger.info( 'processed %s content objects in %.1fsec ' '(%.1f obj/sec, %.1fMB/sec) - %d failed - %d skipped', len(vol), dt, len(vol)/dt, sum(vol)/1024/1024/dt, nb_failures, nb_skipped) if notify: notify('WATCHDOG=1') diff --git a/swh/journal/tests/conftest.py b/swh/journal/tests/conftest.py index 2689ca5..d75792a 100644 --- a/swh/journal/tests/conftest.py +++ b/swh/journal/tests/conftest.py @@ -1,245 +1,264 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pytest import logging import random import string from confluent_kafka import Consumer from subprocess import Popen from typing import Any, Dict, List, Optional, Tuple from pathlib import 
Path from pytest_kafka import ( make_zookeeper_process, make_kafka_server, ZOOKEEPER_CONFIG_TEMPLATE, ) from swh.model.hashutil import hash_to_bytes logger = logging.getLogger(__name__) - CONTENTS = [ { 'length': 3, 'sha1': hash_to_bytes( '34973274ccef6ab4dfaaf86599792fa9c3fe4689'), 'sha1_git': b'foo', 'blake2s256': b'bar', 'sha256': b'baz', 'status': 'visible', }, ] +duplicate_content1 = { + 'length': 4, + 'sha1': hash_to_bytes( + '44973274ccef6ab4dfaaf86599792fa9c3fe4689'), + 'sha1_git': b'another-foo', + 'blake2s256': b'another-bar', + 'sha256': b'another-baz', + 'status': 'visible', +} + +# Craft a sha1 collision +duplicate_content2 = duplicate_content1.copy() +sha1_array = bytearray(duplicate_content1['sha1_git']) +sha1_array[0] += 1 +duplicate_content2['sha1_git'] = bytes(sha1_array) + + +DUPLICATE_CONTENTS = [duplicate_content1, duplicate_content2] + + COMMITTERS = [ { 'fullname': b'foo', 'name': b'foo', 'email': b'', }, { 'fullname': b'bar', 'name': b'bar', 'email': b'', } ] DATES = [ { 'timestamp': { 'seconds': 1234567891, 'microseconds': 0, }, 'offset': 120, 'negative_utc': None, }, { 'timestamp': { 'seconds': 1234567892, 'microseconds': 0, }, 'offset': 120, 'negative_utc': None, } ] REVISIONS = [ { 'id': hash_to_bytes('7026b7c1a2af56521e951c01ed20f255fa054238'), 'message': b'hello', 'date': DATES[0], 'committer': COMMITTERS[0], 'author': COMMITTERS[0], 'committer_date': DATES[0], 'type': 'git', 'directory': '\x01'*20, 'synthetic': False, 'metadata': None, 'parents': [], }, { 'id': hash_to_bytes('368a48fe15b7db2383775f97c6b247011b3f14f4'), 'message': b'hello again', 'date': DATES[1], 'committer': COMMITTERS[1], 'author': COMMITTERS[1], 'committer_date': DATES[1], 'type': 'hg', 'directory': '\x02'*20, 'synthetic': False, 'metadata': None, 'parents': [], }, ] RELEASES = [ { 'id': hash_to_bytes('d81cc0710eb6cf9efd5b920a8453e1e07157b6cd'), 'name': b'v0.0.1', 'date': { 'timestamp': { 'seconds': 1234567890, 'microseconds': 0, }, 'offset': 120, 'negative_utc': None, }, 'author': COMMITTERS[0], 'target_type': 'revision', 'target': b'\x04'*20, 'message': b'foo', 'synthetic': False, }, ] ORIGINS = [ { 'url': 'https://somewhere.org/den/fox', }, { 'url': 'https://overtherainbow.org/fox/den', } ] ORIGIN_VISITS = [ { 'origin': ORIGINS[0]['url'], 'date': '2013-05-07 04:20:39.369271+00:00', 'snapshot': None, # TODO 'status': 'ongoing', # TODO 'metadata': {'foo': 'bar'}, 'type': 'git', }, { 'origin': ORIGINS[0]['url'], 'date': '2018-11-27 17:20:39+00:00', 'snapshot': None, # TODO 'status': 'ongoing', # TODO 'metadata': {'baz': 'qux'}, 'type': 'git', } ] # From type to tuple (id, ) OBJECT_TYPE_KEYS = { 'content': ('sha1', CONTENTS), 'revision': ('id', REVISIONS), 'release': ('id', RELEASES), 'origin': (None, ORIGINS), 'origin_visit': (None, ORIGIN_VISITS), } # type: Dict[str, Tuple[Optional[str], List[Dict[str, Any]]]] KAFKA_ROOT = os.environ.get('SWH_KAFKA_ROOT') KAFKA_ROOT = KAFKA_ROOT if KAFKA_ROOT else os.path.dirname(__file__) + '/kafka' if not os.path.exists(KAFKA_ROOT): msg = ('Development error: %s must exist and target an ' 'existing kafka installation' % KAFKA_ROOT) raise ValueError(msg) KAFKA_SCRIPTS = Path(KAFKA_ROOT) / 'bin' KAFKA_BIN = str(KAFKA_SCRIPTS / 'kafka-server-start.sh') ZOOKEEPER_BIN = str(KAFKA_SCRIPTS / 'zookeeper-server-start.sh') ZK_CONFIG_TEMPLATE = ZOOKEEPER_CONFIG_TEMPLATE + '\nadmin.enableServer=false\n' # Those defines fixtures zookeeper_proc = make_zookeeper_process(ZOOKEEPER_BIN, zk_config_template=ZK_CONFIG_TEMPLATE, scope='session') 
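# Note: make_zookeeper_process (above) and make_kafka_server (below) come from
# pytest_kafka and build session-scoped fixtures; the string 'zookeeper_proc'
# names the fixture the kafka broker depends on. A hypothetical test only has
# to request the fixture to get a live broker, roughly:
#
#     def test_something(kafka_server):
#         _, kafka_port = kafka_server   # (Popen, port) tuple
#         ...                            # broker listens on 127.0.0.1:<port>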
os.environ['KAFKA_LOG4J_OPTS'] = \ '-Dlog4j.configuration=file:%s/log4j.properties' % \ os.path.dirname(__file__) kafka_server = make_kafka_server(KAFKA_BIN, 'zookeeper_proc', scope='session') kafka_logger = logging.getLogger('kafka') kafka_logger.setLevel(logging.WARN) @pytest.fixture(scope='function') def kafka_prefix(): """Pick a random prefix for kafka topics on each call""" return ''.join(random.choice(string.ascii_lowercase) for _ in range(10)) @pytest.fixture(scope='function') def kafka_consumer_group(kafka_prefix: str): """Pick a random consumer group for kafka consumers on each call""" return "test-consumer-%s" % kafka_prefix TEST_CONFIG = { 'consumer_id': 'swh.journal.consumer', 'object_types': OBJECT_TYPE_KEYS.keys(), 'stop_after_objects': 1, # will read 1 object and stop 'storage': {'cls': 'memory', 'args': {}}, } @pytest.fixture def test_config(kafka_server: Tuple[Popen, int], kafka_prefix: str): """Test configuration needed for producer/consumer """ _, port = kafka_server return { **TEST_CONFIG, 'brokers': ['127.0.0.1:{}'.format(port)], 'prefix': kafka_prefix + '.swh.journal.objects', } @pytest.fixture def consumer( kafka_server: Tuple[Popen, int], test_config: Dict, kafka_consumer_group: str, ) -> Consumer: """Get a connected Kafka consumer. """ _, kafka_port = kafka_server consumer = Consumer({ 'bootstrap.servers': '127.0.0.1:{}'.format(kafka_port), 'auto.offset.reset': 'earliest', 'enable.auto.commit': True, 'group.id': kafka_consumer_group, }) kafka_topics = [ '%s.%s' % (test_config['prefix'], object_type) for object_type in test_config['object_types'] ] consumer.subscribe(kafka_topics) yield consumer consumer.close() diff --git a/swh/journal/tests/test_replay.py b/swh/journal/tests/test_replay.py index 6dfb2db..e212485 100644 --- a/swh/journal/tests/test_replay.py +++ b/swh/journal/tests/test_replay.py @@ -1,244 +1,401 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import functools +import logging import random from subprocess import Popen -from typing import Tuple +from typing import Dict, Tuple import dateutil from confluent_kafka import Producer from hypothesis import strategies, given, settings import pytest from swh.storage import get_storage from swh.journal.client import JournalClient from swh.journal.serializers import key_to_kafka, value_to_kafka from swh.journal.replay import process_replay_objects, is_hash_in_bytearray +from swh.model.hashutil import hash_to_hex +from swh.model.model import Content -from .conftest import OBJECT_TYPE_KEYS +from .conftest import OBJECT_TYPE_KEYS, DUPLICATE_CONTENTS from .utils import MockedJournalClient, MockedKafkaWriter storage_config = { 'cls': 'pipeline', 'steps': [ {'cls': 'validate'}, {'cls': 'memory'}, ] } +def make_topic(kafka_prefix: str, object_type: str) -> str: + return kafka_prefix + '.' + object_type + + def test_storage_play( kafka_prefix: str, kafka_consumer_group: str, - kafka_server: Tuple[Popen, int]): + kafka_server: Tuple[Popen, int], + caplog): + """Optimal replayer scenario. 
+ + This: + - writes objects to the topic + - replayer consumes objects from the topic and replay them + + """ (_, port) = kafka_server kafka_prefix += '.swh.journal.objects' storage = get_storage(**storage_config) producer = Producer({ 'bootstrap.servers': 'localhost:{}'.format(port), 'client.id': 'test producer', 'acks': 'all', }) now = datetime.datetime.now(tz=datetime.timezone.utc) # Fill Kafka nb_sent = 0 nb_visits = 0 for (object_type, (_, objects)) in OBJECT_TYPE_KEYS.items(): - topic = kafka_prefix + '.' + object_type + topic = make_topic(kafka_prefix, object_type) + for object_ in objects: + key = bytes(random.randint(0, 255) for _ in range(40)) + object_ = object_.copy() + if object_type == 'content': + object_['ctime'] = now + elif object_type == 'origin_visit': + nb_visits += 1 + object_['visit'] = nb_visits + producer.produce( + topic=topic, key=key_to_kafka(key), + value=value_to_kafka(object_), + ) + nb_sent += 1 + + producer.flush() + + caplog.set_level(logging.ERROR, 'swh.journal.replay') + # Fill the storage from Kafka + replayer = JournalClient( + brokers='localhost:%d' % kafka_server[1], + group_id=kafka_consumer_group, + prefix=kafka_prefix, + stop_after_objects=nb_sent, + ) + worker_fn = functools.partial(process_replay_objects, storage=storage) + nb_inserted = 0 + while nb_inserted < nb_sent: + nb_inserted += replayer.process(worker_fn) + assert nb_sent == nb_inserted + + # Check the objects were actually inserted in the storage + assert OBJECT_TYPE_KEYS['revision'][1] == \ + list(storage.revision_get( + [rev['id'] for rev in OBJECT_TYPE_KEYS['revision'][1]])) + assert OBJECT_TYPE_KEYS['release'][1] == \ + list(storage.release_get( + [rel['id'] for rel in OBJECT_TYPE_KEYS['release'][1]])) + + origins = list(storage.origin_get( + [orig for orig in OBJECT_TYPE_KEYS['origin'][1]])) + assert OBJECT_TYPE_KEYS['origin'][1] == \ + [{'url': orig['url']} for orig in origins] + for origin in origins: + origin_url = origin['url'] + expected_visits = [ + { + **visit, + 'origin': origin_url, + 'date': dateutil.parser.parse(visit['date']), + } + for visit in OBJECT_TYPE_KEYS['origin_visit'][1] + if visit['origin'] == origin['url'] + ] + actual_visits = list(storage.origin_visit_get( + origin_url)) + for visit in actual_visits: + del visit['visit'] # opaque identifier + assert expected_visits == actual_visits + + input_contents = OBJECT_TYPE_KEYS['content'][1] + contents = storage.content_get_metadata( + [cont['sha1'] for cont in input_contents]) + assert len(contents) == len(input_contents) + assert contents == {cont['sha1']: [cont] for cont in input_contents} + + collision = 0 + for record in caplog.records: + logtext = record.getMessage() + if 'Colliding contents:' in logtext: + collision += 1 + + assert collision == 0, "No collision should be detected" + + +def test_storage_play_with_collision( + kafka_prefix: str, + kafka_consumer_group: str, + kafka_server: Tuple[Popen, int], + caplog): + """Another replayer scenario with collisions. 
+ + This: + - writes objects to the topic, including colliding contents + - replayer consumes objects from the topic and replay them + - This drops the colliding contents from the replay when detected + + """ + (_, port) = kafka_server + kafka_prefix += '.swh.journal.objects' + + storage = get_storage(**storage_config) + + producer = Producer({ + 'bootstrap.servers': 'localhost:{}'.format(port), + 'client.id': 'test producer', + 'enable.idempotence': 'true', + }) + + now = datetime.datetime.now(tz=datetime.timezone.utc) + + # Fill Kafka + nb_sent = 0 + nb_visits = 0 + for (object_type, (_, objects)) in OBJECT_TYPE_KEYS.items(): + topic = make_topic(kafka_prefix, object_type) for object_ in objects: key = bytes(random.randint(0, 255) for _ in range(40)) object_ = object_.copy() if object_type == 'content': object_['ctime'] = now elif object_type == 'origin_visit': nb_visits += 1 object_['visit'] = nb_visits producer.produce( topic=topic, key=key_to_kafka(key), value=value_to_kafka(object_), ) nb_sent += 1 + # Create collision in input data + # They are not written in the destination + for content in DUPLICATE_CONTENTS: + topic = make_topic(kafka_prefix, 'content') + producer.produce( + topic=topic, key=key_to_kafka(key), + value=value_to_kafka(content), + ) + + nb_sent += 1 + producer.flush() + caplog.set_level(logging.ERROR, 'swh.journal.replay') # Fill the storage from Kafka replayer = JournalClient( brokers='localhost:%d' % kafka_server[1], group_id=kafka_consumer_group, prefix=kafka_prefix, stop_after_objects=nb_sent, ) worker_fn = functools.partial(process_replay_objects, storage=storage) nb_inserted = 0 while nb_inserted < nb_sent: nb_inserted += replayer.process(worker_fn) assert nb_sent == nb_inserted # Check the objects were actually inserted in the storage assert OBJECT_TYPE_KEYS['revision'][1] == \ list(storage.revision_get( [rev['id'] for rev in OBJECT_TYPE_KEYS['revision'][1]])) assert OBJECT_TYPE_KEYS['release'][1] == \ list(storage.release_get( [rel['id'] for rel in OBJECT_TYPE_KEYS['release'][1]])) origins = list(storage.origin_get( [orig for orig in OBJECT_TYPE_KEYS['origin'][1]])) assert OBJECT_TYPE_KEYS['origin'][1] == \ [{'url': orig['url']} for orig in origins] for origin in origins: origin_url = origin['url'] expected_visits = [ { **visit, 'origin': origin_url, 'date': dateutil.parser.parse(visit['date']), } for visit in OBJECT_TYPE_KEYS['origin_visit'][1] if visit['origin'] == origin['url'] ] actual_visits = list(storage.origin_visit_get( origin_url)) for visit in actual_visits: del visit['visit'] # opaque identifier assert expected_visits == actual_visits input_contents = OBJECT_TYPE_KEYS['content'][1] contents = storage.content_get_metadata( [cont['sha1'] for cont in input_contents]) assert len(contents) == len(input_contents) assert contents == {cont['sha1']: [cont] for cont in input_contents} + nb_collisions = 0 + + actual_collision: Dict + for record in caplog.records: + logtext = record.getMessage() + if 'Collision detected:' in logtext: + nb_collisions += 1 + actual_collision = record.args['collision'] + + assert nb_collisions == 1, "1 collision should be detected" + + algo = 'sha1' + assert actual_collision['algo'] == algo + expected_colliding_hash = hash_to_hex(DUPLICATE_CONTENTS[0][algo]) + assert actual_collision['hash'] == expected_colliding_hash + + actual_colliding_hashes = actual_collision['objects'] + assert len(actual_colliding_hashes) == len(DUPLICATE_CONTENTS) + for content in DUPLICATE_CONTENTS: + expected_content_hashes = { + k: hash_to_hex(v) + 
for k, v in Content.from_dict(content).hashes().items() + } + assert expected_content_hashes in actual_colliding_hashes + def _test_write_replay_origin_visit(visits): """Helper function to write tests for origin_visit. Each visit (a dict) given in the 'visits' argument will be sent to a (mocked) kafka queue, which a in-memory-storage backed replayer is listening to. Check that corresponding origin visits entities are present in the storage and have correct values. """ queue = [] replayer = MockedJournalClient(queue) writer = MockedKafkaWriter(queue) # Note that flipping the order of these two insertions will crash # the test, because the legacy origin_format does not allow to create # the origin when needed (type is missing) writer.send('origin', 'foo', { 'url': 'http://example.com/', 'type': 'git', }) for visit in visits: writer.send('origin_visit', 'foo', visit) queue_size = len(queue) assert replayer.stop_after_objects is None replayer.stop_after_objects = queue_size storage = get_storage(**storage_config) worker_fn = functools.partial(process_replay_objects, storage=storage) replayer.process(worker_fn) actual_visits = list(storage.origin_visit_get('http://example.com/')) assert len(actual_visits) == len(visits), actual_visits for vin, vout in zip(visits, actual_visits): vin = vin.copy() vout = vout.copy() assert vout.pop('origin') == 'http://example.com/' vin.pop('origin') vin.setdefault('type', 'git') vin.setdefault('metadata', None) assert vin == vout def test_write_replay_origin_visit(): """Test origin_visit when the 'origin' is just a string.""" now = datetime.datetime.now() visits = [{ 'visit': 1, 'origin': 'http://example.com/', 'date': now, 'type': 'git', 'status': 'partial', 'snapshot': None, }] _test_write_replay_origin_visit(visits) def test_write_replay_legacy_origin_visit1(): """Test origin_visit when there is no type.""" now = datetime.datetime.now() visits = [{ 'visit': 1, 'origin': 'http://example.com/', 'date': now, 'status': 'partial', 'snapshot': None, }] with pytest.raises(ValueError, match='too old'): _test_write_replay_origin_visit(visits) def test_write_replay_legacy_origin_visit2(): """Test origin_visit when 'type' is missing from the visit, but not from the origin.""" now = datetime.datetime.now() visits = [{ 'visit': 1, 'origin': { 'url': 'http://example.com/', 'type': 'git', }, 'date': now, 'type': 'git', 'status': 'partial', 'snapshot': None, }] _test_write_replay_origin_visit(visits) def test_write_replay_legacy_origin_visit3(): """Test origin_visit when the origin is a dict""" now = datetime.datetime.now() visits = [{ 'visit': 1, 'origin': { 'url': 'http://example.com/', }, 'date': now, 'type': 'git', 'status': 'partial', 'snapshot': None, }] _test_write_replay_origin_visit(visits) hash_strategy = strategies.binary(min_size=20, max_size=20) @settings(max_examples=500) @given(strategies.sets(hash_strategy, min_size=0, max_size=500), strategies.sets(hash_strategy, min_size=10)) def test_is_hash_in_bytearray(haystack, needles): array = b''.join(sorted(haystack)) needles |= haystack # Exhaustively test for all objects in the array for needle in needles: assert is_hash_in_bytearray(needle, array, len(haystack)) == \ (needle in haystack)
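
A minimal usage sketch of the new collision_aware_content_add helper, assuming the same
in-memory storage pipeline and DUPLICATE_CONTENTS fixture exercised by
test_storage_play_with_collision (the ctime value is an arbitrary stand-in, as in the
tests):

    import datetime

    from swh.journal.replay import collision_aware_content_add
    from swh.journal.tests.conftest import DUPLICATE_CONTENTS
    from swh.model.model import BaseContent
    from swh.storage import get_storage

    storage = get_storage(**{
        'cls': 'pipeline',
        'steps': [{'cls': 'validate'}, {'cls': 'memory'}],
    })

    now = datetime.datetime.now(tz=datetime.timezone.utc)
    contents = [BaseContent.from_dict({**d, 'ctime': now})
                for d in DUPLICATE_CONTENTS]

    # The two contents share the same sha1, so the backend should raise
    # HashCollision on the first attempt; the helper unpacks
    # (algo, hash_id, colliding_hashes) from e.args, logs
    # 'Collision detected: ...' and retries with the colliding contents
    # dropped from the batch.
    collision_aware_content_add(storage.content_add_metadata, contents)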