diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
index 4987333..d853f17 100644
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -1,452 +1,455 @@
 # Copyright (C) 2015-2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 
 import json
 import psycopg2
 import psycopg2.pool
 
-from collections import defaultdict
+from collections import defaultdict, Counter
 
 from swh.storage.common import db_transaction_generator, db_transaction
 from swh.storage.exc import StorageDBError
-from .db import Db
 
 from . import converters
+from .db import Db
+from .exc import IndexerStorageArgumentException, DuplicateId
 
 
 INDEXER_CFG_KEY = 'indexer_storage'
 
 
 MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info']
 
 
 def get_indexer_storage(cls, args):
     """Get an indexer storage object of class `storage_class` with
     arguments `storage_args`.
 
     Args:
         cls (str): storage's class, either 'local' or 'remote'
         args (dict): dictionary of arguments passed to the
             storage class constructor
 
     Returns:
         an instance of swh.indexer's storage (either local or remote)
 
     Raises:
         ValueError if passed an unknown storage class.
 
     """
     if cls == 'remote':
         from .api.client import RemoteStorage as IndexerStorage
     elif cls == 'local':
         from . import IndexerStorage
     elif cls == 'memory':
         from .in_memory import IndexerStorage
     else:
         raise ValueError('Unknown indexer storage class `%s`' % cls)
 
     return IndexerStorage(**args)
 
 
-def _check_id_duplicates(data):
+def check_id_duplicates(data):
     """
     If any two dictionaries in `data` have the same id, raises
-    a `ValueError`.
+    a `DuplicateId` exception.
 
     Values associated to the key must be hashable.
 
     Args:
         data (List[dict]): List of dictionaries to be inserted
 
-    >>> _check_id_duplicates([
+    >>> check_id_duplicates([
     ...     {'id': 'foo', 'data': 'spam'},
     ...     {'id': 'bar', 'data': 'egg'},
     ... ])
-    >>> _check_id_duplicates([
+    >>> check_id_duplicates([
     ...     {'id': 'foo', 'data': 'spam'},
     ...     {'id': 'foo', 'data': 'egg'},
     ... ])
     Traceback (most recent call last):
       ...
-    ValueError: The same id is present more than once.
+    swh.indexer.storage.exc.DuplicateId: ['foo']
     """
-    if len({item['id'] for item in data}) < len(data):
-        raise ValueError('The same id is present more than once.')
+    counter = Counter(item['id'] for item in data)
+    duplicates = [id_ for (id_, count) in counter.items() if count >= 2]
+    if duplicates:
+        raise DuplicateId(duplicates)
 
 
 class IndexerStorage:
     """SWH Indexer Storage
 
     """
     def __init__(self, db, min_pool_conns=1, max_pool_conns=10):
         """
         Args:
             db_conn: either a libpq connection string, or a psycopg2 connection
 
         """
         try:
             if isinstance(db, psycopg2.extensions.connection):
                 self._pool = None
                 self._db = Db(db)
             else:
                 self._pool = psycopg2.pool.ThreadedConnectionPool(
                     min_pool_conns, max_pool_conns, db
                 )
                 self._db = None
         except psycopg2.OperationalError as e:
             raise StorageDBError(e)
 
     def get_db(self):
         if self._db:
             return self._db
         return Db.from_pool(self._pool)
 
     def put_db(self, db):
         if db is not self._db:
             db.put_conn()
 
     @db_transaction()
     def check_config(self, *, check_write, db=None, cur=None):
         # Check permissions on one of the tables
         if check_write:
             check = 'INSERT'
         else:
             check = 'SELECT'
 
         cur.execute(
             "select has_table_privilege(current_user, 'content_mimetype', %s)",  # noqa
             (check,)
         )
         return cur.fetchone()[0]
 
     @db_transaction_generator()
     def content_mimetype_missing(self, mimetypes, db=None, cur=None):
         for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
             yield obj[0]
 
     def _content_get_range(self, content_type, start, end,
                            indexer_configuration_id, limit=1000,
                            with_textual_data=False,
                            db=None, cur=None):
         if limit is None:
-            raise ValueError('Development error: limit should not be None')
+            raise IndexerStorageArgumentException('limit should not be None')
         if content_type not in db.content_indexer_names:
-            err = 'Development error: Wrong type. Should be one of [%s]' % (
+            err = 'Wrong type. Should be one of [%s]' % (
                 ','.join(db.content_indexer_names))
-            raise ValueError(err)
+            raise IndexerStorageArgumentException(err)
 
         ids = []
         next_id = None
         for counter, obj in enumerate(db.content_get_range(
                 content_type, start, end, indexer_configuration_id,
                 limit=limit+1, with_textual_data=with_textual_data, cur=cur)):
             _id = obj[0]
             if counter >= limit:
                 next_id = _id
                 break
 
             ids.append(_id)
 
         return {
             'ids': ids,
             'next': next_id
         }
 
     @db_transaction()
     def content_mimetype_get_range(self, start, end, indexer_configuration_id,
                                    limit=1000, db=None, cur=None):
         return self._content_get_range('mimetype', start, end,
                                        indexer_configuration_id, limit=limit,
                                        db=db, cur=cur)
 
     @db_transaction()
     def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
                              cur=None):
-        _check_id_duplicates(mimetypes)
+        check_id_duplicates(mimetypes)
         mimetypes.sort(key=lambda m: m['id'])
         db.mktemp_content_mimetype(cur)
         db.copy_to(mimetypes, 'tmp_content_mimetype',
                    ['id', 'mimetype', 'encoding', 'indexer_configuration_id'],
                    cur)
         db.content_mimetype_add_from_temp(conflict_update, cur)
 
     @db_transaction_generator()
     def content_mimetype_get(self, ids, db=None, cur=None):
         for c in db.content_mimetype_get_from_list(ids, cur):
             yield converters.db_to_mimetype(
                 dict(zip(db.content_mimetype_cols, c)))
 
     @db_transaction_generator()
     def content_language_missing(self, languages, db=None, cur=None):
         for obj in db.content_language_missing_from_list(languages, cur):
             yield obj[0]
 
     @db_transaction_generator()
     def content_language_get(self, ids, db=None, cur=None):
         for c in db.content_language_get_from_list(ids, cur):
             yield converters.db_to_language(
                 dict(zip(db.content_language_cols, c)))
 
     @db_transaction()
     def content_language_add(self, languages, conflict_update=False, db=None,
                              cur=None):
-        _check_id_duplicates(languages)
+        check_id_duplicates(languages)
         languages.sort(key=lambda m: m['id'])
         db.mktemp_content_language(cur)
         # empty language is mapped to 'unknown'
         db.copy_to(
             ({
                 'id': l['id'],
                 'lang': 'unknown' if not l['lang'] else l['lang'],
                 'indexer_configuration_id': l['indexer_configuration_id'],
             } for l in languages),
             'tmp_content_language',
             ['id', 'lang', 'indexer_configuration_id'], cur)
 
         db.content_language_add_from_temp(conflict_update, cur)
 
     @db_transaction_generator()
     def content_ctags_missing(self, ctags, db=None, cur=None):
         for obj in db.content_ctags_missing_from_list(ctags, cur):
             yield obj[0]
 
     @db_transaction_generator()
     def content_ctags_get(self, ids, db=None, cur=None):
         for c in db.content_ctags_get_from_list(ids, cur):
             yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c)))
 
     @db_transaction()
     def content_ctags_add(self, ctags, conflict_update=False, db=None,
                           cur=None):
-        _check_id_duplicates(ctags)
+        check_id_duplicates(ctags)
         ctags.sort(key=lambda m: m['id'])
 
         def _convert_ctags(__ctags):
             """Convert ctags dict to list of ctags.
 
             """
             for ctags in __ctags:
                 yield from converters.ctags_to_db(ctags)
 
         db.mktemp_content_ctags(cur)
         db.copy_to(list(_convert_ctags(ctags)),
                    tblname='tmp_content_ctags',
                    columns=['id', 'name', 'kind', 'line',
                             'lang', 'indexer_configuration_id'],
                    cur=cur)
 
         db.content_ctags_add_from_temp(conflict_update, cur)
 
     @db_transaction_generator()
     def content_ctags_search(self, expression,
                              limit=10, last_sha1=None, db=None, cur=None):
         for obj in db.content_ctags_search(expression, last_sha1, limit,
                                            cur=cur):
             yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj)))
 
     @db_transaction_generator()
     def content_fossology_license_get(self, ids, db=None, cur=None):
         d = defaultdict(list)
         for c in db.content_fossology_license_get_from_list(ids, cur):
             license = dict(zip(db.content_fossology_license_cols, c))
 
             id_ = license['id']
             d[id_].append(converters.db_to_fossology_license(license))
 
         for id_, facts in d.items():
             yield {id_: facts}
 
     @db_transaction()
     def content_fossology_license_add(self, licenses, conflict_update=False,
                                       db=None, cur=None):
-        _check_id_duplicates(licenses)
+        check_id_duplicates(licenses)
         licenses.sort(key=lambda m: m['id'])
         db.mktemp_content_fossology_license(cur)
         db.copy_to(
             ({
                 'id': sha1['id'],
                 'indexer_configuration_id': sha1['indexer_configuration_id'],
                 'license': license,
               } for sha1 in licenses
                 for license in sha1['licenses']),
             tblname='tmp_content_fossology_license',
             columns=['id', 'license', 'indexer_configuration_id'],
             cur=cur)
         db.content_fossology_license_add_from_temp(conflict_update, cur)
 
     @db_transaction()
     def content_fossology_license_get_range(
             self, start, end, indexer_configuration_id,
             limit=1000, db=None, cur=None):
         return self._content_get_range('fossology_license', start, end,
                                        indexer_configuration_id, limit=limit,
                                        with_textual_data=True, db=db, cur=cur)
 
     @db_transaction_generator()
     def content_metadata_missing(self, metadata, db=None, cur=None):
         for obj in db.content_metadata_missing_from_list(metadata, cur):
             yield obj[0]
 
     @db_transaction_generator()
     def content_metadata_get(self, ids, db=None, cur=None):
         for c in db.content_metadata_get_from_list(ids, cur):
             yield converters.db_to_metadata(
                 dict(zip(db.content_metadata_cols, c)))
 
     @db_transaction()
     def content_metadata_add(self, metadata, conflict_update=False, db=None,
                              cur=None):
-        _check_id_duplicates(metadata)
+        check_id_duplicates(metadata)
         metadata.sort(key=lambda m: m['id'])
 
         db.mktemp_content_metadata(cur)
 
         db.copy_to(metadata, 'tmp_content_metadata',
                    ['id', 'metadata', 'indexer_configuration_id'],
                    cur)
         db.content_metadata_add_from_temp(conflict_update, cur)
 
     @db_transaction_generator()
     def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None):
         for obj in db.revision_intrinsic_metadata_missing_from_list(
                 metadata, cur):
             yield obj[0]
 
     @db_transaction_generator()
     def revision_intrinsic_metadata_get(self, ids, db=None, cur=None):
         for c in db.revision_intrinsic_metadata_get_from_list(ids, cur):
             yield converters.db_to_metadata(
                 dict(zip(db.revision_intrinsic_metadata_cols, c)))
 
     @db_transaction()
     def revision_intrinsic_metadata_add(self, metadata, conflict_update=False,
                                         db=None, cur=None):
-        _check_id_duplicates(metadata)
+        check_id_duplicates(metadata)
         metadata.sort(key=lambda m: m['id'])
 
         db.mktemp_revision_intrinsic_metadata(cur)
 
         db.copy_to(metadata, 'tmp_revision_intrinsic_metadata',
                    ['id', 'metadata', 'mappings',
                     'indexer_configuration_id'],
                    cur)
         db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur)
 
     @db_transaction()
     def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None):
         db.revision_intrinsic_metadata_delete(entries, cur)
 
     @db_transaction_generator()
     def origin_intrinsic_metadata_get(self, ids, db=None, cur=None):
         for c in db.origin_intrinsic_metadata_get_from_list(ids, cur):
             yield converters.db_to_metadata(
                 dict(zip(db.origin_intrinsic_metadata_cols, c)))
 
     @db_transaction()
     def origin_intrinsic_metadata_add(self, metadata,
                                       conflict_update=False, db=None,
                                       cur=None):
-        _check_id_duplicates(metadata)
+        check_id_duplicates(metadata)
         metadata.sort(key=lambda m: m['id'])
 
         db.mktemp_origin_intrinsic_metadata(cur)
 
         db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
                    ['id', 'metadata',
                     'indexer_configuration_id',
                     'from_revision', 'mappings'],
                    cur)
         db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
 
     @db_transaction()
     def origin_intrinsic_metadata_delete(
             self, entries, db=None, cur=None):
         db.origin_intrinsic_metadata_delete(entries, cur)
 
     @db_transaction_generator()
     def origin_intrinsic_metadata_search_fulltext(
             self, conjunction, limit=100, db=None, cur=None):
         for c in db.origin_intrinsic_metadata_search_fulltext(
                 conjunction, limit=limit, cur=cur):
             yield converters.db_to_metadata(
                 dict(zip(db.origin_intrinsic_metadata_cols, c)))
 
     @db_transaction()
     def origin_intrinsic_metadata_search_by_producer(
             self, page_token='', limit=100, ids_only=False,
             mappings=None, tool_ids=None,
             db=None, cur=None):
         assert isinstance(page_token, str)
         # we go to limit+1 to check whether we should add next_page_token in
         # the response
         res = db.origin_intrinsic_metadata_search_by_producer(
             page_token, limit + 1, ids_only, mappings, tool_ids, cur)
         result = {}
         if ids_only:
             result['origins'] = [origin for (origin,) in res]
             if len(result['origins']) > limit:
                 result['origins'][limit:] = []
                 result['next_page_token'] = result['origins'][-1]
         else:
             result['origins'] = [converters.db_to_metadata(
                 dict(zip(db.origin_intrinsic_metadata_cols, c)))for c in res]
             if len(result['origins']) > limit:
                 result['origins'][limit:] = []
                 result['next_page_token'] = result['origins'][-1]['id']
         return result
 
     @db_transaction()
     def origin_intrinsic_metadata_stats(
             self, db=None, cur=None):
         mapping_names = [m for m in MAPPING_NAMES]
         select_parts = []
 
         # Count rows for each mapping
         for mapping_name in mapping_names:
             select_parts.append((
                 "sum(case when (mappings @> ARRAY['%s']) "
                 "         then 1 else 0 end)"
                 ) % mapping_name)
 
         # Total
         select_parts.append("sum(1)")
 
         # Rows whose metadata has at least one key that is not '@context'
         select_parts.append(
             "sum(case when ('{}'::jsonb @> (metadata - '@context')) "
             "         then 0 else 1 end)")
         cur.execute('select ' + ', '.join(select_parts)
                     + ' from origin_intrinsic_metadata')
         results = dict(zip(mapping_names + ['total', 'non_empty'],
                            cur.fetchone()))
         return {
             'total': results.pop('total'),
             'non_empty': results.pop('non_empty'),
             'per_mapping': results,
         }
 
     @db_transaction_generator()
     def indexer_configuration_add(self, tools, db=None, cur=None):
         db.mktemp_indexer_configuration(cur)
         db.copy_to(tools, 'tmp_indexer_configuration',
                    ['tool_name', 'tool_version', 'tool_configuration'],
                    cur)
 
         tools = db.indexer_configuration_add_from_temp(cur)
         for line in tools:
             yield dict(zip(db.indexer_configuration_cols, line))
 
     @db_transaction()
     def indexer_configuration_get(self, tool, db=None, cur=None):
         tool_conf = tool['tool_configuration']
         if isinstance(tool_conf, dict):
             tool_conf = json.dumps(tool_conf)
         idx = db.indexer_configuration_get(tool['tool_name'],
                                            tool['tool_version'],
                                            tool_conf)
         if not idx:
             return None
         return dict(zip(db.indexer_configuration_cols, idx))
diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py
index 0e62adc..f1768f9 100644
--- a/swh/indexer/storage/api/client.py
+++ b/swh/indexer/storage/api/client.py
@@ -1,17 +1,23 @@
 # Copyright (C) 2015-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from swh.core.api import RPCClient
 
-from swh.storage.exc import StorageAPIError
+from swh.indexer.storage.exc import (
+    IndexerStorageAPIError, IndexerStorageArgumentException,
+    DuplicateId,
+)
 
 from ..interface import IndexerStorageInterface
 
 
 class RemoteStorage(RPCClient):
     """Proxy to a remote storage API"""
 
     backend_class = IndexerStorageInterface
-    api_exception = StorageAPIError
+    api_exception = IndexerStorageAPIError
+    reraise_exceptions = [
+        IndexerStorageArgumentException, DuplicateId
+    ]
diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py
index 2b71f2b..0266a55 100644
--- a/swh/indexer/storage/api/server.py
+++ b/swh/indexer/storage/api/server.py
@@ -1,107 +1,113 @@
 # Copyright (C) 2015-2019  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 import logging
 
 from swh.core import config
 from swh.core.api import (RPCServerApp, error_handler,
                           encode_data_server as encode_data)
 from swh.indexer.storage import (
     get_indexer_storage, INDEXER_CFG_KEY
 )
+from swh.indexer.storage.exc import IndexerStorageArgumentException
 from swh.indexer.storage.interface import IndexerStorageInterface
 
 
 def get_storage():
     global storage
     if not storage:
         storage = get_indexer_storage(**app.config[INDEXER_CFG_KEY])
 
     return storage
 
 
 app = RPCServerApp(__name__,
                    backend_class=IndexerStorageInterface,
                    backend_factory=get_storage)
 storage = None
 
 
 @app.errorhandler(Exception)
 def my_error_handler(exception):
     return error_handler(exception, encode_data)
 
 
+@app.errorhandler(IndexerStorageArgumentException)
+def argument_error_handler(exception):
+    return error_handler(exception, encode_data, status_code=400)
+
+
 @app.route('/')
 def index():
     return 'SWH Indexer Storage API server'
 
 
 api_cfg = None
 
 
 def load_and_check_config(config_file, type='local'):
     """Check the minimal configuration is set to run the api or raise an
        error explanation.
 
     Args:
         config_file (str): Path to the configuration file to load
         type (str): configuration type. For 'local' type, more
                     checks are done.
 
     Raises:
         Error if the setup is not as expected
 
     Returns:
         configuration as a dict
 
     """
     if not config_file:
         raise EnvironmentError('Configuration file must be defined')
 
     if not os.path.exists(config_file):
         raise FileNotFoundError('Configuration file %s does not exist' % (
             config_file, ))
 
     cfg = config.read(config_file)
     if 'indexer_storage' not in cfg:
         raise KeyError("Missing '%indexer_storage' configuration")
 
     if type == 'local':
         vcfg = cfg['indexer_storage']
         cls = vcfg.get('cls')
         if cls != 'local':
             raise ValueError(
                 "The indexer_storage backend can only be started with a "
                 "'local' configuration")
 
         args = vcfg['args']
         if not args.get('db'):
             raise ValueError(
                 "Invalid configuration; missing 'db' config entry")
 
     return cfg
 
 
 def make_app_from_configfile():
     """Run the WSGI app from the webserver, loading the configuration from
        a configuration file.
 
        SWH_CONFIG_FILENAME environment variable defines the
        configuration path to load.
 
     """
     global api_cfg
     if not api_cfg:
         config_file = os.environ.get('SWH_CONFIG_FILENAME')
         api_cfg = load_and_check_config(config_file)
         app.config.update(api_cfg)
     handler = logging.StreamHandler()
     app.logger.addHandler(handler)
     return app
 
 
 if __name__ == '__main__':
     print('Deprecated. Use swh-indexer')
diff --git a/swh/indexer/storage/exc.py b/swh/indexer/storage/exc.py
new file mode 100644
index 0000000..841ee0c
--- /dev/null
+++ b/swh/indexer/storage/exc.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2020  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+class IndexerStorageAPIError(Exception):
+    """Generic error of the indexer storage."""
+    pass
+
+
+class IndexerStorageArgumentException(Exception):
+    """Argument passed to an IndexerStorage endpoint is invalid."""
+    pass
+
+
+class DuplicateId(IndexerStorageArgumentException):
+    """The same identifier is present more than once."""
+    pass
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
index 74a41a5..46e1516 100644
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -1,421 +1,421 @@
 # Copyright (C) 2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import bisect
 from collections import defaultdict, Counter
 import itertools
 import json
 import operator
 import math
 import re
+from typing import Any, Dict, List
 
-from . import MAPPING_NAMES
+from . import MAPPING_NAMES, check_id_duplicates
+from .exc import IndexerStorageArgumentException
 
 SHA1_DIGEST_SIZE = 160
 
 
 def _transform_tool(tool):
     return {
         'id': tool['id'],
         'name': tool['tool_name'],
         'version': tool['tool_version'],
         'configuration': tool['tool_configuration'],
     }
 
 
+def check_id_types(data: List[Dict[str, Any]]):
+    """Checks all elements of the list have an 'id' whose type is 'bytes'."""
+    if not all(isinstance(item.get('id'), bytes) for item in data):
+        raise IndexerStorageArgumentException('identifiers must be bytes.')
+
+
 class SubStorage:
     """Implements common missing/get/add logic for each indexer type."""
     def __init__(self, tools):
         self._tools = tools
         self._sorted_ids = []
         self._data = {}  # map (id_, tool_id) -> metadata_dict
         self._tools_per_id = defaultdict(set)  # map id_ -> Set[tool_id]
 
     def missing(self, ids):
         """List data missing from storage.
 
         Args:
             data (iterable): dictionaries with keys:
 
                 - **id** (bytes): sha1 identifier
                 - **indexer_configuration_id** (int): tool used to compute
                   the results
 
         Yields:
             missing sha1s
 
         """
         for id_ in ids:
             tool_id = id_['indexer_configuration_id']
             id_ = id_['id']
             if tool_id not in self._tools_per_id.get(id_, set()):
                 yield id_
 
     def get(self, ids):
         """Retrieve data per id.
 
         Args:
             ids (iterable): sha1 checksums
 
         Yields:
             dict: dictionaries with the following keys:
 
               - **id** (bytes)
               - **tool** (dict): tool used to compute metadata
               - arbitrary data (as provided to `add`)
 
         """
         for id_ in ids:
             for tool_id in self._tools_per_id.get(id_, set()):
                 key = (id_, tool_id)
                 yield {
                     'id': id_,
                     'tool': _transform_tool(self._tools[tool_id]),
                     **self._data[key],
                 }
 
     def get_all(self):
         yield from self.get(self._sorted_ids)
 
     def get_range(self, start, end, indexer_configuration_id, limit):
         """Retrieve data within range [start, end] bound by limit.
 
         Args:
             **start** (bytes): Starting identifier range (expected smaller
                            than end)
             **end** (bytes): Ending identifier range (expected larger
                              than start)
             **indexer_configuration_id** (int): The tool used to index data
             **limit** (int): Limit result
 
         Raises:
-            ValueError for limit to None
+            IndexerStorageArgumentException if limit is None
 
         Returns:
             a dict with keys:
             - **ids** [bytes]: iterable of content ids within the range.
             - **next** (Optional[bytes]): The next range of sha1 starts at
                                           this sha1 if any
 
         """
         if limit is None:
-            raise ValueError('Development error: limit should not be None')
+            raise IndexerStorageArgumentException('limit should not be None')
         from_index = bisect.bisect_left(self._sorted_ids, start)
         to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index)
         if to_index - from_index >= limit:
             return {
                 'ids': self._sorted_ids[from_index:from_index+limit],
                 'next': self._sorted_ids[from_index+limit],
             }
         else:
             return {
                 'ids': self._sorted_ids[from_index:to_index],
                 'next': None,
                 }
 
     def add(self, data, conflict_update):
         """Add data not present in storage.
 
         Args:
             data (iterable): dictionaries with keys:
 
               - **id**: sha1
               - **indexer_configuration_id**: tool used to compute the
                 results
               - arbitrary data
 
             conflict_update (bool): Flag to determine if we want to overwrite
               (true) or skip duplicates (false)
 
         """
         data = list(data)
-        if len({x['id'] for x in data}) < len(data):
-            # For "exception-compatibility" with the pgsql backend
-            raise ValueError('The same id is present more than once.')
+        check_id_duplicates(data)
         for item in data:
             item = item.copy()
             tool_id = item.pop('indexer_configuration_id')
             id_ = item.pop('id')
             data = item
             if not conflict_update and \
                     tool_id in self._tools_per_id.get(id_, set()):
                 # Duplicate, should not be updated
                 continue
             key = (id_, tool_id)
             self._data[key] = data
             self._tools_per_id[id_].add(tool_id)
             if id_ not in self._sorted_ids:
                 bisect.insort(self._sorted_ids, id_)
 
     def add_merge(self, new_data, conflict_update, merged_key):
         for new_item in new_data:
             id_ = new_item['id']
             tool_id = new_item['indexer_configuration_id']
             if conflict_update:
                 all_subitems = []
             else:
                 existing = list(self.get([id_]))
                 all_subitems = [
                     old_subitem
                     for existing_item in existing
                     if existing_item['tool']['id'] == tool_id
                     for old_subitem in existing_item[merged_key]
                 ]
             for new_subitem in new_item[merged_key]:
                 if new_subitem not in all_subitems:
                     all_subitems.append(new_subitem)
             self.add([
                 {
                     'id': id_,
                     'indexer_configuration_id': tool_id,
                     merged_key: all_subitems,
                 }
             ], conflict_update=True)
             if id_ not in self._sorted_ids:
                 bisect.insort(self._sorted_ids, id_)
 
     def delete(self, entries):
         for entry in entries:
             (id_, tool_id) = (entry['id'], entry['indexer_configuration_id'])
             key = (id_, tool_id)
             if tool_id in self._tools_per_id[id_]:
                 self._tools_per_id[id_].remove(tool_id)
             if key in self._data:
                 del self._data[key]
 
 
 class IndexerStorage:
     """In-memory SWH indexer storage."""
 
     def __init__(self):
         self._tools = {}
         self._mimetypes = SubStorage(self._tools)
         self._languages = SubStorage(self._tools)
         self._content_ctags = SubStorage(self._tools)
         self._licenses = SubStorage(self._tools)
         self._content_metadata = SubStorage(self._tools)
         self._revision_intrinsic_metadata = SubStorage(self._tools)
         self._origin_intrinsic_metadata = SubStorage(self._tools)
 
     def check_config(self, *, check_write):
         return True
 
     def content_mimetype_missing(self, mimetypes):
         yield from self._mimetypes.missing(mimetypes)
 
     def content_mimetype_get_range(
             self, start, end, indexer_configuration_id, limit=1000):
         return self._mimetypes.get_range(
             start, end, indexer_configuration_id, limit)
 
     def content_mimetype_add(self, mimetypes, conflict_update=False):
-        if not all(isinstance(x['id'], bytes) for x in mimetypes):
-            raise TypeError('identifiers must be bytes.')
+        check_id_types(mimetypes)
         self._mimetypes.add(mimetypes, conflict_update)
 
     def content_mimetype_get(self, ids):
         yield from self._mimetypes.get(ids)
 
     def content_language_missing(self, languages):
         yield from self._languages.missing(languages)
 
     def content_language_get(self, ids):
         yield from self._languages.get(ids)
 
     def content_language_add(self, languages, conflict_update=False):
-        if not all(isinstance(x['id'], bytes) for x in languages):
-            raise TypeError('identifiers must be bytes.')
+        check_id_types(languages)
         self._languages.add(languages, conflict_update)
 
     def content_ctags_missing(self, ctags):
         yield from self._content_ctags.missing(ctags)
 
     def content_ctags_get(self, ids):
         for item in self._content_ctags.get(ids):
             for item_ctags_item in item['ctags']:
                 yield {
                     'id': item['id'],
                     'tool': item['tool'],
                     **item_ctags_item
                 }
 
     def content_ctags_add(self, ctags, conflict_update=False):
-        if not all(isinstance(x['id'], bytes) for x in ctags):
-            raise TypeError('identifiers must be bytes.')
+        check_id_types(ctags)
         self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
 
     def content_ctags_search(self, expression,
                              limit=10, last_sha1=None):
         nb_matches = 0
         for ((id_, tool_id), item) in \
                 sorted(self._content_ctags._data.items()):
             if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))):
                 continue
             for ctags_item in item['ctags']:
                 if ctags_item['name'] != expression:
                     continue
                 nb_matches += 1
                 yield {
                     'id': id_,
                     'tool': _transform_tool(self._tools[tool_id]),
                     **ctags_item
                 }
                 if nb_matches >= limit:
                     return
 
     def content_fossology_license_get(self, ids):
         # Rewrites the output of SubStorage.get from the old format to
         # the new one. SubStorage.get should be updated once all other
         # *_get methods use the new format.
         # See: https://forge.softwareheritage.org/T1433
         res = {}
         for d in self._licenses.get(ids):
             res.setdefault(d.pop('id'), []).append(d)
         for (id_, facts) in res.items():
             yield {id_: facts}
 
     def content_fossology_license_add(self, licenses, conflict_update=False):
-        if not all(isinstance(x['id'], bytes) for x in licenses):
-            raise TypeError('identifiers must be bytes.')
+        check_id_types(licenses)
         self._licenses.add_merge(licenses, conflict_update, 'licenses')
 
     def content_fossology_license_get_range(
             self, start, end, indexer_configuration_id, limit=1000):
         return self._licenses.get_range(
             start, end, indexer_configuration_id, limit)
 
     def content_metadata_missing(self, metadata):
         yield from self._content_metadata.missing(metadata)
 
     def content_metadata_get(self, ids):
         yield from self._content_metadata.get(ids)
 
     def content_metadata_add(self, metadata, conflict_update=False):
-        if not all(isinstance(x['id'], bytes) for x in metadata):
-            raise TypeError('identifiers must be bytes.')
+        check_id_types(metadata)
         self._content_metadata.add(metadata, conflict_update)
 
     def revision_intrinsic_metadata_missing(self, metadata):
         yield from self._revision_intrinsic_metadata.missing(metadata)
 
     def revision_intrinsic_metadata_get(self, ids):
         yield from self._revision_intrinsic_metadata.get(ids)
 
     def revision_intrinsic_metadata_add(self, metadata, conflict_update=False):
-        if not all(isinstance(x['id'], bytes) for x in metadata):
-            raise TypeError('identifiers must be bytes.')
+        check_id_types(metadata)
         self._revision_intrinsic_metadata.add(metadata, conflict_update)
 
     def revision_intrinsic_metadata_delete(self, entries):
         self._revision_intrinsic_metadata.delete(entries)
 
     def origin_intrinsic_metadata_get(self, ids):
         yield from self._origin_intrinsic_metadata.get(ids)
 
     def origin_intrinsic_metadata_add(self, metadata,
                                       conflict_update=False):
         self._origin_intrinsic_metadata.add(metadata, conflict_update)
 
     def origin_intrinsic_metadata_delete(self, entries):
         self._origin_intrinsic_metadata.delete(entries)
 
     def origin_intrinsic_metadata_search_fulltext(
             self, conjunction, limit=100):
         # A very crude fulltext search implementation, but that's enough
         # to work on English metadata
         tokens_re = re.compile('[a-zA-Z0-9]+')
         search_tokens = list(itertools.chain(
             *map(tokens_re.findall, conjunction)))
 
         def rank(data):
             # Tokenize the metadata
             text = json.dumps(data['metadata'])
             text_tokens = tokens_re.findall(text)
             text_token_occurences = Counter(text_tokens)
 
             # Count the number of occurrences of search tokens in the text
             score = 0
             for search_token in search_tokens:
                 if text_token_occurences[search_token] == 0:
                     # Search token is not in the text.
                     return 0
                 score += text_token_occurences[search_token]
 
             # Normalize according to the text's length
             return score / math.log(len(text_tokens))
 
         results = [(rank(data), data)
                    for data in self._origin_intrinsic_metadata.get_all()]
         results = [(rank_, data) for (rank_, data) in results if rank_ > 0]
         results.sort(key=operator.itemgetter(0),  # Don't try to order 'data'
                      reverse=True)
         for (rank_, result) in results[:limit]:
             yield result
 
     def origin_intrinsic_metadata_search_by_producer(
             self, page_token='', limit=100, ids_only=False,
             mappings=None, tool_ids=None):
         assert isinstance(page_token, str)
         nb_results = 0
         if mappings is not None:
             mappings = frozenset(mappings)
         if tool_ids is not None:
             tool_ids = frozenset(tool_ids)
         origins = []
 
         # we go to limit+1 to check whether we should add next_page_token in
         # the response
         for entry in self._origin_intrinsic_metadata.get_all():
             if entry['id'] <= page_token:
                 continue
             if nb_results >= (limit + 1):
                 break
             if mappings is not None and mappings.isdisjoint(entry['mappings']):
                 continue
             if tool_ids is not None and entry['tool']['id'] not in tool_ids:
                 continue
             origins.append(entry)
             nb_results += 1
 
         result = {}
         if len(origins) > limit:
             origins = origins[:limit]
             result['next_page_token'] = origins[-1]['id']
         if ids_only:
             origins = [origin['id'] for origin in origins]
         result['origins'] = origins
         return result
 
     def origin_intrinsic_metadata_stats(self):
         mapping_count = {m: 0 for m in MAPPING_NAMES}
         total = non_empty = 0
         for data in self._origin_intrinsic_metadata.get_all():
             total += 1
             if set(data['metadata']) - {'@context'}:
                 non_empty += 1
             for mapping in data['mappings']:
                 mapping_count[mapping] += 1
         return {
             'per_mapping': mapping_count,
             'total': total,
             'non_empty': non_empty
         }
 
     def indexer_configuration_add(self, tools):
         inserted = []
         for tool in tools:
             tool = tool.copy()
             id_ = self._tool_key(tool)
             tool['id'] = id_
             self._tools[id_] = tool
             inserted.append(tool)
         return inserted
 
     def indexer_configuration_get(self, tool):
         return self._tools.get(self._tool_key(tool))
 
     def _tool_key(self, tool):
         return hash((tool['tool_name'], tool['tool_version'],
                      json.dumps(tool['tool_configuration'], sort_keys=True)))
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
index 3218dc2..8f4c2fb 100644
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -1,1888 +1,1889 @@
 # Copyright (C) 2015-2019  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import inspect
 import threading
 
 import pytest
 
 from swh.model.hashutil import hash_to_bytes
 
+from swh.indexer.storage.exc import (
+    IndexerStorageArgumentException, DuplicateId,
+)
 from swh.indexer.storage.interface import IndexerStorageInterface
 
 
 def prepare_mimetypes_from(fossology_licenses):
     """Fossology license needs some consistent data in db to run.
 
     """
     mimetypes = []
     for c in fossology_licenses:
         mimetypes.append({
             'id': c['id'],
             'mimetype': 'text/plain',
             'encoding': 'utf-8',
             'indexer_configuration_id': c['indexer_configuration_id'],
         })
     return mimetypes
 
 
 def endpoint(storage, endpoint_type, endpoint_name):
     return getattr(storage, endpoint_type + '_' + endpoint_name)
 
 
 class StorageETypeTester:
     """Base class for testing a series of common behaviour between a bunch of
     endpoint types supported by an IndexerStorage.
 
     This is supposed to be inherited with the following class attributes:
     - endpoint_type
     - tool_name
     - example_data
 
     See below for example usage.
     """
 
     def test_missing(self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool_id = data.tools[self.tool_name]['id']
 
         # given 2 (hopefully) unknown objects
         query = [
             {
                 'id': data.sha1_1,
                 'indexer_configuration_id': tool_id,
             },
             {
                 'id': data.sha1_2,
                 'indexer_configuration_id': tool_id,
             }]
 
         # we expect these are both returned by the xxx_missing endpoint
         actual_missing = endpoint(storage, etype, 'missing')(query)
         assert list(actual_missing) == [
             data.sha1_1,
             data.sha1_2,
         ]
 
         # now, when we add one of them
         endpoint(storage, etype, 'add')([{
             'id': data.sha1_2,
             **self.example_data[0],
             'indexer_configuration_id': tool_id,
         }])
 
         # we expect only the other one returned
         actual_missing = endpoint(storage, etype, 'missing')(query)
         assert list(actual_missing) == [data.sha1_1]
 
     def test_add__drop_duplicate(self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool_id = data.tools[self.tool_name]['id']
 
         # add the first object
         data_v1 = {
             'id': data.sha1_2,
             **self.example_data[0],
             'indexer_configuration_id': tool_id,
         }
         endpoint(storage, etype, 'add')([data_v1])
 
         # should be able to retrieve it
         actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2]))
         expected_data_v1 = [{
             'id': data.sha1_2,
             **self.example_data[0],
             'tool': data.tools[self.tool_name],
         }]
         assert actual_data == expected_data_v1
 
         # now if we add a modified version of the same object (same id)
         data_v2 = data_v1.copy()
         data_v2.update(self.example_data[1])
         endpoint(storage, etype, 'add')([data_v2])
 
         # we expect to retrieve the original data, not the modified one
         actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2]))
         assert actual_data == expected_data_v1
 
     def test_add__update_in_place_duplicate(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool = data.tools[self.tool_name]
 
         data_v1 = {
             'id': data.sha1_2,
             **self.example_data[0],
             'indexer_configuration_id': tool['id'],
         }
 
         # given
         endpoint(storage, etype, 'add')([data_v1])
 
         # when
         actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2]))
 
         expected_data_v1 = [{
             'id': data.sha1_2,
             **self.example_data[0],
             'tool': tool,
         }]
 
         # then
         assert actual_data == expected_data_v1
 
         # given
         data_v2 = data_v1.copy()
         data_v2.update(self.example_data[1])
 
         endpoint(storage, etype, 'add')([data_v2], conflict_update=True)
 
         actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2]))
 
         expected_data_v2 = [{
             'id': data.sha1_2,
             **self.example_data[1],
             'tool': tool,
         }]
 
         # data did change as the v2 was used to overwrite v1
         assert actual_data == expected_data_v2
 
     def test_add__update_in_place_deadlock(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool = data.tools[self.tool_name]
 
         hashes = [
             hash_to_bytes(
                 '34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}'.format(i))
             for i in range(1000)]
 
         data_v1 = [
             {
                 'id': hash_,
                 **self.example_data[0],
                 'indexer_configuration_id': tool['id'],
             }
             for hash_ in hashes
         ]
         data_v2 = [
             {
                 'id': hash_,
                 **self.example_data[1],
                 'indexer_configuration_id': tool['id'],
             }
             for hash_ in hashes
         ]
 
         # Remove one item from each, so that both queries have to succeed for
         # all items to be in the DB.
         data_v2a = data_v2[1:]
         data_v2b = list(reversed(data_v2[0:-1]))
 
         # given
         endpoint(storage, etype, 'add')(data_v1)
 
         # when
         actual_data = list(endpoint(storage, etype, 'get')(hashes))
 
         expected_data_v1 = [
             {
                 'id': hash_,
                 **self.example_data[0],
                 'tool': tool,
             }
             for hash_ in hashes
         ]
 
         # then
         assert actual_data == expected_data_v1
 
         # given
         def f1():
             endpoint(storage, etype, 'add')(data_v2a, conflict_update=True)
 
         def f2():
             endpoint(storage, etype, 'add')(data_v2b, conflict_update=True)
 
         t1 = threading.Thread(target=f1)
         t2 = threading.Thread(target=f2)
         t2.start()
         t1.start()
 
         t1.join()
         t2.join()
 
         actual_data = sorted(endpoint(storage, etype, 'get')(hashes),
                              key=lambda x: x['id'])
 
         expected_data_v2 = [
             {
                 'id': hash_,
                 **self.example_data[1],
                 'tool': tool,
             }
             for hash_ in hashes
         ]
 
         assert actual_data == expected_data_v2
 
     def test_add__duplicate_twice(self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool = data.tools[self.tool_name]
 
         data_rev1 = {
             'id': data.revision_id_2,
             **self.example_data[0],
             'indexer_configuration_id': tool['id']
         }
 
         data_rev2 = {
             'id': data.revision_id_2,
             **self.example_data[1],
             'indexer_configuration_id': tool['id']
         }
 
         # when
         endpoint(storage, etype, 'add')([data_rev1])
 
-        with pytest.raises(ValueError):
+        with pytest.raises(DuplicateId):
             endpoint(storage, etype, 'add')(
                 [data_rev2, data_rev2],
                 conflict_update=True)
 
         # then
         actual_data = list(endpoint(storage, etype, 'get')(
             [data.revision_id_2, data.revision_id_1]))
 
         expected_data = [{
             'id': data.revision_id_2,
             **self.example_data[0],
             'tool': tool,
         }]
         assert actual_data == expected_data
 
     def test_get(self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool = data.tools[self.tool_name]
 
         query = [data.sha1_2, data.sha1_1]
         data1 = {
             'id': data.sha1_2,
             **self.example_data[0],
             'indexer_configuration_id': tool['id'],
         }
 
         # when
         endpoint(storage, etype, 'add')([data1])
 
         # then
         actual_data = list(endpoint(storage, etype, 'get')(query))
 
         # then
         expected_data = [{
             'id': data.sha1_2,
             **self.example_data[0],
             'tool': tool,
         }]
 
         assert actual_data == expected_data
 
 
 class TestIndexerStorageContentMimetypes(StorageETypeTester):
     """Test Indexer Storage content_mimetype related methods
     """
     endpoint_type = 'content_mimetype'
     tool_name = 'file'
     example_data = [
         {
             'mimetype': 'text/plain',
             'encoding': 'utf-8',
         },
         {
             'mimetype': 'text/html',
             'encoding': 'us-ascii',
         },
         ]
 
     def test_generate_content_mimetype_get_range_limit_none(
             self, swh_indexer_storage):
         """mimetype_get_range call with wrong limit input should fail"""
         storage = swh_indexer_storage
-        with pytest.raises(ValueError) as e:
+        with pytest.raises(IndexerStorageArgumentException) as e:
             storage.content_mimetype_get_range(
                 start=None, end=None, indexer_configuration_id=None,
                 limit=None)
 
-        assert e.value.args == (
-            'Development error: limit should not be None',)
+        assert e.value.args == ('limit should not be None',)
 
     def test_generate_content_mimetype_get_range_no_limit(
             self, swh_indexer_storage_with_data):
         """mimetype_get_range returns mimetypes within range provided"""
         storage, data = swh_indexer_storage_with_data
         mimetypes = data.mimetypes
 
         # All ids from the db
         content_ids = sorted([c['id'] for c in mimetypes])
 
         start = content_ids[0]
         end = content_ids[-1]
 
         # retrieve mimetypes
         tool_id = mimetypes[0]['indexer_configuration_id']
         actual_result = storage.content_mimetype_get_range(
             start, end, indexer_configuration_id=tool_id)
 
         actual_ids = actual_result['ids']
         actual_next = actual_result['next']
 
         assert len(mimetypes) == len(actual_ids)
         assert actual_next is None
         assert content_ids == actual_ids
 
     def test_generate_content_mimetype_get_range_limit(
             self, swh_indexer_storage_with_data):
         """mimetype_get_range paginates results if limit exceeded"""
         storage, data = swh_indexer_storage_with_data
 
         indexer_configuration_id = data.tools['file']['id']
 
         # input the list of sha1s we want from storage
         content_ids = sorted(
             [c['id'] for c in data.mimetypes])
         mimetypes = list(storage.content_mimetype_get(content_ids))
         assert len(mimetypes) == len(data.mimetypes)
 
         start = content_ids[0]
         end = content_ids[-1]
         # retrieve mimetypes limited to 10 results
         actual_result = storage.content_mimetype_get_range(
             start, end,
             indexer_configuration_id=indexer_configuration_id,
             limit=10)
 
         assert actual_result
         assert set(actual_result.keys()) == {'ids', 'next'}
         actual_ids = actual_result['ids']
         actual_next = actual_result['next']
 
         assert len(actual_ids) == 10
         assert actual_next is not None
         assert actual_next == content_ids[10]
 
         expected_mimetypes = content_ids[:10]
         assert expected_mimetypes == actual_ids
 
         # retrieve next part
         actual_result = storage.content_mimetype_get_range(
             start=end, end=end,
             indexer_configuration_id=indexer_configuration_id)
         assert set(actual_result.keys()) == {'ids', 'next'}
         actual_ids = actual_result['ids']
         actual_next = actual_result['next']
 
         assert actual_next is None
         expected_mimetypes = [content_ids[-1]]
         assert expected_mimetypes == actual_ids
 
 
 class TestIndexerStorageContentLanguage(StorageETypeTester):
     """Test Indexer Storage content_language related methods
     """
     endpoint_type = 'content_language'
     tool_name = 'pygments'
     example_data = [
         {
             'lang': 'haskell',
         },
         {
             'lang': 'common-lisp',
         },
         ]
 
 
 class TestIndexerStorageContentCTags(StorageETypeTester):
     """Test Indexer Storage content_ctags related methods
     """
     endpoint_type = 'content_ctags'
     tool_name = 'universal-ctags'
     example_data = [
         {
             'ctags': [{
                 'name': 'done',
                 'kind': 'variable',
                 'line': 119,
                 'lang': 'OCaml',
             }]
         },
         {
             'ctags': [
                 {
                     'name': 'done',
                     'kind': 'variable',
                     'line': 100,
                     'lang': 'Python',
                 },
                 {
                     'name': 'main',
                     'kind': 'function',
                     'line': 119,
                     'lang': 'Python',
                 }]
         },
         ]
 
     # the following tests are disabled because CTAGS behaves differently
     @pytest.mark.skip
     def test_add__drop_duplicate(self):
         pass
 
     @pytest.mark.skip
     def test_add__update_in_place_duplicate(self):
         pass
 
     @pytest.mark.skip
     def test_add__update_in_place_deadlock(self):
         pass
 
     @pytest.mark.skip
     def test_add__duplicate_twice(self):
         pass
 
     @pytest.mark.skip
     def test_get(self):
         pass
 
     def test_content_ctags_search(self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # 1. given
         tool = data.tools['universal-ctags']
         tool_id = tool['id']
 
         ctag1 = {
             'id': data.sha1_1,
             'indexer_configuration_id': tool_id,
             'ctags': [
                 {
                     'name': 'hello',
                     'kind': 'function',
                     'line': 133,
                     'lang': 'Python',
                 },
                 {
                     'name': 'counter',
                     'kind': 'variable',
                     'line': 119,
                     'lang': 'Python',
                 },
                 {
                     'name': 'hello',
                     'kind': 'variable',
                     'line': 210,
                     'lang': 'Python',
                 },
             ]
         }
 
         ctag2 = {
             'id': data.sha1_2,
             'indexer_configuration_id': tool_id,
             'ctags': [
                 {
                     'name': 'hello',
                     'kind': 'variable',
                     'line': 100,
                     'lang': 'C',
                 },
                 {
                     'name': 'result',
                     'kind': 'variable',
                     'line': 120,
                     'lang': 'C',
                 },
             ]
         }
 
         storage.content_ctags_add([ctag1, ctag2])
 
         # 1. when
         actual_ctags = list(storage.content_ctags_search('hello', limit=1))
 
         # 1. then
         assert actual_ctags == [
             {
                 'id': ctag1['id'],
                 'tool': tool,
                 'name': 'hello',
                 'kind': 'function',
                 'line': 133,
                 'lang': 'Python',
             }
         ]
 
         # 2. when
         actual_ctags = list(storage.content_ctags_search(
             'hello',
             limit=1,
             last_sha1=ctag1['id']))
 
         # 2. then
         assert actual_ctags == [
             {
                 'id': ctag2['id'],
                 'tool': tool,
                 'name': 'hello',
                 'kind': 'variable',
                 'line': 100,
                 'lang': 'C',
             }
         ]
 
         # 3. when
         actual_ctags = list(storage.content_ctags_search('hello'))
 
         # 3. then
         assert actual_ctags == [
             {
                 'id': ctag1['id'],
                 'tool': tool,
                 'name': 'hello',
                 'kind': 'function',
                 'line': 133,
                 'lang': 'Python',
             },
             {
                 'id': ctag1['id'],
                 'tool': tool,
                 'name': 'hello',
                 'kind': 'variable',
                 'line': 210,
                 'lang': 'Python',
             },
             {
                 'id': ctag2['id'],
                 'tool': tool,
                 'name': 'hello',
                 'kind': 'variable',
                 'line': 100,
                 'lang': 'C',
             },
         ]
 
         # 4. when
         actual_ctags = list(storage.content_ctags_search('counter'))
 
         # then
         assert actual_ctags == [{
             'id': ctag1['id'],
             'tool': tool,
             'name': 'counter',
             'kind': 'variable',
             'line': 119,
             'lang': 'Python',
         }]
 
         # 5. when
         actual_ctags = list(storage.content_ctags_search('result', limit=1))
 
         # then
         assert actual_ctags == [{
             'id': ctag2['id'],
             'tool': tool,
             'name': 'result',
             'kind': 'variable',
             'line': 120,
             'lang': 'C',
         }]
 
     def test_content_ctags_search_no_result(self, swh_indexer_storage):
         storage = swh_indexer_storage
         actual_ctags = list(storage.content_ctags_search('counter'))
 
         assert not actual_ctags
 
     def test_content_ctags_add__add_new_ctags_added(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
 
         # given
         tool = data.tools['universal-ctags']
         tool_id = tool['id']
 
         ctag_v1 = {
             'id': data.sha1_2,
             'indexer_configuration_id': tool_id,
             'ctags': [{
                 'name': 'done',
                 'kind': 'variable',
                 'line': 100,
                 'lang': 'Scheme',
             }]
         }
 
         # given
         storage.content_ctags_add([ctag_v1])
         storage.content_ctags_add([ctag_v1])  # conflict does nothing
 
         # when
         actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
 
         # then
         expected_ctags = [{
             'id': data.sha1_2,
             'name': 'done',
             'kind': 'variable',
             'line': 100,
             'lang': 'Scheme',
             'tool': tool,
         }]
 
         assert actual_ctags == expected_ctags
 
         # given
         ctag_v2 = ctag_v1.copy()
         ctag_v2.update({
             'ctags': [
                 {
                     'name': 'defn',
                     'kind': 'function',
                     'line': 120,
                     'lang': 'Scheme',
                 }
             ]
         })
 
         storage.content_ctags_add([ctag_v2])
 
         expected_ctags = [
             {
                 'id': data.sha1_2,
                 'name': 'done',
                 'kind': 'variable',
                 'line': 100,
                 'lang': 'Scheme',
                 'tool': tool,
             }, {
                 'id': data.sha1_2,
                 'name': 'defn',
                 'kind': 'function',
                 'line': 120,
                 'lang': 'Scheme',
                 'tool': tool,
             }
         ]
 
         actual_ctags = list(storage.content_ctags_get(
             [data.sha1_2]))
 
         assert actual_ctags == expected_ctags
 
     def test_content_ctags_add__update_in_place(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool = data.tools['universal-ctags']
         tool_id = tool['id']
 
         ctag_v1 = {
             'id': data.sha1_2,
             'indexer_configuration_id': tool_id,
             'ctags': [{
                 'name': 'done',
                 'kind': 'variable',
                 'line': 100,
                 'lang': 'Scheme',
             }]
         }
 
         # given
         storage.content_ctags_add([ctag_v1])
 
         # when
         actual_ctags = list(storage.content_ctags_get(
             [data.sha1_2]))
 
         # then
         expected_ctags = [
             {
                 'id': data.sha1_2,
                 'name': 'done',
                 'kind': 'variable',
                 'line': 100,
                 'lang': 'Scheme',
                 'tool': tool
             }
         ]
         assert actual_ctags == expected_ctags
 
         # given
         ctag_v2 = ctag_v1.copy()
         ctag_v2.update({
             'ctags': [
                 {
                     'name': 'done',
                     'kind': 'variable',
                     'line': 100,
                     'lang': 'Scheme',
                 },
                 {
                     'name': 'defn',
                     'kind': 'function',
                     'line': 120,
                     'lang': 'Scheme',
                 }
             ]
         })
 
         storage.content_ctags_add([ctag_v2], conflict_update=True)
 
         actual_ctags = list(storage.content_ctags_get(
             [data.sha1_2]))
 
         # ctag did change as the v2 was used to overwrite v1
         expected_ctags = [
             {
                 'id': data.sha1_2,
                 'name': 'done',
                 'kind': 'variable',
                 'line': 100,
                 'lang': 'Scheme',
                 'tool': tool,
             },
             {
                 'id': data.sha1_2,
                 'name': 'defn',
                 'kind': 'function',
                 'line': 120,
                 'lang': 'Scheme',
                 'tool': tool,
             }
         ]
         assert actual_ctags == expected_ctags
 
 
 class TestIndexerStorageContentMetadata(StorageETypeTester):
     """Test Indexer Storage content_metadata related methods
     """
     tool_name = 'swh-metadata-detector'
     endpoint_type = 'content_metadata'
     example_data = [
         {
             'metadata': {
                 'other': {},
                 'codeRepository': {
                     'type': 'git',
                     'url': 'https://github.com/moranegg/metadata_test'
                 },
                 'description': 'Simple package.json test for indexer',
                 'name': 'test_metadata',
                 'version': '0.0.1'
             },
         },
         {
             'metadata': {
                 'other': {},
                 'name': 'test_metadata',
                 'version': '0.0.1'
             },
         },
         ]
 
 
 class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester):
     """Test Indexer Storage revision_intrinsic_metadata related methods
     """
     tool_name = 'swh-metadata-detector'
     endpoint_type = 'revision_intrinsic_metadata'
     example_data = [
         {
             'metadata': {
                 'other': {},
                 'codeRepository': {
                     'type': 'git',
                     'url': 'https://github.com/moranegg/metadata_test'
                 },
                 'description': 'Simple package.json test for indexer',
                 'name': 'test_metadata',
                 'version': '0.0.1'
             },
             'mappings': ['mapping1'],
         },
         {
             'metadata': {
                 'other': {},
                 'name': 'test_metadata',
                 'version': '0.0.1'
             },
             'mappings': ['mapping2'],
         },
         ]
 
     def test_revision_intrinsic_metadata_delete(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool = data.tools[self.tool_name]
 
         query = [data.sha1_2, data.sha1_1]
         data1 = {
             'id': data.sha1_2,
             **self.example_data[0],
             'indexer_configuration_id': tool['id'],
         }
 
         # when
         endpoint(storage, etype, 'add')([data1])
         endpoint(storage, etype, 'delete')([
             {
                 'id': data.sha1_2,
                 'indexer_configuration_id': tool['id'],
             }
         ])
 
         # then
         actual_data = list(endpoint(storage, etype, 'get')(query))
 
         # then
         assert not actual_data
 
     def test_revision_intrinsic_metadata_delete_nonexisting(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool = data.tools[self.tool_name]
         endpoint(storage, etype, 'delete')([
             {
                 'id': data.sha1_2,
                 'indexer_configuration_id': tool['id'],
             }
         ])
 
 
 class TestIndexerStorageContentFossologyLicence:
     def test_content_fossology_license_add__new_license_added(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool = data.tools['nomos']
         tool_id = tool['id']
 
         license_v1 = {
             'id': data.sha1_1,
             'licenses': ['Apache-2.0'],
             'indexer_configuration_id': tool_id,
         }
 
         # given
         storage.content_fossology_license_add([license_v1])
         # conflict does nothing
         storage.content_fossology_license_add([license_v1])
 
         # when
         actual_licenses = list(storage.content_fossology_license_get(
             [data.sha1_1]))
 
         # then
         expected_license = {
             data.sha1_1: [{
                 'licenses': ['Apache-2.0'],
                 'tool': tool,
             }]
         }
         assert actual_licenses == [expected_license]
 
         # given
         license_v2 = license_v1.copy()
         license_v2.update({
             'licenses': ['BSD-2-Clause'],
         })
 
         storage.content_fossology_license_add([license_v2])
 
         actual_licenses = list(storage.content_fossology_license_get(
             [data.sha1_1]))
 
         expected_license = {
             data.sha1_1: [{
                 'licenses': ['Apache-2.0', 'BSD-2-Clause'],
                 'tool': tool
             }]
         }
 
-        # license did not change as the v2 was dropped.
+        # v2 was not dropped: its license was added alongside v1's.
         assert actual_licenses == [expected_license]
 
     def test_generate_content_fossology_license_get_range_limit_none(
             self, swh_indexer_storage_with_data):
-        storage, data = swh_indexer_storage_with_data
         """license_get_range call with wrong limit input should fail"""
-        with pytest.raises(ValueError) as e:
+        storage, data = swh_indexer_storage_with_data
+        with pytest.raises(IndexerStorageArgumentException) as e:
             storage.content_fossology_license_get_range(
                 start=None, end=None, indexer_configuration_id=None,
                 limit=None)
 
-        assert e.value.args == (
-            'Development error: limit should not be None',)
+        assert e.value.args == ('limit should not be None',)
 
     def test_generate_content_fossology_license_get_range_no_limit(
             self, swh_indexer_storage_with_data):
         """license_get_range returns licenses within range provided"""
         storage, data = swh_indexer_storage_with_data
         # craft some consistent mimetypes
         fossology_licenses = data.fossology_licenses
         mimetypes = prepare_mimetypes_from(fossology_licenses)
 
         storage.content_mimetype_add(mimetypes, conflict_update=True)
         # add fossology_licenses to storage
         storage.content_fossology_license_add(fossology_licenses)
 
         # All ids from the db
         content_ids = sorted([c['id'] for c in fossology_licenses])
 
         start = content_ids[0]
         end = content_ids[-1]
 
         # retrieve fossology_licenses
         tool_id = fossology_licenses[0]['indexer_configuration_id']
         actual_result = storage.content_fossology_license_get_range(
             start, end, indexer_configuration_id=tool_id)
 
         actual_ids = actual_result['ids']
         actual_next = actual_result['next']
 
         assert len(fossology_licenses) == len(actual_ids)
         assert actual_next is None
         assert content_ids == actual_ids
 
     def test_generate_content_fossology_license_get_range_no_limit_with_filter(
             self, swh_indexer_storage_with_data):
         """This filters non textual, then returns results within range"""
         storage, data = swh_indexer_storage_with_data
         fossology_licenses = data.fossology_licenses
         mimetypes = data.mimetypes
 
         # craft some consistent mimetypes
         _mimetypes = prepare_mimetypes_from(fossology_licenses)
         # add binary mimetypes which will get filtered out in results
         for m in mimetypes:
             _mimetypes.append({
                 'mimetype': 'binary',
                 **m,
             })
 
         storage.content_mimetype_add(_mimetypes, conflict_update=True)
         # add fossology_licenses to storage
         storage.content_fossology_license_add(fossology_licenses)
 
         # All ids from the db
         content_ids = sorted([c['id'] for c in fossology_licenses])
 
         start = content_ids[0]
         end = content_ids[-1]
 
         # retrieve fossology_licenses
         tool_id = fossology_licenses[0]['indexer_configuration_id']
         actual_result = storage.content_fossology_license_get_range(
             start, end, indexer_configuration_id=tool_id)
 
         actual_ids = actual_result['ids']
         actual_next = actual_result['next']
 
         assert len(fossology_licenses) == len(actual_ids)
         assert actual_next is None
         assert content_ids == actual_ids
 
     def test_generate_fossology_license_get_range_limit(
             self, swh_indexer_storage_with_data):
         """fossology_license_get_range paginates results if limit exceeded"""
         storage, data = swh_indexer_storage_with_data
         fossology_licenses = data.fossology_licenses
 
         # craft some consistent mimetypes
         mimetypes = prepare_mimetypes_from(fossology_licenses)
 
         # add fossology_licenses to storage
         storage.content_mimetype_add(mimetypes, conflict_update=True)
         storage.content_fossology_license_add(fossology_licenses)
 
         # input the list of sha1s we want from storage
         content_ids = sorted([c['id'] for c in fossology_licenses])
         start = content_ids[0]
         end = content_ids[-1]
 
-        # retrieve fossology_licenses limited to 3 results
+        # retrieve fossology_licenses, limited to all but one of them
         limited_results = len(fossology_licenses) - 1
         tool_id = fossology_licenses[0]['indexer_configuration_id']
         actual_result = storage.content_fossology_license_get_range(
             start, end,
             indexer_configuration_id=tool_id, limit=limited_results)
 
         actual_ids = actual_result['ids']
         actual_next = actual_result['next']
 
         assert limited_results == len(actual_ids)
         assert actual_next is not None
         assert actual_next == content_ids[-1]
 
         expected_fossology_licenses = content_ids[:-1]
         assert expected_fossology_licenses == actual_ids
 
         # retrieve next part
         actual_results2 = storage.content_fossology_license_get_range(
             start=end, end=end, indexer_configuration_id=tool_id)
         actual_ids2 = actual_results2['ids']
         actual_next2 = actual_results2['next']
 
         assert actual_next2 is None
         expected_fossology_licenses2 = [content_ids[-1]]
         assert expected_fossology_licenses2 == actual_ids2
 
 
 class TestIndexerStorageOriginIntrinsicMetadata:
     def test_origin_intrinsic_metadata_get(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools['swh-metadata-detector']['id']
 
         metadata = {
             'version': None,
             'name': None,
         }
         metadata_rev = {
             'id': data.revision_id_2,
             'metadata': metadata,
             'mappings': ['mapping1'],
             'indexer_configuration_id': tool_id,
         }
         metadata_origin = {
             'id': data.origin_url_1,
             'metadata': metadata,
             'indexer_configuration_id': tool_id,
             'mappings': ['mapping1'],
             'from_revision': data.revision_id_2,
             }
 
         # when
         storage.revision_intrinsic_metadata_add([metadata_rev])
         storage.origin_intrinsic_metadata_add([metadata_origin])
 
         # then
         actual_metadata = list(storage.origin_intrinsic_metadata_get(
             [data.origin_url_1, 'no://where']))
 
         expected_metadata = [{
             'id': data.origin_url_1,
             'metadata': metadata,
             'tool': data.tools['swh-metadata-detector'],
             'from_revision': data.revision_id_2,
             'mappings': ['mapping1'],
         }]
 
         assert actual_metadata == expected_metadata
 
     def test_origin_intrinsic_metadata_delete(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools['swh-metadata-detector']['id']
 
         metadata = {
             'version': None,
             'name': None,
         }
         metadata_rev = {
             'id': data.revision_id_2,
             'metadata': metadata,
             'mappings': ['mapping1'],
             'indexer_configuration_id': tool_id,
         }
         metadata_origin = {
             'id': data.origin_url_1,
             'metadata': metadata,
             'indexer_configuration_id': tool_id,
             'mappings': ['mapping1'],
             'from_revision': data.revision_id_2,
             }
         metadata_origin2 = metadata_origin.copy()
         metadata_origin2['id'] = data.origin_url_2
 
         # when
         storage.revision_intrinsic_metadata_add([metadata_rev])
         storage.origin_intrinsic_metadata_add([
             metadata_origin, metadata_origin2])
 
         storage.origin_intrinsic_metadata_delete([
             {
                 'id': data.origin_url_1,
                 'indexer_configuration_id': tool_id
             }
         ])
 
         # then
         actual_metadata = list(storage.origin_intrinsic_metadata_get(
             [data.origin_url_1, data.origin_url_2, 'no://where']))
         for item in actual_metadata:
             item['indexer_configuration_id'] = item.pop('tool')['id']
         assert actual_metadata == [metadata_origin2]
 
     def test_origin_intrinsic_metadata_delete_nonexisting(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         tool_id = data.tools['swh-metadata-detector']['id']
         storage.origin_intrinsic_metadata_delete([
             {
                 'id': data.origin_url_1,
                 'indexer_configuration_id': tool_id
             }
         ])
 
     def test_origin_intrinsic_metadata_add_drop_duplicate(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools['swh-metadata-detector']['id']
 
         metadata_v1 = {
             'version': None,
             'name': None,
         }
         metadata_rev_v1 = {
             'id': data.revision_id_1,
             'metadata': metadata_v1.copy(),
             'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata_origin_v1 = {
             'id': data.origin_url_1,
             'metadata': metadata_v1.copy(),
             'indexer_configuration_id': tool_id,
             'mappings': [],
             'from_revision': data.revision_id_1,
         }
 
         # given
         storage.revision_intrinsic_metadata_add([metadata_rev_v1])
         storage.origin_intrinsic_metadata_add([metadata_origin_v1])
 
         # when
         actual_metadata = list(storage.origin_intrinsic_metadata_get(
             [data.origin_url_1, 'no://where']))
 
         expected_metadata_v1 = [{
             'id': data.origin_url_1,
             'metadata': metadata_v1,
             'tool': data.tools['swh-metadata-detector'],
             'from_revision': data.revision_id_1,
             'mappings': [],
         }]
 
         assert actual_metadata == expected_metadata_v1
 
         # given
         metadata_v2 = metadata_v1.copy()
         metadata_v2.update({
             'name': 'test_metadata',
             'author': 'MG',
         })
         metadata_rev_v2 = metadata_rev_v1.copy()
         metadata_origin_v2 = metadata_origin_v1.copy()
         metadata_rev_v2['metadata'] = metadata_v2
         metadata_origin_v2['metadata'] = metadata_v2
 
         storage.revision_intrinsic_metadata_add([metadata_rev_v2])
         storage.origin_intrinsic_metadata_add([metadata_origin_v2])
 
         # then
         actual_metadata = list(storage.origin_intrinsic_metadata_get(
             [data.origin_url_1]))
 
         # metadata did not change as the v2 was dropped.
         assert actual_metadata == expected_metadata_v1
 
     def test_origin_intrinsic_metadata_add_update_in_place_duplicate(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools['swh-metadata-detector']['id']
 
         metadata_v1 = {
             'version': None,
             'name': None,
         }
         metadata_rev_v1 = {
             'id': data.revision_id_2,
             'metadata': metadata_v1,
             'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata_origin_v1 = {
             'id': data.origin_url_1,
             'metadata': metadata_v1.copy(),
             'indexer_configuration_id': tool_id,
             'mappings': [],
             'from_revision': data.revision_id_2,
         }
 
         # given
         storage.revision_intrinsic_metadata_add([metadata_rev_v1])
         storage.origin_intrinsic_metadata_add([metadata_origin_v1])
 
         # when
         actual_metadata = list(storage.origin_intrinsic_metadata_get(
             [data.origin_url_1]))
 
         # then
         expected_metadata_v1 = [{
             'id': data.origin_url_1,
             'metadata': metadata_v1,
             'tool': data.tools['swh-metadata-detector'],
             'from_revision': data.revision_id_2,
             'mappings': [],
         }]
         assert actual_metadata == expected_metadata_v1
 
         # given
         metadata_v2 = metadata_v1.copy()
         metadata_v2.update({
             'name': 'test_update_duplicated_metadata',
             'author': 'MG',
         })
         metadata_rev_v2 = metadata_rev_v1.copy()
         metadata_origin_v2 = metadata_origin_v1.copy()
         metadata_rev_v2['metadata'] = metadata_v2
         metadata_origin_v2 = {
             'id': data.origin_url_1,
             'metadata': metadata_v2.copy(),
             'indexer_configuration_id': tool_id,
             'mappings': ['npm'],
             'from_revision': data.revision_id_1,
         }
 
         storage.revision_intrinsic_metadata_add(
                 [metadata_rev_v2], conflict_update=True)
         storage.origin_intrinsic_metadata_add(
                 [metadata_origin_v2], conflict_update=True)
 
         actual_metadata = list(storage.origin_intrinsic_metadata_get(
             [data.origin_url_1]))
 
         expected_metadata_v2 = [{
             'id': data.origin_url_1,
             'metadata': metadata_v2,
             'tool': data.tools['swh-metadata-detector'],
             'from_revision': data.revision_id_1,
             'mappings': ['npm'],
         }]
 
         # metadata did change as the v2 was used to overwrite v1
         assert actual_metadata == expected_metadata_v2
 
     def test_origin_intrinsic_metadata_add__update_in_place_deadlock(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools['swh-metadata-detector']['id']
 
         ids = list(range(10))
 
         example_data1 = {
             'metadata': {
                 'version': None,
                 'name': None,
             },
             'mappings': [],
         }
         example_data2 = {
             'metadata': {
                 'version': 'v1.1.1',
                 'name': 'foo',
             },
             'mappings': [],
         }
 
         metadata_rev_v1 = {
             'id': data.revision_id_2,
             'metadata': {
                 'version': None,
                 'name': None,
             },
             'mappings': [],
             'indexer_configuration_id': tool_id,
         }
 
         data_v1 = [
             {
                 'id': 'file:///tmp/origin%d' % id_,
                 'from_revision': data.revision_id_2,
                 **example_data1,
                 'indexer_configuration_id': tool_id,
             }
             for id_ in ids
         ]
         data_v2 = [
             {
                 'id': 'file:///tmp/origin%d' % id_,
                 'from_revision': data.revision_id_2,
                 **example_data2,
                 'indexer_configuration_id': tool_id,
             }
             for id_ in ids
         ]
 
         # Remove one item from each, so that both queries have to succeed for
         # all items to be in the DB.
         data_v2a = data_v2[1:]
         data_v2b = list(reversed(data_v2[0:-1]))
 
         # given
         storage.revision_intrinsic_metadata_add([metadata_rev_v1])
         storage.origin_intrinsic_metadata_add(data_v1)
 
         # when
         origins = ['file:///tmp/origin%d' % i for i in ids]
         actual_data = list(storage.origin_intrinsic_metadata_get(origins))
 
         expected_data_v1 = [
             {
                 'id': 'file:///tmp/origin%d' % id_,
                 'from_revision': data.revision_id_2,
                 **example_data1,
                 'tool': data.tools['swh-metadata-detector'],
             }
             for id_ in ids
         ]
 
         # then
         assert actual_data == expected_data_v1
 
         # given
         def f1():
             storage.origin_intrinsic_metadata_add(
                 data_v2a, conflict_update=True)
 
         def f2():
             storage.origin_intrinsic_metadata_add(
                 data_v2b, conflict_update=True)
 
         t1 = threading.Thread(target=f1)
         t2 = threading.Thread(target=f2)
         t2.start()
         t1.start()
 
         t1.join()
         t2.join()
 
         actual_data = list(storage.origin_intrinsic_metadata_get(origins))
 
         expected_data_v2 = [
             {
                 'id': 'file:///tmp/origin%d' % id_,
                 'from_revision': data.revision_id_2,
                 **example_data2,
                 'tool': data.tools['swh-metadata-detector'],
             }
             for id_ in ids
         ]
 
         assert len(actual_data) == len(expected_data_v2)
         assert sorted(actual_data, key=lambda x: x['id']) == expected_data_v2
 
     def test_origin_intrinsic_metadata_add__duplicate_twice(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools['swh-metadata-detector']['id']
 
         metadata = {
             'developmentStatus': None,
             'name': None,
         }
         metadata_rev = {
             'id': data.revision_id_2,
             'metadata': metadata,
             'mappings': ['mapping1'],
             'indexer_configuration_id': tool_id,
         }
         metadata_origin = {
             'id': data.origin_url_1,
             'metadata': metadata,
             'indexer_configuration_id': tool_id,
             'mappings': ['mapping1'],
             'from_revision': data.revision_id_2,
             }
 
         # when
         storage.revision_intrinsic_metadata_add([metadata_rev])
 
-        with pytest.raises(ValueError):
+        with pytest.raises(DuplicateId):
             storage.origin_intrinsic_metadata_add([
                 metadata_origin, metadata_origin])
 
     def test_origin_intrinsic_metadata_search_fulltext(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools['swh-metadata-detector']['id']
 
         metadata1 = {
             'author': 'John Doe',
         }
         metadata1_rev = {
             'id': data.revision_id_1,
             'metadata': metadata1,
             'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata1_origin = {
             'id': data.origin_url_1,
             'metadata': metadata1,
             'mappings': [],
             'indexer_configuration_id': tool_id,
             'from_revision': data.revision_id_1,
         }
         metadata2 = {
             'author': 'Jane Doe',
         }
         metadata2_rev = {
             'id': data.revision_id_2,
             'metadata': metadata2,
             'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata2_origin = {
             'id': data.origin_url_2,
             'metadata': metadata2,
             'mappings': [],
             'indexer_configuration_id': tool_id,
             'from_revision': data.revision_id_2,
         }
 
         # when
         storage.revision_intrinsic_metadata_add([metadata1_rev])
         storage.origin_intrinsic_metadata_add([metadata1_origin])
         storage.revision_intrinsic_metadata_add([metadata2_rev])
         storage.origin_intrinsic_metadata_add([metadata2_origin])
 
         # then
         search = storage.origin_intrinsic_metadata_search_fulltext
         assert set([res['id'] for res in search(['Doe'])]) \
             == set([data.origin_url_1, data.origin_url_2])
         assert [res['id'] for res in search(['John', 'Doe'])] \
             == [data.origin_url_1]
         assert [res['id'] for res in search(['John'])] \
             == [data.origin_url_1]
         assert not list(search(['John', 'Jane']))
 
     def test_origin_intrinsic_metadata_search_fulltext_rank(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools['swh-metadata-detector']['id']
 
         # The following authors have "Random Person" to add some more content
         # to the JSON data, to work around normalization quirks when there
         # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words
         # for small values of nb_words).
         metadata1 = {
             'author': [
                 'Random Person',
                 'John Doe',
                 'Jane Doe',
             ]
         }
         metadata1_rev = {
             'id': data.revision_id_1,
             'metadata': metadata1,
             'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata1_origin = {
             'id': data.origin_url_1,
             'metadata': metadata1,
             'mappings': [],
             'indexer_configuration_id': tool_id,
             'from_revision': data.revision_id_1,
         }
         metadata2 = {
             'author': [
                 'Random Person',
                 'Jane Doe',
             ]
         }
         metadata2_rev = {
             'id': data.revision_id_2,
             'metadata': metadata2,
             'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata2_origin = {
             'id': data.origin_url_2,
             'metadata': metadata2,
             'mappings': [],
             'indexer_configuration_id': tool_id,
             'from_revision': data.revision_id_2,
         }
 
         # when
         storage.revision_intrinsic_metadata_add([metadata1_rev])
         storage.origin_intrinsic_metadata_add([metadata1_origin])
         storage.revision_intrinsic_metadata_add([metadata2_rev])
         storage.origin_intrinsic_metadata_add([metadata2_origin])
 
         # then
         search = storage.origin_intrinsic_metadata_search_fulltext
         assert [res['id'] for res in search(['Doe'])] \
             == [data.origin_url_1, data.origin_url_2]
         assert [res['id'] for res in search(['Doe'], limit=1)] \
             == [data.origin_url_1]
         assert [res['id'] for res in search(['John'])] \
             == [data.origin_url_1]
         assert [res['id'] for res in search(['Jane'])] \
             == [data.origin_url_2, data.origin_url_1]
         assert [res['id'] for res in search(['John', 'Jane'])] \
             == [data.origin_url_1]
 
     def _fill_origin_intrinsic_metadata(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         tool1_id = data.tools['swh-metadata-detector']['id']
         tool2_id = data.tools['swh-metadata-detector2']['id']
 
         metadata1 = {
             '@context': 'foo',
             'author': 'John Doe',
         }
         metadata1_rev = {
             'id': data.revision_id_1,
             'metadata': metadata1,
             'mappings': ['npm'],
             'indexer_configuration_id': tool1_id,
         }
         metadata1_origin = {
             'id': data.origin_url_1,
             'metadata': metadata1,
             'mappings': ['npm'],
             'indexer_configuration_id': tool1_id,
             'from_revision': data.revision_id_1,
         }
         metadata2 = {
             '@context': 'foo',
             'author': 'Jane Doe',
         }
         metadata2_rev = {
             'id': data.revision_id_2,
             'metadata': metadata2,
             'mappings': ['npm', 'gemspec'],
             'indexer_configuration_id': tool2_id,
         }
         metadata2_origin = {
             'id': data.origin_url_2,
             'metadata': metadata2,
             'mappings': ['npm', 'gemspec'],
             'indexer_configuration_id': tool2_id,
             'from_revision': data.revision_id_2,
         }
         metadata3 = {
             '@context': 'foo',
         }
         metadata3_rev = {
             'id': data.revision_id_3,
             'metadata': metadata3,
             'mappings': ['npm', 'gemspec'],
             'indexer_configuration_id': tool2_id,
         }
         metadata3_origin = {
             'id': data.origin_url_3,
             'metadata': metadata3,
             'mappings': ['pkg-info'],
             'indexer_configuration_id': tool2_id,
             'from_revision': data.revision_id_3,
         }
 
         storage.revision_intrinsic_metadata_add([metadata1_rev])
         storage.origin_intrinsic_metadata_add([metadata1_origin])
         storage.revision_intrinsic_metadata_add([metadata2_rev])
         storage.origin_intrinsic_metadata_add([metadata2_origin])
         storage.revision_intrinsic_metadata_add([metadata3_rev])
         storage.origin_intrinsic_metadata_add([metadata3_origin])
 
     def test_origin_intrinsic_metadata_search_by_producer(
             self, swh_indexer_storage_with_data):
         storage, data = swh_indexer_storage_with_data
         self._fill_origin_intrinsic_metadata(
             swh_indexer_storage_with_data)
         tool1 = data.tools['swh-metadata-detector']
         tool2 = data.tools['swh-metadata-detector2']
         endpoint = storage.origin_intrinsic_metadata_search_by_producer
 
         # test pagination
         # no 'page_token' param, return all origins
         result = endpoint(ids_only=True)
         assert result['origins'] \
             == [data.origin_url_1, data.origin_url_2, data.origin_url_3]
         assert 'next_page_token' not in result
 
-        # 'page_token' is < than origin_1, return everything
+        # 'page_token' sorts before origin_1, return everything
         result = endpoint(page_token=data.origin_url_1[:-1], ids_only=True)
         assert result['origins'] \
             == [data.origin_url_1, data.origin_url_2, data.origin_url_3]
         assert 'next_page_token' not in result
 
         # 'page_token' is origin_3, return nothing
         result = endpoint(page_token=data.origin_url_3, ids_only=True)
         assert not result['origins']
         assert 'next_page_token' not in result
 
         # test limit argument
         result = endpoint(page_token=data.origin_url_1[:-1],
                           limit=2, ids_only=True)
         assert result['origins'] == [data.origin_url_1, data.origin_url_2]
         assert result['next_page_token'] == result['origins'][-1]
 
         result = endpoint(page_token=data.origin_url_1, limit=2, ids_only=True)
         assert result['origins'] == [data.origin_url_2, data.origin_url_3]
         assert 'next_page_token' not in result
 
         result = endpoint(page_token=data.origin_url_2, limit=2, ids_only=True)
         assert result['origins'] == [data.origin_url_3]
         assert 'next_page_token' not in result
 
         # test mappings filtering
         result = endpoint(mappings=['npm'], ids_only=True)
         assert result['origins'] == [data.origin_url_1, data.origin_url_2]
         assert 'next_page_token' not in result
 
         result = endpoint(mappings=['npm', 'gemspec'], ids_only=True)
         assert result['origins'] == [data.origin_url_1, data.origin_url_2]
         assert 'next_page_token' not in result
 
         result = endpoint(mappings=['gemspec'], ids_only=True)
         assert result['origins'] == [data.origin_url_2]
         assert 'next_page_token' not in result
 
         result = endpoint(mappings=['pkg-info'], ids_only=True)
         assert result['origins'] == [data.origin_url_3]
         assert 'next_page_token' not in result
 
         result = endpoint(mappings=['foobar'], ids_only=True)
         assert not result['origins']
         assert 'next_page_token' not in result
 
         # test pagination + mappings
         result = endpoint(mappings=['npm'], limit=1, ids_only=True)
         assert result['origins'] == [data.origin_url_1]
         assert result['next_page_token'] == result['origins'][-1]
 
         # test tool filtering
         result = endpoint(tool_ids=[tool1['id']], ids_only=True)
         assert result['origins'] == [data.origin_url_1]
         assert 'next_page_token' not in result
 
         result = endpoint(tool_ids=[tool2['id']], ids_only=True)
         assert sorted(result['origins']) \
             == [data.origin_url_2, data.origin_url_3]
         assert 'next_page_token' not in result
 
         result = endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True)
         assert sorted(result['origins']) \
             == [data.origin_url_1, data.origin_url_2, data.origin_url_3]
         assert 'next_page_token' not in result
 
         # test ids_only=False
         assert endpoint(mappings=['gemspec'])['origins'] \
             == [{
                 'id': data.origin_url_2,
                 'metadata': {
                     '@context': 'foo',
                     'author': 'Jane Doe',
                 },
                 'mappings': ['npm', 'gemspec'],
                 'tool': tool2,
                 'from_revision': data.revision_id_2,
             }]
 
     def test_origin_intrinsic_metadata_stats(
             self, swh_indexer_storage_with_data):
         """Check the output of ``origin_intrinsic_metadata_stats``.
 
         Once ``_fill_origin_intrinsic_metadata`` has run against the
         fixture, the endpoint must return exactly the per-mapping counts,
         the total and the non-empty count spelled out below.
         """
         storage, _ = swh_indexer_storage_with_data
         self._fill_origin_intrinsic_metadata(swh_indexer_storage_with_data)
 
         expected_stats = {
             'total': 3,
             'non_empty': 2,
             'per_mapping': {
                 'codemeta': 0,
                 'gemspec': 1,
                 'maven': 0,
                 'npm': 2,
                 'pkg-info': 1,
             },
         }
         assert storage.origin_intrinsic_metadata_stats() == expected_stats
 
 
 class TestIndexerStorageIndexerCondifuration:
     """Tests for the ``indexer_configuration_*`` storage endpoints.
 
     Every test receives the ``swh_indexer_storage_with_data`` fixture,
     which unpacks into a ``(storage, data)`` pair; only the storage is
     used here.
     """
 
     def test_indexer_configuration_add(
             self, swh_indexer_storage_with_data):
         """Register one tool twice; both calls must yield the same entry."""
         storage, _ = swh_indexer_storage_with_data
         tool = dict(
             tool_name='some-unknown-tool',
             tool_version='some-version',
             tool_configuration={"debian-package": "some-package"},
         )
 
         # the tool must not be known before the first registration
         assert storage.indexer_configuration_get(tool) is None
 
         # first registration
         first_batch = list(storage.indexer_configuration_add([tool]))
         assert len(first_batch) == 1
 
         first_tool = first_batch[0]
         assert first_tool is not None  # now it exists
         first_id = first_tool.pop('id')
         assert first_tool == tool
 
         # second registration of the very same description
         second_batch = list(storage.indexer_configuration_add([tool]))
         second_tool = second_batch[0]
         assert second_tool is not None  # still exists
         second_id = second_tool.pop('id')
 
         assert first_id == second_id
         assert first_tool == second_tool
 
     def test_indexer_configuration_add_multiple(
             self, swh_indexer_storage_with_data):
         """Register an already-added tool together with a new one."""
         storage, _ = swh_indexer_storage_with_data
         known_tool = dict(
             tool_name='some-unknown-tool',
             tool_version='some-version',
             tool_configuration={"debian-package": "some-package"},
         )
 
         first_batch = list(storage.indexer_configuration_add([known_tool]))
         assert len(first_batch) == 1
 
         other_tool = dict(
             tool_name='yet-another-tool',
             tool_version='version',
             tool_configuration={},
         )
         new_tools = [known_tool, other_tool]
 
         second_batch = list(storage.indexer_configuration_add(new_tools))
         assert len(second_batch) == 2
 
         # the result order is not guaranteed, hence the membership check
         for returned_tool in second_batch:
             returned_id = returned_tool.pop('id')
             assert returned_id is not None
             assert returned_tool in new_tools
 
     def test_indexer_configuration_get_missing(
             self, swh_indexer_storage_with_data):
         """Looking up this tool description must return ``None``."""
         storage, _ = swh_indexer_storage_with_data
         unknown_tool = dict(
             tool_name='unknown-tool',
             tool_version='3.1.0rc2-31-ga2cbb8c',
             tool_configuration={"command_line": "nomossa <filepath>"},
         )
 
         assert storage.indexer_configuration_get(unknown_tool) is None
 
     def test_indexer_configuration_get(
             self, swh_indexer_storage_with_data):
         """Looking up the ``nomos`` tool returns it, plus an ``id`` key."""
         storage, _ = swh_indexer_storage_with_data
         tool = dict(
             tool_name='nomos',
             tool_version='3.1.0rc2-31-ga2cbb8c',
             tool_configuration={"command_line": "nomossa <filepath>"},
         )
 
         found_tool = storage.indexer_configuration_get(tool)
         assert found_tool
 
         expected_tool = dict(tool)
         # 'id' is not part of the looked-up description: drop it first
         del found_tool['id']
 
         assert expected_tool == found_tool
 
     def test_indexer_configuration_metadata_get_missing_context(
             self, swh_indexer_storage_with_data):
         """A translator tool with this configuration is not found."""
         storage, _ = swh_indexer_storage_with_data
         unknown_tool = dict(
             tool_name='swh-metadata-translator',
             tool_version='0.0.1',
             tool_configuration={"context": "unknown-context"},
         )
 
         assert storage.indexer_configuration_get(unknown_tool) is None
 
     def test_indexer_configuration_metadata_get(
             self, swh_indexer_storage_with_data):
         """A translator tool can be fetched back after being added."""
         storage, _ = swh_indexer_storage_with_data
         tool = dict(
             tool_name='swh-metadata-translator',
             tool_version='0.0.1',
             tool_configuration={"type": "local", "context": "NpmMapping"},
         )
 
         storage.indexer_configuration_add([tool])
         found_tool = storage.indexer_configuration_get(tool)
         assert found_tool
 
         # expected entry: the description plus the id reported by storage
         expected_tool = dict(tool, id=found_tool['id'])
 
         assert expected_tool == found_tool
 
 
 class TestIndexerStorageMisc:
     """Miscellaneous IndexerStorage tests: config check and API shape."""
 
     def test_check_config(self, swh_indexer_storage):
         """``check_config`` is truthy with and without the write check."""
         storage = swh_indexer_storage
         for check_write in (True, False):
             assert storage.check_config(check_write=check_write)
 
     def test_types(self, swh_indexer_storage):
         """Checks all public methods of IndexerStorageInterface are
         implemented by this backend, and that they have the same
         signature."""
         # The protocol cannot be instantiated directly: derive a throwaway
         # subclass from it and instantiate that instead.
         stub_class = type('_', (IndexerStorageInterface,), {})
         interface = stub_class()
 
         assert 'content_mimetype_add' in dir(interface)
 
         not_found = object()  # sentinel: attribute absent on the backend
         missing_methods = []
 
         for meth_name in dir(interface):
             # private and dunder attributes are not part of the API
             if meth_name.startswith('_'):
                 continue
 
             interface_meth = getattr(interface, meth_name)
             concrete_meth = getattr(
                 swh_indexer_storage, meth_name, not_found)
             if concrete_meth is not_found:
                 # collect every missing name for one final report
                 missing_methods.append(meth_name)
                 continue
 
             assert inspect.signature(interface_meth) \
                 == inspect.signature(concrete_meth), meth_name
 
         assert missing_methods == []