diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
index 3cca457..c166dfd 100644
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -1,366 +1,365 @@
 # Copyright (C) 2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from collections import defaultdict
 import json
 
 SHA1_DIGEST_SIZE = 160
 
 
 def _transform_tool(tool):
     return {
         'id': tool['id'],
         'name': tool['tool_name'],
         'version': tool['tool_version'],
         'configuration': tool['tool_configuration'],
     }
 
 
 class SubStorage:
     """Implements common missing/get/add logic for each indexer type."""
     def __init__(self, tools):
         self._tools = tools
         self._data = {}  # map (id_, tool_id) -> metadata_dict
         self._tools_per_id = defaultdict(set)  # map id_ -> Set[tool_id]
 
     def missing(self, ids):
         """List data missing from storage.
 
         Args:
             data (iterable): dictionaries with keys:
 
                 - **id** (bytes): sha1 identifier
                 - **indexer_configuration_id** (int): tool used to compute
                   the results
 
         Yields:
             missing sha1s
 
         """
         for id_ in ids:
             tool_id = id_['indexer_configuration_id']
             id_ = id_['id']
             if tool_id not in self._tools_per_id.get(id_, set()):
                 yield id_
 
     def get(self, ids):
         """Retrieve data per id.
 
         Args:
             ids (iterable): sha1 checksums
 
         Yields:
             dict: dictionaries with the following keys:
 
               - **id** (bytes)
               - **tool** (dict): tool used to compute metadata
               - arbitrary data (as provided to `add`)
 
         """
         for id_ in ids:
             for tool_id in self._tools_per_id.get(id_, set()):
                 key = (id_, tool_id)
                 yield {
                     'id': id_,
                     'tool': _transform_tool(self._tools[tool_id]),
                     **self._data[key],
                 }
 
     def add(self, data, conflict_update):
         """Add data not present in storage.
 
         Args:
             data (iterable): dictionaries with keys:
 
               - **id**: sha1
               - **indexer_configuration_id**: tool used to compute the
                 results
               - arbitrary data
 
             conflict_update (bool): Flag to determine if we want to overwrite
               (true) or skip duplicates (false)
 
         """
         for item in data:
             item = item.copy()
             tool_id = item.pop('indexer_configuration_id')
             id_ = item.pop('id')
             data = item
             if not conflict_update and \
                     tool_id in self._tools_per_id.get(id_, set()):
                 # Duplicate, should not be updated
                 continue
             key = (id_, tool_id)
             self._data[key] = data
             self._tools_per_id[id_].add(tool_id)
 
 
 class IndexerStorage:
     """In-memory SWH indexer storage."""
 
     def __init__(self):
         self._tools = {}
         self._content_ctags = SubStorage(self._tools)
         self._content_metadata = SubStorage(self._tools)
         self._revision_metadata = SubStorage(self._tools)
 
     def content_ctags_missing(self, ctags):
         """List ctags missing from storage.
 
         Args:
             ctags (iterable): dicts with keys:
 
                 - **id** (bytes): sha1 identifier
                 - **indexer_configuration_id** (int): tool used to compute
                   the results
 
         Yields:
             an iterable of missing id for the tuple (id,
             indexer_configuration_id)
 
         """
         yield from self._content_ctags.missing(ctags)
 
     def content_ctags_get(self, ids):
         """Retrieve ctags per id.
 
         Args:
             ids (iterable): sha1 checksums
 
         Yields:
             Dictionaries with keys:
 
                 - **id** (bytes): content's identifier
                 - **name** (str): symbol's name
                 - **kind** (str): symbol's kind
                 - **lang** (str): language for that content
                 - **tool** (dict): tool used to compute the ctags' info
 
 
         """
         for item in self._content_ctags.get(ids):
             for item_ctags_item in item['ctags']:
                 yield {
                     'id': item['id'],
                     'tool': item['tool'],
                     **item_ctags_item
                 }
 
     def content_ctags_add(self, ctags, conflict_update=False):
         """Add ctags not present in storage
 
         Args:
             ctags (iterable): dictionaries with keys:
 
               - **id** (bytes): sha1
               - **ctags** ([list): List of dictionary with keys: name, kind,
                   line, lang
               - **indexer_configuration_id**: tool used to compute the
                 results
 
         """
         for item in ctags:
             tool_id = item['indexer_configuration_id']
             if conflict_update:
                 item_ctags = []
             else:
-                # TODO: this merges old ctags with new ctags. This is
-                # pointless, new ctags should replace the old ones.
+                # keep this tool's existing ctags; new ones are appended below
                 existing = list(self._content_ctags.get([item['id']]))
                 item_ctags = [
                     {
                         key: ctags_item[key]
                         for key in ('name', 'kind', 'line', 'lang')
                     }
                     for existing_item in existing
                     if existing_item['tool']['id'] == tool_id
                     for ctags_item in existing_item['ctags']
                 ]
             for new_item_ctags in item['ctags']:
                 if new_item_ctags not in item_ctags:
                     item_ctags.append(new_item_ctags)
             self._content_ctags.add([
                 {
                     'id': item['id'],
                     'indexer_configuration_id': tool_id,
                     'ctags': item_ctags,
                 }
             ], conflict_update=True)
 
     def content_ctags_search(self, expression,
                              limit=10, last_sha1=None, db=None, cur=None):
         """Search through content's raw ctags symbols.
 
         Args:
             expression (str): Expression to search for
             limit (int): Number of rows to return (default to 10).
             last_sha1 (str): Offset from which retrieving data (default to '').
 
         Yields:
             rows of ctags including id, name, lang, kind, line, etc...
 
         """
         nb_matches = 0
         for ((id_, tool_id), item) in \
                 sorted(self._content_ctags._data.items()):
             if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))):
                 continue
             nb_matches += 1
             for ctags_item in item['ctags']:
                 if ctags_item['name'] != expression:
                     continue
                 yield {
                     'id': id_,
                     'tool': _transform_tool(self._tools[tool_id]),
                     **ctags_item
                 }
             if nb_matches >= limit:
                 return
 
     def content_metadata_missing(self, metadata):
         """List metadata missing from storage.
 
         Args:
             metadata (iterable): dictionaries with keys:
 
               - **id** (bytes): sha1 identifier
               - **indexer_configuration_id** (int): tool used to compute
                 the results
 
         Yields:
             missing sha1s
 
         """
         yield from self._content_metadata.missing(metadata)
 
     def content_metadata_get(self, ids):
         """Retrieve metadata per id.
 
         Args:
             ids (iterable): sha1 checksums
 
         Yields:
             dictionaries with the following keys:
 
               - **id** (bytes)
               - **translated_metadata** (str): associated metadata
               - **tool** (dict): tool used to compute metadata
 
         """
         yield from self._content_metadata.get(ids)
 
     def content_metadata_add(self, metadata, conflict_update=False):
         """Add metadata not present in storage.
 
         Args:
             metadata (iterable): dictionaries with keys:
 
               - **id**: sha1
               - **translated_metadata**: arbitrary dict
               - **indexer_configuration_id**: tool used to compute the
                 results
 
             conflict_update: Flag to determine if we want to overwrite (true)
                 or skip duplicates (false, the default)
 
         """
         self._content_metadata.add(metadata, conflict_update)
 
     def revision_metadata_missing(self, metadata):
         """List metadata missing from storage.
 
         Args:
             metadata (iterable): dictionaries with keys:
 
               - **id** (bytes): sha1_git revision identifier
               - **indexer_configuration_id** (int): tool used to compute
                 the results
 
         Yields:
             missing ids
 
         """
         yield from self._revision_metadata.missing(metadata)
 
     def revision_metadata_get(self, ids):
         """Retrieve revision metadata per id.
 
         Args:
             ids (iterable): sha1 checksums
 
         Yields:
             dictionaries with the following keys:
 
             - **id** (bytes)
             - **translated_metadata** (str): associated metadata
             - **tool** (dict): tool used to compute metadata
 
         """
         yield from self._revision_metadata.get(ids)
 
     def revision_metadata_add(self, metadata, conflict_update=False):
         """Add metadata not present in storage.
 
         Args:
             metadata (iterable): dictionaries with keys:
 
               - **id**: sha1_git of revision
               - **translated_metadata**: arbitrary dict
               - **indexer_configuration_id**: tool used to compute metadata
 
             conflict_update: Flag to determine if we want to overwrite (true)
               or skip duplicates (false, the default)
 
         """
         self._revision_metadata.add(metadata, conflict_update)
 
     def indexer_configuration_add(self, tools):
         """Add new tools to the storage.
 
         Args:
             tools ([dict]): List of dictionary representing tool to
               insert in the db. Dictionary with the following keys:
 
               - **tool_name** (str): tool's name
               - **tool_version** (str): tool's version
               - **tool_configuration** (dict): tool's configuration
                 (free form dict)
 
         Returns:
             list: List of dict inserted in the db (holding the id key as
             well). The order of the list is not guaranteed to match
             the order of the initial list.
 
         """
         inserted = []
         for tool in tools:
             tool = tool.copy()
             id_ = self._tool_key(tool)
             tool['id'] = id_
             self._tools[id_] = tool
             inserted.append(tool)
         return inserted
 
     def indexer_configuration_get(self, tool):
         """Retrieve tool information.
 
         Args:
             tool (dict): Dictionary representing a tool with the
               following keys:
 
               - **tool_name** (str): tool's name
               - **tool_version** (str): tool's version
               - **tool_configuration** (dict): tool's configuration
                 (free form dict)
 
         Returns:
             The same dictionary with an `id` key, None otherwise.
 
         """
         return self._tools.get(self._tool_key(tool))
 
     def _tool_key(self, tool):
         return (tool['tool_name'], tool['tool_version'],
                 json.dumps(tool['tool_configuration'], sort_keys=True))
diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py
index 1aa80c8..ed7186d 100644
--- a/swh/indexer/tests/test_ctags.py
+++ b/swh/indexer/tests/test_ctags.py
@@ -1,146 +1,151 @@
 # Copyright (C) 2017-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import unittest
 
 from unittest.mock import patch
 from swh.indexer.ctags import (
     CtagsIndexer, run_ctags
 )
 
 from swh.indexer.tests.test_utils import (
     CommonContentIndexerTest,
     CommonIndexerWithErrorsTest, CommonIndexerNoTool,
     SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG
 )
 
 
 class BasicTest(unittest.TestCase):
     @patch('swh.indexer.ctags.subprocess')
     def test_run_ctags(self, mock_subprocess):
         """Computing licenses from a raw content should return results
 
         """
         output0 = """
 {"name":"defun","kind":"function","line":1,"language":"scheme"}
 {"name":"name","kind":"symbol","line":5,"language":"else"}"""
         output1 = """
 {"name":"let","kind":"var","line":10,"language":"something"}"""
 
         expected_result0 = [
             {
                 'name': 'defun',
                 'kind': 'function',
                 'line': 1,
                 'lang': 'scheme'
             },
             {
                 'name': 'name',
                 'kind': 'symbol',
                 'line': 5,
                 'lang': 'else'
             }
         ]
 
         expected_result1 = [
             {
                 'name': 'let',
                 'kind': 'var',
                 'line': 10,
                 'lang': 'something'
             }
         ]
         for path, lang, intermediary_result, expected_result in [
                 (b'some/path', 'lisp', output0, expected_result0),
                 (b'some/path/2', 'markdown', output1, expected_result1)
         ]:
             mock_subprocess.check_output.return_value = intermediary_result
             actual_result = list(run_ctags(path, lang=lang))
             self.assertEqual(actual_result, expected_result)
 
 
 class InjectCtagsIndexer:
     """Override ctags computations.
 
     """
     def compute_ctags(self, path, lang):
         """Inject fake ctags given path (sha1 identifier).
 
         """
         return {
             'lang': lang,
             **SHA1_TO_CTAGS.get(path)
         }
 
 
 class CtagsIndexerTest(NoDiskIndexer, InjectCtagsIndexer, CtagsIndexer):
     """Specific language whose configuration is enough to satisfy the
        indexing tests.
     """
     def parse_config_file(self, *args, **kwargs):
         return {
             **BASE_TEST_CONFIG,
             'tools': {
                 'name': 'universal-ctags',
                 'version': '~git7859817b',
                 'configuration': {
                     'command_line': '''ctags --fields=+lnz --sort=no '''
                                     ''' --links=no <filepath>''',
                     'max_content_size': 1000,
                 },
             },
             'languages': {
                 'python': 'python',
                 'haskell': 'haskell',
                 'bar': 'bar',
             },
             'workdir': '/nowhere',
         }
 
 
 class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase):
     """Ctags indexer test scenarios:
 
     - Known sha1s in the input list have their data indexed
     - Unknown sha1 in the input list are not indexed
 
     """
+
+    def get_indexer_results(self, ids):
+        yield from self.idx_storage.content_ctags_get(ids)
+
     def setUp(self):
         self.indexer = CtagsIndexerTest()
+        self.idx_storage = self.indexer.idx_storage
 
         # Prepare test input
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
         self.id1 = 'd4c647f0fc257591cc9ba1722484229780d1c607'
         self.id2 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
 
         tool_id = self.indexer.tool['id']
         self.expected_results = {
             self.id0: {
                 'id': self.id0,
                 'indexer_configuration_id': tool_id,
                 'ctags': SHA1_TO_CTAGS[self.id0],
             },
             self.id1: {
                 'id': self.id1,
                 'indexer_configuration_id': tool_id,
                 'ctags': SHA1_TO_CTAGS[self.id1],
             },
             self.id2: {
                 'id': self.id2,
                 'indexer_configuration_id': tool_id,
                 'ctags': SHA1_TO_CTAGS[self.id2],
             }
         }
 
 
 class CtagsIndexerUnknownToolTestStorage(
         CommonIndexerNoTool, CtagsIndexerTest):
     """Fossology license indexer with wrong configuration"""
 
 
 class TestCtagsIndexersErrors(
         CommonIndexerWithErrorsTest, unittest.TestCase):
     """Test the indexer raise the right errors when wrongly initialized"""
     Indexer = CtagsIndexerUnknownToolTestStorage
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
index 6abdd19..4784fc7 100644
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -1,806 +1,808 @@
 # Copyright (C) 2017-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 
 from swh.objstorage.exc import ObjNotFoundError
 from swh.model import hashutil
 from swh.model.hashutil import hash_to_bytes
 
 from swh.indexer.storage import INDEXER_CFG_KEY
 
 BASE_TEST_CONFIG = {
     'storage': {
         'cls': 'memory',
         'args': {
         },
     },
     'objstorage': {
         'cls': 'memory',
         'args': {
         },
     },
     INDEXER_CFG_KEY: {
         'cls': 'memory',
         'args': {
         },
     },
 }
 
 ORIGINS = [
         {
             'id': 52189575,
             'lister': None,
             'project': None,
             'type': 'git',
             'url': 'https://github.com/SoftwareHeritage/swh-storage'},
         {
             'id': 4423668,
             'lister': None,
             'project': None,
             'type': 'ftp',
             'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
         {
             'id': 77775770,
             'lister': None,
             'project': None,
             'type': 'deposit',
             'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
         {
             'id': 85072327,
             'lister': None,
             'project': None,
             'type': 'pypi',
             'url': 'https://pypi.org/project/limnoria/'},
         {
             'id': 49908349,
             'lister': None,
             'project': None,
             'type': 'svn',
             'url': 'http://0-512-md.googlecode.com/svn/'},
         {
             'id': 54974445,
             'lister': None,
             'project': None,
             'type': 'git',
             'url': 'https://github.com/librariesio/yarn-parser'},
         ]
 
 SNAPSHOTS = {
         52189575: {
             'branches': {
                 b'refs/heads/add-revision-origin-cache': {
                     'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
                               b's\xe7/\xe9l\x1e',
                     'target_type': 'revision'},
                 b'HEAD': {
                     'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
                               b'\xac\xefrm',
                     'target_type': 'revision'},
                 b'refs/tags/v0.0.103': {
                     'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
                               b'\x0f\xdd',
                     'target_type': 'release'},
                 }},
         4423668: {
             'branches': {
                 b'3DLDF-1.1.4.tar.gz': {
                     'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
                               b'"G\x99\x11',
                     'target_type': 'revision'},
                 b'3DLDF-2.0.2.tar.gz': {
                     'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
                               b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
                     'target_type': 'revision'},
                 b'3DLDF-2.0.3-examples.tar.gz': {
                     'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
                               b'\xfe\xadZ\x80\x80\xc1\x83\xff',
                     'target_type': 'revision'},
                 b'3DLDF-2.0.3.tar.gz': {
                     'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
                               b'\xcc\x1a\xb4`\x8c\x8by',
                     'target_type': 'revision'},
                 b'3DLDF-2.0.tar.gz': {
                     'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
                               b'\xd3\xd1m',
                     b'target_type': 'revision'}
                 }},
         77775770: {
             'branches': {
                 b'master': {
                     'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
                               b'\xa6\xe9\x99\xb1\x9e]q\xeb',
                     'target_type': 'revision'}
             },
             'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
                   b"\x1d\r "},
         85072327: {
             'branches': {
                 b'HEAD': {
                     'target': b'releases/2018.09.09',
                     'target_type': 'alias'},
                 b'releases/2018.09.01': {
                     'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
                               b'\xbb\xdfF\xfdw\xcf',
                     'target_type': 'revision'},
                 b'releases/2018.09.09': {
                     'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
                               b'A\x10\x9d\xc5\xfa2\xf8t',
                     'target_type': 'revision'}},
             'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
                   b'\x12\x9e\xd6\xb3'},
         49908349: {
                 'branches': {
                     b'master': {
                         'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
                                   b'\xc9\xad#.\x1bw=\x18',
                         'target_type': 'revision'}},
                 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
                       b'\x05\xea\xb8\x1f\xc4H\xf4s'},
         54974445: {
                 'branches': {
                     b'HEAD': {
                         'target': hash_to_bytes(
                             '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
                         'target_type': 'revision'}}}
         }
 
 
 REVISIONS = [{
     'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
     'committer': {
         'id': 26,
         'name': b'Andrew Nesbitt',
         'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
         'email': b'andrewnez@gmail.com'
     },
     'synthetic': False,
     'date': {
         'negative_utc': False,
         'timestamp': {
             'seconds': 1487596456,
             'microseconds': 0
         },
         'offset': 0
     },
     'directory': b'10'
 }]
 
 DIRECTORY_ID = b'10'
 
 DIRECTORY = [{
     'sha1_git': b'abc',
     'name': b'index.js',
     'target': b'abc',
     'length': 897,
     'status': 'visible',
     'type': 'file',
     'perms': 33188,
     'sha1': b'bcd'
     },
     {
     'sha1_git': b'aab',
     'name': b'package.json',
     'target': b'aab',
     'length': 712,
     'status': 'visible',
     'type': 'file',
     'perms': 33188,
     'sha1': b'cde'
     },
     {
     'target': b'11',
     'type': 'dir',
     'length': None,
     'name': b'.github',
     'sha1': None,
     'perms': 16384,
     'sha1_git': None,
     'status': None,
     'sha256': None
     }
 ]
 
 SHA1_TO_LICENSES = {
     '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
     '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
     '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
     '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
     'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
 }
 
 
 SHA1_TO_CTAGS = {
     '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{
         'name': 'foo',
         'kind': 'str',
         'line': 10,
         'lang': 'bar',
     }],
     'd4c647f0fc257591cc9ba1722484229780d1c607': [{
         'name': 'let',
         'kind': 'int',
         'line': 100,
         'lang': 'haskell',
     }],
     '688a5ef812c53907562fe379d4b3851e69c7cb15': [{
         'name': 'symbol',
         'kind': 'float',
         'line': 99,
         'lang': 'python',
     }],
 }
 
 
 OBJ_STORAGE_DATA = {
     '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text',
     '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text',
     '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text',
     '02fb2c89e14f7fab46701478c83779c7beb7b069': b"""
     import unittest
     import logging
     from swh.indexer.mimetype import MimetypeIndexer
     from swh.indexer.tests.test_utils import MockObjStorage
 
     class MockStorage():
         def content_mimetype_add(self, mimetypes):
             self.state = mimetypes
             self.conflict_update = conflict_update
 
         def indexer_configuration_add(self, tools):
             return [{
                 'id': 10,
             }]
     """,
     '103bc087db1d26afc3a0283f38663d081e9b01e6': b"""
         #ifndef __AVL__
         #define __AVL__
 
         typedef struct _avl_tree avl_tree;
 
         typedef struct _data_t {
           int content;
         } data_t;
     """,
     '93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
     (should 'pygments (recognize 'lisp 'easily))
 
     """,
     '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
     {
         "name": "test_metadata",
         "version": "0.0.1",
         "description": "Simple package.json test for indexer",
         "repository": {
           "type": "git",
           "url": "https://github.com/moranegg/metadata_test"
       }
     }
     """,
     'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
     {
       "version": "5.0.3",
       "name": "npm",
       "description": "a package manager for JavaScript",
       "keywords": [
         "install",
         "modules",
         "package manager",
         "package.json"
       ],
       "preferGlobal": true,
       "config": {
         "publishtest": false
       },
       "homepage": "https://docs.npmjs.com/",
       "author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
       "repository": {
         "type": "git",
         "url": "https://github.com/npm/npm"
       },
       "bugs": {
         "url": "https://github.com/npm/npm/issues"
       },
       "dependencies": {
         "JSONStream": "~1.3.1",
         "abbrev": "~1.1.0",
         "ansi-regex": "~2.1.1",
         "ansicolors": "~0.3.2",
         "ansistyles": "~0.1.3"
       },
       "devDependencies": {
         "tacks": "~1.2.6",
         "tap": "~10.3.2"
       },
       "license": "Artistic-2.0"
     }
 
     """,
     'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
     """,
     'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
 }
 
 CONTENT_METADATA = [{
     'tool': {
         'configuration': {
             'type': 'local',
             'context': 'NpmMapping'
             },
         'version': '0.0.1',
         'id': 6,
         'name': 'swh-metadata-translator'
     },
     'id': b'cde',
     'translated_metadata': {
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'type': 'SoftwareSourceCode',
         'codemeta:issueTracker':
             'https://github.com/librariesio/yarn-parser/issues',
         'version': '1.0.0',
         'name': 'yarn-parser',
         'schema:author': 'Andrew Nesbitt',
         'url':
             'https://github.com/librariesio/yarn-parser#readme',
         'processorRequirements': {'node': '7.5'},
         'license': 'AGPL-3.0',
         'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
         'schema:codeRepository':
             'git+https://github.com/librariesio/yarn-parser.git',
         'description':
             'Tiny web service for parsing yarn.lock files',
         }
 }]
 
 
 def fill_obj_storage(obj_storage):
     """Add some content in an object storage."""
     for (obj_id, content) in OBJ_STORAGE_DATA.items():
         obj_storage.add(content, obj_id=hash_to_bytes(obj_id))
 
 
 class MockObjStorage:
     """Mock an swh-objstorage objstorage with predefined contents.
 
     """
     data = {}
 
     def __init__(self):
         self.data = OBJ_STORAGE_DATA.copy()
 
     def __iter__(self):
         yield from self.data.keys()
 
     def __contains__(self, sha1):
         return self.data.get(sha1) is not None
 
     def get(self, sha1):
         raw_content = self.data.get(sha1)
         if raw_content is None:
             raise ObjNotFoundError(sha1)
         return raw_content
 
 
 class MockIndexerStorage():
     """Mock an swh-indexer storage.
 
     """
     added_data = []
     revision_metadata = {}
     tools = {}
 
     def indexer_configuration_add(self, tools):
         results = []
         for tool in tools:
             results.append(self._indexer_configuration_add_one(tool))
         return results
 
     def _indexer_configuration_add_one(self, tool):
         if tool['tool_name'] == 'swh-metadata-translator':
             tool2 = {
                 'id': 30,
                 'tool_name': 'swh-metadata-translator',
                 'tool_version': '0.0.1',
                 'tool_configuration': {
                     'type': 'local',
                     'context': 'NpmMapping'
                 },
             }
         elif tool['tool_name'] == 'swh-metadata-detector':
             tool2 = {
                 'id': 7,
                 'tool_name': 'swh-metadata-detector',
                 'tool_version': '0.0.1',
                 'tool_configuration': {
                     'type': 'local',
                     'context': 'NpmMapping'
                 },
             }
         elif tool['tool_name'] == 'origin-metadata':
             tool2 = {
                 'id': 8,
                 'tool_name': 'origin-metadata',
                 'tool_version': '0.0.1',
                 'tool_configuration': {},
             }
         else:
             assert False, 'Unknown tool {tool_name}'.format(**tool)
 
         self.tools[tool2['id']] = tool2
         return tool2
 
     def content_metadata_missing(self, sha1s):
         yield from []
 
     def content_metadata_add(self, metadata, conflict_update=None):
         self.added_data.append(
                 ('content_metadata', conflict_update, metadata))
 
     def revision_metadata_add(self, metadata, conflict_update=None):
         assert conflict_update
         self.added_data.append(
                 ('revision_metadata', conflict_update, metadata))
         for item in metadata:
             assert isinstance(item['id'], bytes)
             self.revision_metadata.setdefault(item['id'], []).append(item)
 
     def revision_metadata_get(self, ids):
         for id_ in ids:
             assert isinstance(id_, bytes)
             for item in self.revision_metadata.get(id_):
                 item = item.copy()
                 tool_id = item.pop('indexer_configuration_id')
                 if tool_id in self.tools:
                     item['tool'] = self.tools[tool_id].copy()
                 else:  # HACK: this needs to be removed altogether
                     item['tool'] = {
                         'id': tool_id,
                         'name': tool_id[0],
                         'version': tool_id[1],
                         'configuration': tool_id[2],
                     }
                 yield item
 
     def origin_intrinsic_metadata_add(self, metadata, conflict_update=None):
         """Record the call in `self.added_data` rather than storing data.

         The appended entry is the tuple
         ``('origin_intrinsic_metadata', conflict_update, metadata)``.
         """
         record = ('origin_intrinsic_metadata', conflict_update, metadata)
         self.added_data.append(record)
 
     def content_metadata_get(self, sha1s):
         """Return the canned `CONTENT_METADATA` fixture.

         Only the request ``[b'cde']`` is supported; any other value of
         `sha1s` trips the assertion.
         """
         supported_request = [b'cde']
         assert sha1s == supported_request
         return CONTENT_METADATA
 
 
 def fill_storage(storage):
     """Load the module-level fixtures into `storage`.

     Every entry of `ORIGINS` is added without its 'id' key. One visit is
     then created for the last origin added, and each entry of `SNAPSHOTS`
     is attached to that origin and visit. Finally `REVISIONS` and the
     `DIRECTORY_ID`/`DIRECTORY` fixture directory are added.
     """
     for fixture in ORIGINS:
         origin_data = fixture.copy()
         # The fixture id is dropped before handing the origin to storage.
         origin_data.pop('id')
         last_origin_id = storage.origin_add_one(origin_data)
 
     visit_date = datetime.datetime.now()
     visit = storage.origin_visit_add(last_origin_id, visit_date)
     for snap_id, snap_branches in SNAPSHOTS.items():
         snapshot = {
             'id': snap_id,
             'branches': snap_branches
         }
         storage.snapshot_add(last_origin_id, visit['visit'], snapshot)
 
     storage.revision_add(REVISIONS)
     fixture_directory = {
         'id': DIRECTORY_ID,
         'entries': DIRECTORY,
     }
     storage.directory_add([fixture_directory])
 
 
 class MockStorage():
     """Mock a real swh-storage storage to simplify reading indexers'
     outputs.
 
     All answers come from the module-level fixtures `ORIGINS`,
     `SNAPSHOTS`, `REVISIONS`, `DIRECTORY_ID` and `DIRECTORY`.
 
     """
     def origin_get(self, id_):
         """Return the first fixture origin matching every item of `id_`.
 
         `id_` is a dict; an origin matches when, for each supplied key,
         it holds the same value. The assertion fails if none matches.
         """
         for candidate in ORIGINS:
             mismatch = any(candidate[key] != value
                            for (key, value) in id_.items())
             if not mismatch:
                 # All supplied parts of the id have the expected value.
                 return candidate
         assert False, id_
 
     def snapshot_get_latest(self, origin_id):
         """Return `SNAPSHOTS[origin_id]`; unknown ids fail the assertion."""
         if origin_id not in SNAPSHOTS:
             assert False, origin_id
         else:
             return SNAPSHOTS[origin_id]
 
     def revision_get(self, revisions):
         """Return `REVISIONS.copy()`; the `revisions` argument is ignored."""
         return REVISIONS.copy()
 
     def directory_ls(self, directory, recursive=False, cur=None):
         """Return `DIRECTORY`; only `DIRECTORY_ID` may be listed."""
         assert directory == DIRECTORY_ID
         return DIRECTORY
 
 
 class BasicMockStorage():
     """In memory implementation to fake the content_get_range api.
 
     FIXME: To remove when the actual in-memory lands.
 
     """
     contents = []
 
     def __init__(self, contents):
         self.contents = contents
 
     def content_get_range(self, start, end, limit=1000):
         """Return the stored contents whose sha1 lies in [start, end].
 
         Args:
             start: lower bound of the range (inclusive); bytes are
                 converted to their hex form before comparing
             end: upper bound of the range (inclusive); same conversion
             limit (int): maximum number of contents to return
 
         Returns:
             dict: with keys:
 
               - **contents**: matching contents, in storage order, at
                 most `limit` of them
               - **next**: always None, this mock does not paginate
 
         """
         # The test data uses hex ids while the runtime passes bytes, so
         # bytes bounds are converted to hex; the proper fix would be to
         # rewrite all tests (that's another task entirely so not right
         # now).
         if isinstance(start, bytes):
             start = hashutil.hash_to_hex(start)
         if isinstance(end, bytes):
             end = hashutil.hash_to_hex(end)
 
         results = []
         for content in self.contents:
             # Enforce the limit on the number of results *before* adding
             # one more, so that no more than `limit` are ever returned.
             if len(results) >= limit:
                 break
             if start <= content['sha1'] <= end:
                 results.append(content)
 
         return {
             'contents': results,
             'next': None
         }
 
 
 class BasicMockIndexerStorage():
     """Mock Indexer storage to simplify reading indexers' outputs.
 
     """
     state = []
 
     def _internal_add(self, data, conflict_update=None):
         """All content indexer have the same structure. So reuse `data` as the
            same data.  It's either mimetype, language,
            fossology_license, etc...
 
         The previous state is replaced by `data`, and `conflict_update`
         is kept on the instance.
 
         """
         self.state = data
         self.conflict_update = conflict_update
 
     def content_mimetype_add(self, data, conflict_update=None):
         self._internal_add(data, conflict_update=conflict_update)
 
     def content_fossology_license_add(self, data, conflict_update=None):
         self._internal_add(data, conflict_update=conflict_update)
 
     def content_language_add(self, data, conflict_update=None):
         self._internal_add(data, conflict_update=conflict_update)
 
     def content_ctags_add(self, data, conflict_update=None):
         self._internal_add(data, conflict_update=conflict_update)
 
     def _internal_get_range(self, start, end,
                             indexer_configuration_id, limit=1000):
         """Same logic as _internal_add, we retrieve indexed data given an
            identifier. So the code here does not change even though
            the underlying data does.
 
         Args:
             start: lower bound of the id range (inclusive); bytes are
                 converted to their hex form before comparing
             end: upper bound of the id range (inclusive); same conversion
             indexer_configuration_id: only data indexed with this tool id
                 is returned
             limit (int): maximum number of ids to return
 
         Returns:
             dict: with keys:
 
               - **ids**: matching ids, in state order, at most `limit`
               - **next**: always None, this mock does not paginate
 
         """
         # The test data uses hex ids while the runtime passes bytes, so
         # bytes bounds are converted to hex; the proper fix would be to
         # rewrite all tests (that's another task entirely so not right
         # now).
         if isinstance(start, bytes):
             start = hashutil.hash_to_hex(start)
         if isinstance(end, bytes):
             end = hashutil.hash_to_hex(end)
 
         results = []
         for entry in self.state:
             # Enforce the limit on the number of results *before* adding
             # one more, so that no more than `limit` are ever returned.
             if len(results) >= limit:
                 break
             entry_id = entry['id']
             tool_id = entry['indexer_configuration_id']
             if (start <= entry_id <= end and
                     tool_id == indexer_configuration_id):
                 results.append(entry_id)
 
         return {
             'ids': results,
             'next': None
         }
 
     def content_mimetype_get_range(
             self, start, end, indexer_configuration_id, limit=1000):
         return self._internal_get_range(
             start, end, indexer_configuration_id, limit=limit)
 
     def content_fossology_license_get_range(
             self, start, end, indexer_configuration_id, limit=1000):
         return self._internal_get_range(
             start, end, indexer_configuration_id, limit=limit)
 
     def indexer_configuration_add(self, tools):
         """Return a single canned tool entry, whatever `tools` contains."""
         return [{
             'id': 10,
         }]
 
 
 class CommonIndexerNoTool:
     """Mixin that wrongly initializes a content indexer.
 
     After the regular `prepare` of the next class in the MRO has run,
     the indexer's `tools` attribute is reset to None.
     """
     def prepare(self):
         super().prepare()
         # Drop whatever the parent set up: the indexer ends with no tool.
         self.tools = None
 
 
 class CommonIndexerWithErrorsTest:
     """Test indexer configuration checks.
 
     The concrete test class sets `Indexer`, and `RangeIndexer` when a
     range variant exists, to the classes to instantiate.
 
     """
     Indexer = None
     RangeIndexer = None
 
     def test_wrong_unknown_configuration_tool(self):
         """Indexer with unknown configuration tool fails check"""
         expected_message = 'Tools None is unknown'
         with self.assertRaisesRegex(ValueError, expected_message):
             print('indexer: %s' % self.Indexer)
             self.Indexer()
 
     def test_wrong_unknown_configuration_tool_range(self):
         """Range Indexer with unknown configuration tool fails check"""
         if self.RangeIndexer is None:
             # No range variant configured: nothing to check.
             return
         expected_message = 'Tools None is unknown'
         with self.assertRaisesRegex(ValueError, expected_message):
             self.RangeIndexer()
 
 
 class CommonContentIndexerTest:
     # Test cases shared by content indexers. The methods below read
     # `indexer`, `id0`, `id1`, `id2` and `expected_results` from `self`.
-    def assert_results_ok(self, actual_results, expected_results=None):
+    def get_indexer_results(self, ids):
+        """Override this for indexers that don't have a mock storage."""
+        return self.indexer.idx_storage.state
+
+    def assert_results_ok(self, sha1s, expected_results=None):
+        actual_results = self.get_indexer_results(sha1s)
+
         if expected_results is None:
             expected_results = self.expected_results
 
         # Each result must equal the expectation stored under its id and
         # carry the id of this indexer's tool.
         for indexed_data in actual_results:
             _id = indexed_data['id']
             self.assertEqual(indexed_data, expected_results[_id])
             _tool_id = indexed_data['indexer_configuration_id']
             self.assertEqual(_tool_id, self.indexer.tool['id'])
 
     def test_index(self):
         """Known sha1 have their data indexed
 
         """
         sha1s = [self.id0, self.id1, self.id2]
 
         # when
         self.indexer.run(sha1s, policy_update='update-dups')
 
-        actual_results = self.indexer.idx_storage.state
-        self.assertTrue(self.indexer.idx_storage.conflict_update)
-        self.assert_results_ok(actual_results)
+        self.assert_results_ok(sha1s)
 
         # 2nd pass
         self.indexer.run(sha1s, policy_update='ignore-dups')
 
-        self.assertFalse(self.indexer.idx_storage.conflict_update)
-        self.assert_results_ok(actual_results)
+        self.assert_results_ok(sha1s)
 
     def test_index_one_unknown_sha1(self):
         """Unknown sha1 are not indexed"""
         sha1s = [self.id1,
                  '799a5ef812c53907562fe379d4b3851e69c7cb15',  # unknown
                  '800a5ef812c53907562fe379d4b3851e69c7cb15']  # unknown
 
         # when
         self.indexer.run(sha1s, policy_update='update-dups')
-        actual_results = self.indexer.idx_storage.state
 
         # then
         # Keep only the expectations whose key is one of the requested sha1s.
         expected_results = {
             k: v for k, v in self.expected_results.items() if k in sha1s
         }
 
-        self.assert_results_ok(actual_results, expected_results)
+        self.assert_results_ok(sha1s, expected_results)
 
 
 class CommonContentIndexerRangeTest:
     """Allows to factorize tests on range indexer.
 
     The methods below read `indexer`, `contents`, `id0`, `id2` and
     `expected_results` from `self`.
 
     """
     def assert_results_ok(self, start, end, actual_results,
                           expected_results=None):
         """Check each result against its expectation, range and tool id.
 
         `expected_results` defaults to `self.expected_results`.
         """
         expected = (self.expected_results if expected_results is None
                     else expected_results)
 
         for result in actual_results:
             result_id = result['id']
             self.assertEqual(result, expected[result_id])
             self.assertTrue(start <= result_id <= end)
             result_tool_id = result['indexer_configuration_id']
             self.assertEqual(result_tool_id, self.indexer.tool['id'])
 
     def test__index_contents(self):
         """Indexing contents without existing data results in indexed data
 
         """
         # hex ids delimiting the range
         start, end = self.contents[0], self.contents[2]
 
         # given
         results = list(
             self.indexer._index_contents(start, end, indexed={}))
 
         self.assert_results_ok(start, end, results)
 
     def test__index_contents_with_indexed_data(self):
         """Indexing contents with existing data results in less indexed data
 
         """
         # hex ids delimiting the range
         start, end = self.contents[0], self.contents[2]
         already_indexed = [self.id0, self.id2]
 
         # given
         results = self.indexer._index_contents(
             start, end, indexed=set(already_indexed))
 
         # expectations, minus the ids declared as already indexed
         remaining = self.expected_results.copy()
         for indexed_id in already_indexed:
             remaining.pop(indexed_id)
 
         self.assert_results_ok(start, end, results, remaining)
 
     def test_generate_content_get(self):
         """Optimal indexing should result in indexed data
 
         """
         # hex ids delimiting the range
         start, end = self.contents[0], self.contents[2]
 
         # given
         outcome = self.indexer.run(start, end)
 
         # then
         self.assertTrue(outcome)
 
     def test_generate_content_get_input_as_bytes(self):
         """Optimal indexing should result in indexed data
 
         Input are in bytes here.
 
         """
         # hex ids delimiting the range, converted to bytes below
         hex_start, hex_end = self.contents[0], self.contents[2]
         start = hashutil.hash_to_bytes(hex_start)
         end = hashutil.hash_to_bytes(hex_end)
 
         # given: bytes input this time; no already indexed data, so the
         # outcome is the same as in the hex test
         outcome = self.indexer.run(start, end, skip_existing=False)
 
         # then
         self.assertTrue(outcome)
 
     def test_generate_content_get_no_result(self):
         """No result indexed returns False"""
         start = '0000000000000000000000000000000000000000'
         end = '0000000000000000000000000000000000000001'
 
         # given
         outcome = self.indexer.run(start, end, incremental=False)
 
         # then
         self.assertFalse(outcome)
 
 
 class NoDiskIndexer:
     """Mixin to override the DiskIndexer behavior avoiding side-effects in
        tests.
 
     Both methods are no-ops.
 
     """
 
     def write_to_temp(self, filename, data):
         """Return `filename` unchanged; `data` is ignored."""
         return filename
 
     def cleanup(self, content_path):
         """Do nothing and return None."""
         return None