diff --git a/PKG-INFO b/PKG-INFO index 68bb5a5..7cff760 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.146 +Version: 0.0.147 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexing procedure: - receive a batch of ids - retrieve the associated data, depending on the object type - compute an index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tag information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate a file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves their translated_metadata from the content_metadata table in storage, or runs the content indexer to translate the files. 
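The indexing procedure described above boils down to a small fetch/compute/store loop. The sketch below is illustrative only; every name in it (compute_index, index_batch, the storage objects and their methods) is a hypothetical stand-in, not the actual swh.indexer API, which lives in swh/indexer/indexer.py:

```python
# Minimal sketch of the generic indexing procedure, assuming simplified,
# hypothetical storage interfaces (objstorage.get and idx_storage.add).

def compute_index(data):
    """Toy index: record the content length, standing in for mimetype,
    ctags, or license detection."""
    return {'length': len(data)}


def index_batch(ids, objstorage, idx_storage, tool_id):
    """Receive a batch of ids, retrieve the associated data, compute an
    index for each object, then store the results."""
    results = []
    for content_id in ids:
        data = objstorage.get(content_id)  # retrieve the associated data
        result = compute_index(data)       # compute some index for that object
        if result is not None:
            results.append({
                'id': content_id,
                **result,
                'indexer_configuration_id': tool_id,
            })
    idx_storage.add(results)               # store the result in swh's storage
    return results
```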
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements-swh.txt b/requirements-swh.txt index be61102..394e977 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,6 +1,6 @@ -swh.core >= 0.0.53 +swh.core[db,http] >= 0.0.61 swh.model >= 0.0.15 swh.objstorage >= 0.0.28 swh.scheduler >= 0.0.47 swh.storage >= 0.0.123 swh.journal >= 0.0.6 diff --git a/requirements.txt b/requirements.txt index a578b91..84e7278 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ vcversioner -pygments click -chardet file-magic pyld xmltodict diff --git a/setup.py b/setup.py index d2b2a85..299b040 100755 --- a/setup.py +++ b/setup.py @@ -1,69 +1,71 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = 'requirements-%s.txt' % name else: reqf = 'requirements.txt' requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.indexer', description='Software Heritage Content Indexer', long_description=long_description, long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/78/', packages=find_packages(), scripts=[], install_requires=parse_requirements() + parse_requirements('swh'), setup_requires=['vcversioner'], extras_require={'testing': parse_requirements('test')}, vcversioner={}, include_package_data=True, entry_points=''' [console_scripts] swh-indexer=swh.indexer.cli:main + [swh.cli.subcommands] + indexer=swh.indexer.cli:cli ''', classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', 'Funding': 'https://www.softwareheritage.org/donate', 'Source': 'https://forge.softwareheritage.org/source/swh-indexer', }, ) diff --git a/sql/upgrades/124.sql b/sql/upgrades/124.sql new file mode 100644 index 0000000..a773c32 --- /dev/null +++ b/sql/upgrades/124.sql @@ -0,0 +1,9 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 123 +-- to_version: 124 +-- description: drop constraint that origin_intrinsic_metadata references an existing revision_intrinsic_metadata. 
+ +insert into dbversion(version, release, description) +values(124, now(), 'Work In Progress'); + +alter table origin_intrinsic_metadata drop constraint origin_intrinsic_metadata_revision_metadata_fkey; diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index 68bb5a5..7cff760 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.146 +Version: 0.0.147 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexing procedure: - receive a batch of ids - retrieve the associated data, depending on the object type - compute an index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tag information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate a file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves their translated_metadata from the content_metadata table in storage, or runs the content indexer to translate the files. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 80cb6e8..8595712 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,91 +1,90 @@ MANIFEST.in Makefile README.md requirements-swh.txt requirements.txt setup.py version.txt sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql +sql/upgrades/124.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py -swh/indexer/language.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/metadata_dictionary/__init__.py swh/indexer/metadata_dictionary/base.py swh/indexer/metadata_dictionary/codemeta.py swh/indexer/metadata_dictionary/maven.py swh/indexer/metadata_dictionary/npm.py swh/indexer/metadata_dictionary/python.py swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-swh-init.sql swh/indexer/sql/20-swh-enums.sql swh/indexer/sql/30-swh-schema.sql swh/indexer/sql/40-swh-func.sql swh/indexer/sql/50-swh-data.sql swh/indexer/sql/60-swh-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/in_memory.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py swh/indexer/storage/api/wsgi.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py swh/indexer/tests/test_cli.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py -swh/indexer/tests/test_language.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/entry_points.txt b/swh.indexer.egg-info/entry_points.txt index 43042e1..49be9af 
100644 --- a/swh.indexer.egg-info/entry_points.txt +++ b/swh.indexer.egg-info/entry_points.txt @@ -1,4 +1,6 @@ [console_scripts] swh-indexer=swh.indexer.cli:main + [swh.cli.subcommands] + indexer=swh.indexer.cli:cli \ No newline at end of file diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index cc485e1..7fc2845 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,18 +1,16 @@ vcversioner -pygments click -chardet file-magic pyld xmltodict -swh.core>=0.0.53 +swh.core[db,http]>=0.0.61 swh.model>=0.0.15 swh.objstorage>=0.0.28 swh.scheduler>=0.0.47 swh.storage>=0.0.123 swh.journal>=0.0.6 [testing] pytest<4 pytest-postgresql hypothesis>=3.11.0 diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index c5244be..37e97f0 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -1,177 +1,188 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click from swh.core import config +from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.scheduler import get_scheduler from swh.scheduler.cli_utils import schedule_origin_batches from swh.storage import get_storage from swh.indexer import metadata_dictionary from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.api.server import load_and_check_config, app -CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) - - -@click.group(context_settings=CONTEXT_SETTINGS) +@click.group(name='indexer', context_settings=CONTEXT_SETTINGS, + cls=AliasedGroup) @click.option('--config-file', '-C', default=None, type=click.Path(exists=True, dir_okay=False,), help="Configuration file.") @click.pass_context def cli(ctx, config_file): - """Software Heritage Indexer CLI interface + """Software Heritage Indexer tools. + + The Indexer is used to mine the content of the archive and extract derived + information from archive source code artifacts. 
+ """ ctx.ensure_object(dict) conf = config.read(config_file) ctx.obj['config'] = conf def _get_api(getter, config, config_key, url): if url: config[config_key] = { 'cls': 'remote', 'args': {'url': url} } elif config_key not in config: raise click.ClickException( 'Missing configuration for {}'.format(config_key)) return getter(**config[config_key]) @cli.group('mapping') def mapping(): + '''Manage Software Heritage Indexer mappings.''' pass @mapping.command('list') def mapping_list(): """Prints the list of known mappings.""" mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) @mapping.command('list-terms') @click.option('--exclude-mapping', multiple=True, help='Exclude the given mapping from the output') @click.option('--concise', is_flag=True, default=False, help='Don\'t print the list of mappings supporting each term.') def mapping_list_terms(concise, exclude_mapping): """Prints the list of known CodeMeta terms, and which mappings support them.""" properties = metadata_dictionary.list_terms() for (property_name, supported_mappings) in sorted(properties.items()): supported_mappings = {m.name for m in supported_mappings} supported_mappings -= set(exclude_mapping) if supported_mappings: if concise: click.echo(property_name) else: click.echo('{}:'.format(property_name)) click.echo('\t' + ', '.join(sorted(supported_mappings))) @cli.group('schedule') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") @click.option('--indexer-storage-url', '-i', default=None, help="URL of the indexer storage API") @click.option('--storage-url', '-g', default=None, help="URL of the (graph) storage API") @click.option('--dry-run/--no-dry-run', is_flag=True, default=False, help='List only what would be scheduled.') @click.pass_context def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run): - """Manipulate indexer tasks via SWH Scheduler's API.""" + """Manipulate Software Heritage Indexer tasks. + + Via SWH Scheduler's API.""" ctx.obj['indexer_storage'] = _get_api( get_indexer_storage, ctx.obj['config'], 'indexer_storage', indexer_storage_url ) ctx.obj['storage'] = _get_api( get_storage, ctx.obj['config'], 'storage', storage_url ) ctx.obj['scheduler'] = _get_api( get_scheduler, ctx.obj['config'], 'scheduler', scheduler_url ) if dry_run: ctx.obj['scheduler'] = None def list_origins_by_producer(idx_storage, mappings, tool_ids): start = 0 limit = 10000 while True: origins = list( idx_storage.origin_intrinsic_metadata_search_by_producer( start=start, limit=limit, ids_only=True, mappings=mappings or None, tool_ids=tool_ids or None)) if not origins: break start = origins[-1]+1 yield from origins @schedule.command('reindex_origin_metadata') @click.option('--batch-size', '-b', 'origin_batch_size', default=10, show_default=True, type=int, help="Number of origins per task") @click.option('--tool-id', '-t', 'tool_ids', type=int, multiple=True, help="Restrict search of old metadata to this/these tool ids.") @click.option('--mapping', '-m', 'mappings', multiple=True, help="Mapping(s) that should be re-scheduled (eg. 
'npm', " "'gemspec', 'maven')") @click.option('--task-type', - default='indexer_origin_metadata', show_default=True, + default='index-origin-metadata', show_default=True, help="Name of the task type to schedule.") @click.pass_context def schedule_origin_metadata_reindex( ctx, origin_batch_size, tool_ids, mappings, task_type): """Schedules indexing tasks for origins that were already indexed.""" idx_storage = ctx.obj['indexer_storage'] scheduler = ctx.obj['scheduler'] origins = list_origins_by_producer(idx_storage, mappings, tool_ids) kwargs = {"policy_update": "update-dups", "parse_ids": False} schedule_origin_batches( scheduler, task_type, origins, origin_batch_size, kwargs) -@cli.command('api-server') +@cli.command('rpc-serve') @click.argument('config-path', required=1) @click.option('--host', default='0.0.0.0', help="Host to run the server") @click.option('--port', default=5007, type=click.INT, help="Binding port of the server") @click.option('--debug/--nodebug', default=True, help="Indicates if the server should run in debug mode") -def api_server(config_path, host, port, debug): +def rpc_server(config_path, host, port, debug): + """Starts a Software Heritage Indexer RPC HTTP server.""" api_cfg = load_and_check_config(config_path, type='any') app.config.update(api_cfg) app.run(host, port=int(port), debug=bool(debug)) +cli.add_alias(rpc_server, 'api-server') +cli.add_alias(rpc_server, 'serve') + + def main(): return cli(auto_envvar_prefix='SWH_INDEXER') if __name__ == '__main__': main() diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py index dbf7e15..b29e4c7 100644 --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -1,146 +1,151 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess import json from swh.model import hashutil -from .language import compute_language from .indexer import ContentIndexer, write_to_temp # Options used to compute tags __FLAGS = [ '--fields=+lnz', # +l: language # +n: line number of tag definition # +z: include the symbol's kind (function, variable, ...) '--sort=no', # sort output on tag name '--links=no', # do not follow symlinks '--output-format=json', # outputs in json ] +def compute_language(content, log=None): + raise NotImplementedError( + 'Language detection was unreliable, so it is currently disabled. ' + 'See https://forge.softwareheritage.org/D1455') + + def run_ctags(path, lang=None, ctags_command='ctags'): """Run ctags on file path with optional language. 
Args: path: path to the file lang: language for that path (optional) Yields: dict: ctags' output """ optional = [] if lang: optional = ['--language-force=%s' % lang] cmd = [ctags_command] + __FLAGS + optional + [path] output = subprocess.check_output(cmd, universal_newlines=True) for symbol in output.split('\n'): if not symbol: continue js_symbol = json.loads(symbol) yield { 'name': js_symbol['name'], 'kind': js_symbol['kind'], 'line': js_symbol['line'], 'lang': js_symbol['language'], } class CtagsIndexer(ContentIndexer): CONFIG_BASE_FILENAME = 'indexer/ctags' ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.ctags'), 'tools': ('dict', { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no --links=no ''' '''--output-format=json ''' }, }), 'languages': ('dict', { 'ada': 'Ada', 'adl': None, 'agda': None, # ... }) } def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_ctags_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: a dict representing a content_ctags with keys: - **id** (bytes): content's identifier (sha1) - **ctags** ([dict]): ctags list of symbols """ lang = compute_language(data, log=self.log)['lang'] if not lang: return None ctags_lang = self.language_map.get(lang) if not ctags_lang: return None ctags = { 'id': id, } filename = hashutil.hash_to_hex(id) with write_to_temp( filename=filename, data=data, working_directory=self.working_directory) as content_path: result = run_ctags(content_path, lang=ctags_lang) ctags.update({ 'ctags': list(result), 'indexer_configuration_id': self.tool['id'], }) return ctags def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_ctags, dict with the following keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_ctags_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py index c8f27ea..ac236cc 100644 --- a/swh/indexer/journal_client.py +++ b/swh/indexer/journal_client.py @@ -1,88 +1,88 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.journal.client import JournalClient from swh.scheduler import get_scheduler from swh.scheduler.utils import create_task_dict class IndexerJournalClient(JournalClient): """Client in charge of listing newly received origins and origin_visits in the swh journal. 
""" CONFIG_BASE_FILENAME = 'indexer/journal_client' ADDITIONAL_CONFIG = { 'scheduler': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5008/', } }), 'origin_visit_tasks': ('List[dict]', [ { - 'type': 'indexer_origin_metadata', + 'type': 'index-origin-metadata', 'kwargs': { 'policy_update': 'update-dups', 'parse_ids': False, } } ]), } def __init__(self): super().__init__(extra_configuration={ 'object_types': ['origin_visit'], }) self.scheduler = get_scheduler(**self.config['scheduler']) logging.info( 'Starting indexer journal client with config %r', self.config) def process_objects(self, messages): assert set(messages) == {'origin_visit'}, set(messages) for origin_visit in messages['origin_visit']: self.process_origin_visit(origin_visit) def process_origin_visit(self, origin_visit): task_dicts = [] logging.debug('processing origin visit %r', origin_visit) if origin_visit[b'status'] == b'full': for task_config in self.config['origin_visit_tasks']: logging.info( 'Scheduling %s for visit of origin %d', task_config['type'], origin_visit[b'origin']) task_dicts.append(create_task_dict( task_config['type'], 'oneshot', [origin_visit[b'origin']], **task_config['kwargs'], )) else: logging.debug('status is not "full", ignoring.') if task_dicts: self.scheduler.create_tasks(task_dicts) if __name__ == '__main__': logging.basicConfig( level=logging.INFO, format='%(asctime)s %(process)d %(levelname)s %(message)s' ) import click @click.command() def main(): """Log the new received origin and origin_visits. """ IndexerJournalClient().process() main() diff --git a/swh/indexer/language.py b/swh/indexer/language.py deleted file mode 100644 index c69b1dc..0000000 --- a/swh/indexer/language.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (C) 2016-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -import io - -from pygments.lexers import guess_lexer -from pygments.util import ClassNotFound -from chardet.universaldetector import UniversalDetector - -from .indexer import ContentIndexer - - -def _cleanup_classname(classname): - """Determine the language from the pygments' lexer names. - - """ - return classname.lower().replace(' ', '-') - - -def _read_raw(raw_content, size=2048): - """Read raw content in chunk. - - """ - bs = io.BytesIO(raw_content) - while True: - chunk = bs.read(size) - if not chunk: - break - yield chunk - - -def _detect_encoding(raw_content): - """Given a raw content, try and detect its encoding. - - """ - detector = UniversalDetector() - for chunk in _read_raw(raw_content): - detector.feed(chunk) - if detector.done: - break - detector.close() - return detector.result['encoding'] - - -def compute_language_from_chunk(encoding, length, raw_content, max_size, - log=None): - """Determine the raw content's language. 
- - Args: - encoding (str): Encoding to use to decode the content - length (int): raw_content's length - raw_content (bytes): raw content to work with - max_size (int): max size to split the raw content at - - Returns: - dict: Dict with keys: - - **lang**: None if nothing found or the possible language - - """ - try: - if max_size <= length: - raw_content = raw_content[0:max_size] - - content = raw_content.decode(encoding) - lang = _cleanup_classname( - guess_lexer(content).name) - except ClassNotFound: - lang = None - except UnicodeDecodeError: - raise - except Exception: - if log: - log.exception('Problem during language detection, skipping') - lang = None - return { - 'lang': lang - } - - -def compute_language(raw_content, encoding=None, log=None): - """Determine the raw content's language. - - Args: - raw_content (bytes): raw content to work with - - Returns: - dict: Dict with keys: - - **lang**: None if nothing found or the possible language - - """ - try: - encoding = _detect_encoding(raw_content) - content = raw_content.decode(encoding) - lang = _cleanup_classname( - guess_lexer(content).name) - except ClassNotFound: - lang = None - except Exception: - if log: - log.exception('Problem during language detection, skipping') - lang = None - return { - 'lang': lang - } - - -class LanguageIndexer(ContentIndexer): - """Indexer in charge of: - - - filtering out content already indexed - - reading content from objstorage per the content's id (sha1) - - computing {mimetype, encoding} from that content - - store result in storage - - """ - CONFIG_BASE_FILENAME = 'indexer/language' - - ADDITIONAL_CONFIG = { - 'tools': ('dict', { - 'name': 'pygments', - 'version': '2.0.1+dfsg-1.1+deb8u1', - 'configuration': { - 'type': 'library', - 'debian-package': 'python3-pygments', - 'max_content_size': 10240, - }, - }), - } - - @property - def max_content_size(self): - return self.tool['tool_configuration']['max_content_size'] - - def filter(self, ids): - """Filter out known sha1s and return only missing ones. - - """ - yield from self.idx_storage.content_language_missing(( - { - 'id': sha1, - 'indexer_configuration_id': self.tool['id'] - } for sha1 in ids - )) - - def index(self, id, data): - """Index sha1s' content and store result. - - Args: - id (bytes): content's identifier - data (bytes): raw content in bytes - - Returns: - dict: Dict that represents a content_mimetype, with keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language - - """ - result = { - 'id': id, - 'indexer_configuration_id': self.tool['id'], - 'lang': None, - } - - encoding = _detect_encoding(data) - - if not encoding: - return result - - _len = len(data) - for i in range(0, 9): - max_size = self.max_content_size + i - - try: - result = compute_language_from_chunk( - encoding, _len, data, max_size, log=self.log) - except UnicodeDecodeError: - self.log.warning( - 'Decoding failed on wrong byte chunk at [0-%s]' - ', trying again at next ending byte.' % max_size) - continue - - # we found something, so we return it - result.update({ - 'id': id, - 'indexer_configuration_id': self.tool['id'], - }) - break - - return result - - def persist_index_computations(self, results, policy_update): - """Persist the results in storage. 
- - Args: - results ([dict]): list of content_mimetype, dict with the - following keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language - policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them - - """ - self.idx_storage.content_language_add( - results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index e9fe41e..c3a7e5e 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,335 +1,336 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data, log_suffix='unknown revision'): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'metadata': None } try: mapping_name = self.tool['tool_configuration']['context'] log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id) result['metadata'] = \ MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id)) if result['metadata'] is None: return None return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_intrinsic_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { }, }), } def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_intrinsic_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (dict): revision artifact from storage Returns: dict: dictionary representing a revision_intrinsic_metadata, with keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'mappings': None, 'metadata': None } try: root_dir = rev['directory'] dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) if [entry['type'] for entry in dir_ls] == ['dir']: # If the root is just a single directory, recurse into it # eg. PyPI packages, GNU tarballs subdir = dir_ls[0]['target'] dir_ls = self.storage.directory_ls(subdir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) (mappings, metadata) = self.translate_revision_intrinsic_metadata( detected_files, log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id'])) result['mappings'] = mappings result['metadata'] = metadata except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of revision_intrinsic_metadata, dict with the following keys: - id (bytes): revision's identifier (sha1_git) - mappings ([str]): mappings used to translate the metadata - metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in # revision_intrinsic_metadata self.idx_storage.revision_intrinsic_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_intrinsic_metadata( self, detected_files, log_suffix): """ Determine the plan of action for translating metadata when one or multiple metadata files are detected: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to lists of sha1s Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ used_mappings = [MAPPINGS[context].name for context in detected_files] metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] } config['tools'] = [tool] for context in detected_files.keys(): cfg = deepcopy(config) cfg['tools'][0]['configuration']['context'] = context c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: # extracting metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['metadata'] # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # content indexing try: c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups', log_suffix=log_suffix) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result['metadata'] metadata.append(local_metadata) except Exception: self.log.exception( "Exception while indexing metadata on contents") # transform metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(metadata) return (used_mappings, min_metadata) class OriginMetadataIndexer(OriginIndexer): ADDITIONAL_CONFIG = RevisionMetadataIndexer.ADDITIONAL_CONFIG USE_TOOLS = False def __init__(self, config=None, **kwargs): super().__init__(config=config, **kwargs) self.origin_head_indexer = OriginHeadIndexer(config=config) self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) def index_list(self, origins): head_rev_ids = [] origins_with_head = [] for origin in origins: head_result = self.origin_head_indexer.index(origin) if head_result: origins_with_head.append(origin) head_rev_ids.append(head_result['revision_id']) head_revs = list(self.storage.revision_get(head_rev_ids)) assert len(head_revs) == len(head_rev_ids) results = [] for (origin, rev) in zip(origins_with_head, head_revs): if not rev: self.log.warning('Missing head revision of origin %r', origin) continue rev_metadata = self.revision_metadata_indexer.index(rev) orig_metadata = { 'from_revision': rev_metadata['id'], 'id': origin['id'], 'metadata': rev_metadata['metadata'], 'mappings': rev_metadata['mappings'], 'indexer_configuration_id': rev_metadata['indexer_configuration_id'], } 
results.append((orig_metadata, rev_metadata)) return results def persist_index_computations(self, results, policy_update): conflict_update = (policy_update == 'update-dups') # Deduplicate revisions rev_metadata = [] orig_metadata = [] revs_to_delete = [] origs_to_delete = [] for (orig_item, rev_item) in results: - assert rev_item['mappings'] == orig_item['mappings'] - if rev_item['mappings']: - # Only store translated metadata if we found a metadata file. - # Otherwise it's just an empty dict with a "@context" key. - if rev_item not in rev_metadata: - rev_metadata.append(rev_item) - if orig_item not in orig_metadata: - orig_metadata.append(orig_item) - else: + assert rev_item['metadata'] == orig_item['metadata'] + if not rev_item['metadata'] or \ + rev_item['metadata'].keys() <= {'@context'}: + # If we didn't find any metadata, don't store a DB record + # (and delete existing ones, if any) if rev_item not in revs_to_delete: revs_to_delete.append(rev_item) if orig_item not in origs_to_delete: origs_to_delete.append(orig_item) + else: + if rev_item not in rev_metadata: + rev_metadata.append(rev_item) + if orig_item not in orig_metadata: + orig_metadata.append(orig_item) if rev_metadata: self.idx_storage.revision_intrinsic_metadata_add( rev_metadata, conflict_update=conflict_update) if orig_metadata: self.idx_storage.origin_intrinsic_metadata_add( orig_metadata, conflict_update=conflict_update) # revs_to_delete should always be empty unless we changed a mapping - # to detect less files. + # to detect fewer files or less content. # However, origs_to_delete may be non-empty whenever an upstream deletes # a metadata file. if origs_to_delete: self.idx_storage.origin_intrinsic_metadata_delete(origs_to_delete) if revs_to_delete: self.idx_storage.revision_intrinsic_metadata_delete(revs_to_delete) diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql index 9fcc66c..2bd0bd5 100644 --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -1,145 +1,145 @@ --- --- Software Heritage Indexers Data Model --- -- drop schema if exists swh cascade; -- create schema swh; -- set search_path to swh; create table dbversion ( version int primary key, release timestamptz, description text ); insert into dbversion(version, release, description) - values(122, now(), 'Work In Progress'); + values(124, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); create table indexer_configuration ( id serial not null, tool_name text not null, tool_version text not null, tool_configuration jsonb ); comment on table indexer_configuration is 'Indexer''s configuration version'; comment on column indexer_configuration.id is 'Tool identifier'; comment on column indexer_configuration.tool_name is 'Tool name'; comment on column indexer_configuration.tool_version is 'Tool version'; comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; -- Properties (mimetype, encoding, etc...) 
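The rewritten branch in the persist_index_computations hunk of swh/indexer/metadata.py above relies on Python's set-like dict key views: `d.keys() <= {'@context'}` holds when the dict contains nothing besides the `@context` key that the CodeMeta translation always injects. A small self-contained illustration, using hypothetical data rather than real indexer output:

```python
# A metadata dict whose keys are a subset of {'@context'} carries no actual
# metadata, so the indexer now deletes such records instead of storing them.
def is_empty_metadata(metadata):
    return not metadata or metadata.keys() <= {'@context'}

assert is_empty_metadata(None)
assert is_empty_metadata({})
assert is_empty_metadata(
    {'@context': 'https://doi.org/10.5063/schema/codemeta-2.0'})
assert not is_empty_metadata({
    '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
    'name': 'example-package',  # hypothetical metadata field
})
```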
create table content_mimetype ( id sha1 not null, mimetype text not null, encoding text not null, indexer_configuration_id bigint not null ); comment on table content_mimetype is 'Metadata associated to a raw content'; comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; -- Language metadata create table content_language ( id sha1 not null, lang languages not null, indexer_configuration_id bigint not null ); comment on table content_language is 'Language information on a raw content'; comment on column content_language.lang is 'Language information'; comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; -- ctags information per content create table content_ctags ( id sha1 not null, name text not null, kind text not null, line bigint not null, lang ctags_languages not null, indexer_configuration_id bigint not null ); comment on table content_ctags is 'Ctags information on a raw content'; comment on column content_ctags.id is 'Content identifier'; comment on column content_ctags.name is 'Symbol name'; comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; comment on column content_ctags.line is 'Symbol line'; comment on column content_ctags.lang is 'Language information for that content'; comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; create table fossology_license( id smallserial, name text not null ); comment on table fossology_license is 'Possible license recognized by license indexer'; comment on column fossology_license.id is 'License identifier'; comment on column fossology_license.name is 'License name'; create table content_fossology_license ( id sha1 not null, license_id smallserial not null, indexer_configuration_id bigint not null ); comment on table content_fossology_license is 'license associated to a raw content'; comment on column content_fossology_license.id is 'Raw content identifier'; comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; -- The table content_metadata provides a translation to files -- identified as potentially containing metadata with a translation tool (indexer_configuration_id) create table content_metadata( id sha1 not null, metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table content_metadata is 'metadata semantically translated from a content file'; comment on column content_metadata.id is 'sha1 of content file'; comment on column content_metadata.metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; -- The table revision_intrinsic_metadata provides a minimal set of intrinsic -- metadata detected with the detection tool (indexer_configuration_id) and -- aggregated from the content_metadata translation. 
create table revision_intrinsic_metadata( id sha1_git not null, metadata jsonb not null, indexer_configuration_id bigint not null, mappings text array not null ); comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision'; comment on column revision_intrinsic_metadata.id is 'sha1_git of revision'; comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; create table origin_intrinsic_metadata( id bigserial not null, metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null, metadata_tsvector tsvector, mappings text array not null ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; comment on column origin_intrinsic_metadata.id is 'the entry id in origin'; comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql index 80ffe8e..c12622c 100644 --- a/swh/indexer/sql/60-swh-indexes.sql +++ b/swh/indexer/sql/60-swh-indexes.sql @@ -1,69 +1,67 @@ -- fossology_license create unique index fossology_license_pkey on fossology_license(id); alter table fossology_license add primary key using index fossology_license_pkey; create unique index on fossology_license(name); -- indexer_configuration create unique index concurrently indexer_configuration_pkey on indexer_configuration(id); alter table indexer_configuration add primary key using index indexer_configuration_pkey; create unique index on indexer_configuration(tool_name, tool_version, tool_configuration); -- content_ctags create index on content_ctags(id); create index on content_ctags(hash_sha1(name)); create unique index on content_ctags(id, hash_sha1(name), kind, line, lang, indexer_configuration_id); alter table content_ctags add constraint content_ctags_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_ctags validate constraint content_ctags_indexer_configuration_id_fkey; -- content_metadata create unique index content_metadata_pkey on content_metadata(id, indexer_configuration_id); alter table content_metadata add primary key using index content_metadata_pkey; alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; -- revision_intrinsic_metadata create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id); alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey; alter table revision_intrinsic_metadata add constraint 
revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey; -- content_mimetype create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); alter table content_mimetype add primary key using index content_mimetype_pkey; alter table content_mimetype add constraint content_mimetype_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_mimetype validate constraint content_mimetype_indexer_configuration_id_fkey; -- content_language create unique index content_language_pkey on content_language(id, indexer_configuration_id); alter table content_language add primary key using index content_language_pkey; alter table content_language add constraint content_language_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_language validate constraint content_language_indexer_configuration_id_fkey; -- content_fossology_license create unique index content_fossology_license_pkey on content_fossology_license(id, license_id, indexer_configuration_id); alter table content_fossology_license add primary key using index content_fossology_license_pkey; alter table content_fossology_license add constraint content_fossology_license_license_id_fkey foreign key (license_id) references fossology_license(id) not valid; alter table content_fossology_license validate constraint content_fossology_license_license_id_fkey; alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey; -- origin_intrinsic_metadata create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(id, indexer_configuration_id); alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; -alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_intrinsic_metadata(id, indexer_configuration_id) not valid; -alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings); diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py index 97f921c..dc47146 100644 --- a/swh/indexer/tasks.py +++ b/swh/indexer/tasks.py @@ -1,64 +1,57 @@ # Copyright (C) 2016-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from 
celery import current_app as app from .mimetype import MimetypeIndexer, MimetypeRangeIndexer -from .language import LanguageIndexer from .ctags import CtagsIndexer from .fossology_license import ( FossologyLicenseIndexer, FossologyLicenseRangeIndexer ) from .rehash import RecomputeChecksums from .metadata import OriginMetadataIndexer @app.task(name=__name__ + '.OriginMetadata') def origin_metadata(*args, **kwargs): results = OriginMetadataIndexer().run(*args, **kwargs) return getattr(results, 'results', results) -@app.task(name=__name__ + '.ContentLanguage') -def content_language(*args, **kwargs): - results = LanguageIndexer().run(*args, **kwargs) - return getattr(results, 'results', results) - - @app.task(name=__name__ + '.Ctags') def ctags(*args, **kwargs): results = CtagsIndexer().run(*args, **kwargs) return getattr(results, 'results', results) @app.task(name=__name__ + '.ContentFossologyLicense') def fossology_license(*args, **kwargs): results = FossologyLicenseIndexer().run(*args, **kwargs) return getattr(results, 'results', results) @app.task(name=__name__ + '.RecomputeChecksums') def recompute_checksums(*args, **kwargs): results = RecomputeChecksums().run(*args, **kwargs) return getattr(results, 'results', results) @app.task(name=__name__ + '.ContentMimetype') def mimetype(*args, **kwargs): results = MimetypeIndexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} @app.task(name=__name__ + '.ContentRangeMimetype') def range_mimetype(*args, **kwargs): results = MimetypeRangeIndexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} @app.task(name=__name__ + '.ContentRangeFossologyLicense') def range_license(*args, **kwargs): results = FossologyLicenseRangeIndexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} diff --git a/swh/indexer/tests/storage/test_api_client.py b/swh/indexer/tests/storage/test_api_client.py index 4b51858..a139099 100644 --- a/swh/indexer/tests/storage/test_api_client.py +++ b/swh/indexer/tests/storage/test_api_client.py @@ -1,38 +1,38 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest -from swh.core.tests.server_testing import ServerTestFixture +from swh.core.api.tests.server_testing import ServerTestFixture from swh.indexer.storage import INDEXER_CFG_KEY from swh.indexer.storage.api.client import RemoteStorage from swh.indexer.storage.api.server import app from .test_storage import CommonTestStorage, BasePgTestStorage class TestRemoteStorage(CommonTestStorage, ServerTestFixture, BasePgTestStorage, unittest.TestCase): """Test the indexer's remote storage API. This class doesn't define any tests as we want identical functionality between local and remote storage. All the tests are therefore defined in `class`:swh.indexer.storage.test_storage.CommonTestStorage. 
""" def setUp(self): self.config = { INDEXER_CFG_KEY: { 'cls': 'local', 'args': { 'db': 'dbname=%s' % self.TEST_DB_NAME, } } } self.app = app super().setUp() self.storage = RemoteStorage(self.url()) diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index f4f78eb..2843f38 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1,1964 +1,1964 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import threading import unittest import pytest from hypothesis import given from swh.model.hashutil import hash_to_bytes from swh.indexer.storage import get_indexer_storage, MAPPING_NAMES -from swh.core.tests.db_testing import SingleDbTestFixture +from swh.core.db.tests.db_testing import SingleDbTestFixture from swh.indexer.tests.storage.generate_data_test import ( gen_content_mimetypes, gen_content_fossology_licenses ) from swh.indexer.tests.storage import SQL_DIR from swh.indexer.metadata_dictionary import MAPPINGS TOOLS = [ { 'tool_name': 'universal-ctags', 'tool_version': '~git7859817b', 'tool_configuration': { "command_line": "ctags --fields=+lnz --sort=no --links=no " "--output-format=json "} }, { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "NpmMapping"}, }, { 'tool_name': 'swh-metadata-detector', 'tool_version': '0.0.1', 'tool_configuration': { "type": "local", "context": ["NpmMapping", "CodemetaMapping"]}, }, { 'tool_name': 'swh-metadata-detector2', 'tool_version': '0.0.1', 'tool_configuration': { "type": "local", "context": ["NpmMapping", "CodemetaMapping"]}, }, { 'tool_name': 'file', 'tool_version': '5.22', 'tool_configuration': {"command_line": "file --mime "}, }, { 'tool_name': 'pygments', 'tool_version': '2.0.1+dfsg-1.1+deb8u1', 'tool_configuration': { "type": "library", "debian-package": "python3-pygments"}, }, { 'tool_name': 'pygments', 'tool_version': '2.0.1+dfsg-1.1+deb8u1', 'tool_configuration': { "type": "library", "debian-package": "python3-pygments", "max_content_size": 10240 }, }, { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } ] @pytest.mark.db class BasePgTestStorage(SingleDbTestFixture): """Base test class for most indexer tests. It adds support for Storage testing to the SingleDbTestFixture class. It will also build the database from the swh-indexed/sql/*.sql files. 
""" TEST_DB_NAME = 'softwareheritage-test-indexer' TEST_DB_DUMP = os.path.join(SQL_DIR, '*.sql') def setUp(self): super().setUp() self.storage_config = { 'cls': 'local', 'args': { 'db': 'dbname=%s' % self.TEST_DB_NAME, }, } def tearDown(self): self.reset_storage_tables() self.storage = None super().tearDown() def reset_storage_tables(self): excluded = {'indexer_configuration'} self.reset_db_tables(self.TEST_DB_NAME, excluded=excluded) db = self.test_db[self.TEST_DB_NAME] db.conn.commit() def gen_generic_endpoint_tests(endpoint_type, tool_name, example_data1, example_data2): def rename(f): f.__name__ = 'test_' + endpoint_type + f.__name__ return f def endpoint(self, endpoint_name): return getattr(self.storage, endpoint_type + '_' + endpoint_name) @rename def missing(self): # given tool_id = self.tools[tool_name]['id'] query = [ { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }] # when actual_missing = endpoint(self, 'missing')(query) # then self.assertEqual(list(actual_missing), [ self.sha1_1, self.sha1_2, ]) # given endpoint(self, 'add')([{ 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, }]) # when actual_missing = endpoint(self, 'missing')(query) # then self.assertEqual(list(actual_missing), [self.sha1_1]) @rename def add__drop_duplicate(self): # given tool_id = self.tools[tool_name]['id'] data_v1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # given endpoint(self, 'add')([data_v1]) # when actual_data = list(endpoint(self, 'get')([self.sha1_2])) # then expected_data_v1 = [{ 'id': self.sha1_2, **example_data1, 'tool': self.tools[tool_name], }] self.assertEqual(actual_data, expected_data_v1) # given data_v2 = data_v1.copy() data_v2.update(example_data2) endpoint(self, 'add')([data_v2]) actual_data = list(endpoint(self, 'get')([self.sha1_2])) # data did not change as the v2 was dropped. self.assertEqual(actual_data, expected_data_v1) @rename def add__update_in_place_duplicate(self): # given tool_id = self.tools[tool_name]['id'] data_v1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # given endpoint(self, 'add')([data_v1]) # when actual_data = list(endpoint(self, 'get')([self.sha1_2])) expected_data_v1 = [{ 'id': self.sha1_2, **example_data1, 'tool': self.tools[tool_name], }] # then self.assertEqual(actual_data, expected_data_v1) # given data_v2 = data_v1.copy() data_v2.update(example_data2) endpoint(self, 'add')([data_v2], conflict_update=True) actual_data = list(endpoint(self, 'get')([self.sha1_2])) expected_data_v2 = [{ 'id': self.sha1_2, **example_data2, 'tool': self.tools[tool_name], }] # data did change as the v2 was used to overwrite v1 self.assertEqual(actual_data, expected_data_v2) @rename def add__update_in_place_deadlock(self): # given tool_id = self.tools[tool_name]['id'] hashes = [ hash_to_bytes( '34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}'.format(i)) for i in range(1000)] data_v1 = [ { 'id': hash_, **example_data1, 'indexer_configuration_id': tool_id, } for hash_ in hashes ] data_v2 = [ { 'id': hash_, **example_data2, 'indexer_configuration_id': tool_id, } for hash_ in hashes ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. 
data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given endpoint(self, 'add')(data_v1) # when actual_data = list(endpoint(self, 'get')(hashes)) expected_data_v1 = [ { 'id': hash_, **example_data1, 'tool': self.tools[tool_name], } for hash_ in hashes ] # then self.assertEqual(actual_data, expected_data_v1) # given def f1(): endpoint(self, 'add')(data_v2a, conflict_update=True) def f2(): endpoint(self, 'add')(data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = list(endpoint(self, 'get')(hashes)) expected_data_v2 = [ { 'id': hash_, **example_data2, 'tool': self.tools[tool_name], } for hash_ in hashes ] self.assertCountEqual(actual_data, expected_data_v2) @rename def add__duplicate_twice(self): # given tool_id = self.tools[tool_name]['id'] data_rev1 = { 'id': self.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id } data_rev2 = { 'id': self.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id } # when endpoint(self, 'add')([data_rev1]) with self.assertRaises(ValueError): endpoint(self, 'add')( [data_rev2, data_rev2], conflict_update=True) # then actual_data = list(endpoint(self, 'get')( [self.revision_id_2, self.revision_id_1])) expected_data = [{ 'id': self.revision_id_2, **example_data1, 'tool': self.tools[tool_name] }] self.assertEqual(actual_data, expected_data) @rename def get(self): # given tool_id = self.tools[tool_name]['id'] query = [self.sha1_2, self.sha1_1] data1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # when endpoint(self, 'add')([data1]) # then actual_data = list(endpoint(self, 'get')(query)) # then expected_data = [{ 'id': self.sha1_2, **example_data1, 'tool': self.tools[tool_name] }] self.assertEqual(actual_data, expected_data) @rename def delete(self): # given tool_id = self.tools[tool_name]['id'] query = [self.sha1_2, self.sha1_1] data1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # when endpoint(self, 'add')([data1]) endpoint(self, 'delete')([ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, } ]) # then actual_data = list(endpoint(self, 'get')(query)) # then self.assertEqual(actual_data, []) @rename def delete_nonexisting(self): tool_id = self.tools[tool_name]['id'] endpoint(self, 'delete')([ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, } ]) return ( missing, add__drop_duplicate, add__update_in_place_duplicate, add__update_in_place_deadlock, add__duplicate_twice, get, delete, delete_nonexisting, ) class CommonTestStorage: """Base class for Indexer Storage testing.
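The per-endpoint tests bound below are generated by gen_generic_endpoint_tests above: the generator returns plain closures, renamed through its @rename helper so that test reports carry the endpoint name, and unpacking the returned tuple into class attributes is what lets unittest collect them. Reduced to a sketch (illustrative names, not the real generator):

    def gen_tests(endpoint_type):
        def missing(self):
            ...  # would exercise the '<endpoint_type>_missing' endpoint
        missing.__name__ = 'test_%s_missing' % endpoint_type
        return (missing,)

    class SomeEndpointTests:
        (test_foo_missing,) = gen_tests('foo')

This class is deliberately not a TestCase itself; a concrete runner mixes it with a database fixture, as IndexerTestStorage(CommonTestStorage, BasePgTestStorage, unittest.TestCase) does at the end of this module.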
""" def setUp(self): super().setUp() self.storage = get_indexer_storage(**self.storage_config) tools = self.storage.indexer_configuration_add(TOOLS) self.tools = {} for tool in tools: tool_name = tool['tool_name'] while tool_name in self.tools: tool_name += '_' self.tools[tool_name] = { 'id': tool['id'], 'name': tool['tool_name'], 'version': tool['tool_version'], 'configuration': tool['tool_configuration'], } self.sha1_1 = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689') self.sha1_2 = hash_to_bytes('61c2b3a30496d329e21af70dd2d7e097046d07b7') self.revision_id_1 = hash_to_bytes( '7026b7c1a2af56521e951c01ed20f255fa054238') self.revision_id_2 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904321') self.revision_id_3 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904320') self.origin_id_1 = 44434341 self.origin_id_2 = 44434342 self.origin_id_3 = 54974445 def test_check_config(self): self.assertTrue(self.storage.check_config(check_write=True)) self.assertTrue(self.storage.check_config(check_write=False)) # generate content_mimetype tests ( test_content_mimetype_missing, test_content_mimetype_add__drop_duplicate, test_content_mimetype_add__update_in_place_duplicate, test_content_mimetype_add__update_in_place_deadlock, test_content_mimetype_add__duplicate_twice, test_content_mimetype_get, _, # content_mimetype_detete, _, # content_mimetype_detete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_mimetype', tool_name='file', example_data1={ 'mimetype': 'text/plain', 'encoding': 'utf-8', }, example_data2={ 'mimetype': 'text/html', 'encoding': 'us-ascii', }, ) # content_language tests ( test_content_language_missing, test_content_language_add__drop_duplicate, test_content_language_add__update_in_place_duplicate, test_content_language_add__update_in_place_deadlock, test_content_language_add__duplicate_twice, test_content_language_get, _, # test_content_language_delete, _, # test_content_language_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_language', tool_name='pygments', example_data1={ 'lang': 'haskell', }, example_data2={ 'lang': 'common-lisp', }, ) # content_ctags tests ( test_content_ctags_missing, # the following tests are disabled because CTAGS behave differently _, # test_content_ctags_add__drop_duplicate, _, # test_content_ctags_add__update_in_place_duplicate, _, # test_content_ctags_add__update_in_place_deadlock, _, # test_content_ctags_add__duplicate_twice, _, # test_content_ctags_get, _, # test_content_ctags_delete, _, # test_content_ctags_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_ctags', tool_name='universal-ctags', example_data1={ 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 119, 'lang': 'OCaml', }] }, example_data2={ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Python', }, { 'name': 'main', 'kind': 'function', 'line': 119, 'lang': 'Python', }] }, ) def test_content_ctags_search(self): # 1. 
given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag1 = { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }, { 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, ] } ctag2 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, { 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }, ] } self.storage.content_ctags_add([ctag1, ctag2]) # 1. when actual_ctags = list(self.storage.content_ctags_search('hello', limit=1)) # 1. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', } ]) # 2. when actual_ctags = list(self.storage.content_ctags_search( 'hello', limit=1, last_sha1=ctag1['id'])) # 2. then self.assertEqual(actual_ctags, [ { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', } ]) # 3. when actual_ctags = list(self.storage.content_ctags_search('hello')) # 3. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, ]) # 4. when actual_ctags = list(self.storage.content_ctags_search('counter')) # then self.assertEqual(actual_ctags, [{ 'id': ctag1['id'], 'tool': tool, 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }]) # 5. 
when actual_ctags = list(self.storage.content_ctags_search('result', limit=1)) # then self.assertEqual(actual_ctags, [{ 'id': ctag2['id'], 'tool': tool, 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }]) def test_content_ctags_search_no_result(self): actual_ctags = list(self.storage.content_ctags_search('counter')) self.assertEqual(actual_ctags, []) def test_content_ctags_add__add_new_ctags_added(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) self.storage.content_ctags_add([ctag_v1]) # conflict does nothing # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [{ 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2]) expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) self.assertEqual(actual_ctags, expected_ctags) def test_content_ctags_add__update_in_place(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool } ] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }, { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2], conflict_update=True) actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # ctag did change as the v2 was used to overwrite v1 expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] self.assertEqual(actual_ctags, expected_ctags) # content_fossology_license tests ( _, # The endpoint content_fossology_license_missing does not exist # the following tests are disabled because fossology_license tests # behave differently _, # test_content_fossology_license_add__drop_duplicate, _, # test_content_fossology_license_add__update_in_place_duplicate, _, # test_content_fossology_license_add__update_in_place_deadlock, _, # test_content_fossology_license_add__duplicate_twice, _, # test_content_fossology_license_get, _, # test_content_fossology_license_delete, _, # test_content_fossology_license_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_fossology_license', tool_name='nomos', example_data1={ 'licenses': ['Apache-2.0'], }, example_data2={ 'licenses': ['BSD-2-Clause'], }, ) def
test_content_fossology_license_add__new_license_added(self): # given tool = self.tools['nomos'] tool_id = tool['id'] license_v1 = { 'id': self.sha1_1, 'licenses': ['Apache-2.0'], 'indexer_configuration_id': tool_id, } # given self.storage.content_fossology_license_add([license_v1]) # conflict does nothing self.storage.content_fossology_license_add([license_v1]) # when actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) # then expected_license = { self.sha1_1: [{ 'licenses': ['Apache-2.0'], 'tool': tool, }] } self.assertEqual(actual_licenses, [expected_license]) # given license_v2 = license_v1.copy() license_v2.update({ 'licenses': ['BSD-2-Clause'], }) self.storage.content_fossology_license_add([license_v2]) actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) expected_license = { self.sha1_1: [{ 'licenses': ['Apache-2.0', 'BSD-2-Clause'], 'tool': tool }] } # the licenses were merged: the v2 licenses were added to the v1 ones instead of replacing them. self.assertEqual(actual_licenses, [expected_license]) # content_metadata tests ( test_content_metadata_missing, test_content_metadata_add__drop_duplicate, test_content_metadata_add__update_in_place_duplicate, test_content_metadata_add__update_in_place_deadlock, test_content_metadata_add__duplicate_twice, test_content_metadata_get, _, # test_content_metadata_delete, _, # test_content_metadata_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_metadata', tool_name='swh-metadata-detector', example_data1={ 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, }, example_data2={ 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, }, ) # revision_intrinsic_metadata tests ( test_revision_intrinsic_metadata_missing, test_revision_intrinsic_metadata_add__drop_duplicate, test_revision_intrinsic_metadata_add__update_in_place_duplicate, test_revision_intrinsic_metadata_add__update_in_place_deadlock, test_revision_intrinsic_metadata_add__duplicate_twice, test_revision_intrinsic_metadata_get, test_revision_intrinsic_metadata_delete, test_revision_intrinsic_metadata_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='revision_intrinsic_metadata', tool_name='swh-metadata-detector', example_data1={ 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping1'], }, example_data2={ 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping2'], }, ) def test_origin_intrinsic_metadata_get(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': self.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([metadata_origin]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, 42])) expected_metadata = [{ 'id': self.origin_id_1, 'metadata': metadata, 'tool':
self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, 'mappings': ['mapping1'], }] self.assertEqual(actual_metadata, expected_metadata) def test_origin_intrinsic_metadata_delete(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': self.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } metadata_origin2 = metadata_origin.copy() metadata_origin2['id'] = self.origin_id_2 # when self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin2]) self.storage.origin_intrinsic_metadata_delete([ { 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, self.origin_id_2, 42])) for item in actual_metadata: item['indexer_configuration_id'] = item.pop('tool')['id'] self.assertEqual(actual_metadata, [metadata_origin2]) def test_origin_intrinsic_metadata_delete_nonexisting(self): tool_id = self.tools['swh-metadata-detector']['id'] self.storage.origin_intrinsic_metadata_delete([ { 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) def test_origin_intrinsic_metadata_add_drop_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': self.revision_id_1, 'metadata': metadata_v1.copy(), 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': self.revision_id_1, } # given self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, 42])) expected_metadata_v1 = [{ 'id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_1, 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 self.storage.revision_intrinsic_metadata_add([metadata_rev_v2]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v2]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) # metadata did not change as the v2 was dropped. 
self.assertEqual(actual_metadata, expected_metadata_v1) def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': self.revision_id_2, 'metadata': metadata_v1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': self.revision_id_2, } # given self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) # then expected_metadata_v1 = [{ 'id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_update_duplicated_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 self.storage.revision_intrinsic_metadata_add( [metadata_rev_v2], conflict_update=True) self.storage.origin_intrinsic_metadata_add( [metadata_origin_v2], conflict_update=True) actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) expected_metadata_v2 = [{ 'id': self.origin_id_1, 'metadata': metadata_v2, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, 'mappings': [], }] # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) def test_origin_intrinsic_metadata_add__update_in_place_deadlock(self): # given tool_id = self.tools['swh-metadata-detector']['id'] ids = list(range(1000)) example_data1 = { 'metadata': { 'version': None, 'name': None, }, 'mappings': [], } example_data2 = { 'metadata': { 'version': 'v1.1.1', 'name': 'foo', }, 'mappings': [], } metadata_rev_v1 = { 'id': self.revision_id_2, 'metadata': { 'version': None, 'name': None, }, 'mappings': [], 'indexer_configuration_id': tool_id, } data_v1 = [ { 'id': id_, 'from_revision': self.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id, } for id_ in ids ] data_v2 = [ { 'id': id_, 'from_revision': self.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id, } for id_ in ids ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. 
data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add(data_v1) # when actual_data = list(self.storage.origin_intrinsic_metadata_get(ids)) expected_data_v1 = [ { 'id': id_, 'from_revision': self.revision_id_2, **example_data1, 'tool': self.tools['swh-metadata-detector'], } for id_ in ids ] # then self.assertEqual(actual_data, expected_data_v1) # given def f1(): self.storage.origin_intrinsic_metadata_add( data_v2a, conflict_update=True) def f2(): self.storage.origin_intrinsic_metadata_add( data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = list(self.storage.origin_intrinsic_metadata_get(ids)) expected_data_v2 = [ { 'id': id_, 'from_revision': self.revision_id_2, **example_data2, 'tool': self.tools['swh-metadata-detector'], } for id_ in ids ] self.maxDiff = None self.assertCountEqual(actual_data, expected_data_v2) def test_origin_intrinsic_metadata_add__duplicate_twice(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'developmentStatus': None, 'name': None, } metadata_rev = { 'id': self.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata_rev]) with self.assertRaises(ValueError): self.storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin]) def test_origin_intrinsic_metadata_search_fulltext(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata1 = { 'author': 'John Doe', } metadata1_rev = { 'id': self.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } metadata2 = { 'author': 'Jane Doe', } metadata2_rev = { 'id': self.revision_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertCountEqual( [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( [res['id'] for res in search(['John', 'Doe'])], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John', 'Jane'])], []) def test_origin_intrinsic_metadata_search_fulltext_rank(self): # given tool_id = self.tools['swh-metadata-detector']['id'] # The following authors have "Random Person" to add some more content # to the JSON data, to work around normalization quirks when there # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words # for small values of nb_words). 
metadata1 = { 'author': [ 'Random Person', 'John Doe', 'Jane Doe', ] } metadata1_rev = { 'id': self.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } metadata2 = { 'author': [ 'Random Person', 'Jane Doe', ] } metadata2_rev = { 'id': self.revision_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertEqual( [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( [res['id'] for res in search(['Doe'], limit=1)], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['Jane'])], [self.origin_id_2, self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John', 'Jane'])], [self.origin_id_1]) def _fill_origin_intrinsic_metadata(self): tool1_id = self.tools['swh-metadata-detector']['id'] tool2_id = self.tools['swh-metadata-detector2']['id'] metadata1 = { '@context': 'foo', 'author': 'John Doe', } metadata1_rev = { 'id': self.revision_id_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, } metadata1_origin = { 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, 'from_revision': self.revision_id_1, } metadata2 = { '@context': 'foo', 'author': 'Jane Doe', } metadata2_rev = { 'id': self.revision_id_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata2_origin = { 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, 'from_revision': self.revision_id_2, } metadata3 = { '@context': 'foo', } metadata3_rev = { 'id': self.revision_id_3, 'metadata': metadata3, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata3_origin = { 'id': self.origin_id_3, 'metadata': metadata3, 'mappings': ['pkg-info'], 'indexer_configuration_id': tool2_id, 'from_revision': self.revision_id_3, } self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) self.storage.revision_intrinsic_metadata_add([metadata3_rev]) self.storage.origin_intrinsic_metadata_add([metadata3_origin]) def test_origin_intrinsic_metadata_search_by_producer(self): self._fill_origin_intrinsic_metadata() tool1 = self.tools['swh-metadata-detector'] tool2 = self.tools['swh-metadata-detector2'] endpoint = self.storage.origin_intrinsic_metadata_search_by_producer # test pagination self.assertCountEqual( endpoint(ids_only=True), [self.origin_id_1, self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(start=0, ids_only=True), [self.origin_id_1, self.origin_id_2, self.origin_id_3]) 
self.assertCountEqual( endpoint(start=0, limit=2, ids_only=True), [self.origin_id_1, self.origin_id_2]) self.assertCountEqual( endpoint(start=self.origin_id_1+1, ids_only=True), [self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(start=self.origin_id_1+1, end=self.origin_id_3-1, ids_only=True), [self.origin_id_2]) # test mappings filtering self.assertCountEqual( endpoint(mappings=['npm'], ids_only=True), [self.origin_id_1, self.origin_id_2]) self.assertCountEqual( endpoint(mappings=['npm', 'gemspec'], ids_only=True), [self.origin_id_1, self.origin_id_2]) self.assertCountEqual( endpoint(mappings=['gemspec'], ids_only=True), [self.origin_id_2]) self.assertCountEqual( endpoint(mappings=['pkg-info'], ids_only=True), [self.origin_id_3]) self.assertCountEqual( endpoint(mappings=['foobar'], ids_only=True), []) # test pagination + mappings self.assertCountEqual( endpoint(mappings=['npm'], limit=1, ids_only=True), [self.origin_id_1]) # test tool filtering self.assertCountEqual( endpoint(tool_ids=[tool1['id']], ids_only=True), [self.origin_id_1]) self.assertCountEqual( endpoint(tool_ids=[tool2['id']], ids_only=True), [self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True), [self.origin_id_1, self.origin_id_2, self.origin_id_3]) # test ids_only=False self.assertEqual(list(endpoint(mappings=['gemspec'])), [{ 'id': self.origin_id_2, 'metadata': { '@context': 'foo', 'author': 'Jane Doe', }, 'mappings': ['npm', 'gemspec'], 'tool': tool2, 'from_revision': self.revision_id_2, }]) def test_origin_intrinsic_metadata_stats(self): self._fill_origin_intrinsic_metadata() result = self.storage.origin_intrinsic_metadata_stats() self.assertEqual(result, { 'per_mapping': { 'gemspec': 1, 'npm': 2, 'pkg-info': 1, 'codemeta': 0, 'maven': 0, }, 'total': 3, 'non_empty': 2, }) def test_indexer_configuration_add(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) # does not exist # add it actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEqual(len(actual_tools), 1) actual_tool = actual_tools[0] self.assertIsNotNone(actual_tool) # now it exists new_id = actual_tool.pop('id') self.assertEqual(actual_tool, tool) actual_tools2 = list(self.storage.indexer_configuration_add([tool])) actual_tool2 = actual_tools2[0] self.assertIsNotNone(actual_tool2) # now it exists new_id2 = actual_tool2.pop('id') self.assertEqual(new_id, new_id2) self.assertEqual(actual_tool, actual_tool2) def test_indexer_configuration_add_multiple(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEqual(len(actual_tools), 1) new_tools = [tool, { 'tool_name': 'yet-another-tool', 'tool_version': 'version', 'tool_configuration': {}, }] actual_tools = list(self.storage.indexer_configuration_add(new_tools)) self.assertEqual(len(actual_tools), 2) # order not guaranteed, so we iterate over results to check for tool in actual_tools: _id = tool.pop('id') self.assertIsNotNone(_id) self.assertIn(tool, new_tools) def test_indexer_configuration_get_missing(self): tool = { 'tool_name': 'unknown-tool', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } actual_tool = 
self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) def test_indexer_configuration_get(self): tool = { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } self.storage.indexer_configuration_add([tool]) actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() del actual_tool['id'] self.assertEqual(expected_tool, actual_tool) def test_indexer_configuration_metadata_get_missing_context(self): tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"context": "unknown-context"}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) def test_indexer_configuration_metadata_get(self): tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "NpmMapping"}, } self.storage.indexer_configuration_add([tool]) actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() expected_tool['id'] = actual_tool['id'] self.assertEqual(expected_tool, actual_tool) @pytest.mark.property_based def test_generate_content_mimetype_get_range_limit_none(self): """mimetype_get_range call with wrong limit input should fail""" with self.assertRaises(ValueError) as e: self.storage.content_mimetype_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) self.assertEqual(e.exception.args, ( 'Development error: limit should not be None',)) @pytest.mark.property_based @given(gen_content_mimetypes(min_size=1, max_size=4)) def test_generate_content_mimetype_get_range_no_limit(self, mimetypes): """mimetype_get_range returns mimetypes within range provided""" self.reset_storage_tables() # add mimetypes to storage self.storage.content_mimetype_add(mimetypes) # All ids from the db content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes tool_id = mimetypes[0]['indexer_configuration_id'] actual_result = self.storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(mimetypes), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_mimetypes(min_size=4, max_size=4)) def test_generate_content_mimetype_get_range_limit(self, mimetypes): """mimetype_get_range paginates results if limit exceeded""" self.reset_storage_tables() # add mimetypes to storage self.storage.content_mimetype_add(mimetypes) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes limited to 3 results limited_results = len(mimetypes) - 1 tool_id = mimetypes[0]['indexer_configuration_id'] actual_result = self.storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(limited_results, len(actual_ids)) self.assertIsNotNone(actual_next) self.assertEqual(actual_next, content_ids[-1]) expected_mimetypes = content_ids[:-1] self.assertEqual(expected_mimetypes, actual_ids) # retrieve next part actual_results2 = self.storage.content_mimetype_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] 
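# The two calls above spell out the cursor contract of the *_get_range
# endpoints: each call returns at most `limit` ids within [start, end],
# plus a `next` id to resume from (None once the range is exhausted).
# A minimal sketch of a client-side loop draining a range under that
# contract; the helper name is illustrative, not part of the storage API:
def _iter_content_mimetype_range(storage, start, end, tool_id, batch=1000):
    while start is not None:
        result = storage.content_mimetype_get_range(
            start, end, indexer_configuration_id=tool_id, limit=batch)
        yield from result['ids']
        # `next` is the first id not yet returned; resuming from it
        # (inclusively) continues the scan, and None terminates it.
        start = result['next']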
self.assertIsNone(actual_next2) expected_mimetypes2 = [content_ids[-1]] self.assertEqual(expected_mimetypes2, actual_ids2) @pytest.mark.property_based def test_generate_content_fossology_license_get_range_limit_none(self): """license_get_range call with wrong limit input should fail""" with self.assertRaises(ValueError) as e: self.storage.content_fossology_license_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) self.assertEqual(e.exception.args, ( 'Development error: limit should not be None',)) @pytest.mark.property_based def prepare_mimetypes_from(self, fossology_licenses): """Fossology license needs some consistent data in db to run. """ mimetypes = [] for c in fossology_licenses: mimetypes.append({ 'id': c['id'], 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': c['indexer_configuration_id'], }) return mimetypes @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=1, max_size=4)) def test_generate_content_fossology_license_get_range_no_limit( self, fossology_licenses): """license_get_range returns licenses within range provided""" self.reset_storage_tables() # craft some consistent mimetypes mimetypes = self.prepare_mimetypes_from(fossology_licenses) self.storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage self.storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(fossology_licenses), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=1, max_size=4), gen_content_mimetypes(min_size=1, max_size=1)) def test_generate_content_fossology_license_get_range_no_limit_with_filter( self, fossology_licenses, mimetypes): """This filters non textual, then returns results within range""" self.reset_storage_tables() # craft some consistent mimetypes _mimetypes = self.prepare_mimetypes_from(fossology_licenses) # add binary mimetypes which will get filtered out in results for m in mimetypes: _mimetypes.append({ 'mimetype': 'binary', **m, }) self.storage.content_mimetype_add(_mimetypes) # add fossology_licenses to storage self.storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(fossology_licenses), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=4, max_size=4)) def test_generate_fossology_license_get_range_limit( self, fossology_licenses): """fossology_license_get_range paginates results if limit exceeded""" self.reset_storage_tables() # craft some consistent mimetypes mimetypes = self.prepare_mimetypes_from(fossology_licenses) # add 
fossology_licenses to storage self.storage.content_mimetype_add(mimetypes) self.storage.content_fossology_license_add(fossology_licenses) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses limited to 3 results limited_results = len(fossology_licenses) - 1 tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(limited_results, len(actual_ids)) self.assertIsNotNone(actual_next) self.assertEqual(actual_next, content_ids[-1]) expected_fossology_licenses = content_ids[:-1] self.assertEqual(expected_fossology_licenses, actual_ids) # retrieve next part actual_results2 = self.storage.content_fossology_license_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] self.assertIsNone(actual_next2) expected_fossology_licenses2 = [content_ids[-1]] self.assertEqual(expected_fossology_licenses2, actual_ids2) @pytest.mark.db class IndexerTestStorage(CommonTestStorage, BasePgTestStorage, unittest.TestCase): """Running the tests locally. For the client api tests (remote storage), see the :class:`swh.indexer.storage.test_api_client.TestRemoteStorage` class. """ pass def test_mapping_names(): assert set(MAPPING_NAMES) == {m.name for m in MAPPINGS.values()} diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index d6aaf02..347104c 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -1,315 +1,322 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from functools import reduce import re import tempfile from unittest.mock import patch from click.testing import CliRunner from swh.model.hashutil import hash_to_bytes from swh.indexer.cli import cli CLI_CONFIG = ''' scheduler: cls: foo args: {} storage: cls: memory args: {} indexer_storage: cls: memory args: {} ''' def fill_idx_storage(idx_storage, nb_rows): tools = [ { 'tool_name': 'tool %d' % i, 'tool_version': '0.0.1', 'tool_configuration': {}, } for i in range(2) ] tools = idx_storage.indexer_configuration_add(tools) origin_metadata = [ { 'id': origin_id, 'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] revision_metadata = [ { 'id': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] idx_storage.revision_intrinsic_metadata_add(revision_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool['id'] for tool in tools] def _origins_in_task_args(tasks): """Returns the set of origins contained in the arguments of the - provided tasks (assumed to be of type indexer_origin_metadata).""" + provided tasks (assumed to be of type index-origin-metadata).""" return reduce(
(set(task['arguments']['args'][0]) for task in tasks), set() ) def _assert_tasks_for_origins(tasks, origins): expected_kwargs = {"policy_update": "update-dups", "parse_ids": False} - assert {task['type'] for task in tasks} == {'indexer_origin_metadata'} + assert {task['type'] for task in tasks} == {'index-origin-metadata'} assert all(len(task['arguments']['args']) == 1 for task in tasks) assert all(task['arguments']['kwargs'] == expected_kwargs for task in tasks) assert _origins_in_task_args(tasks) == set(origins) def invoke(scheduler, catch_exceptions, args): runner = CliRunner() with patch('swh.indexer.cli.get_scheduler') as get_scheduler_mock, \ tempfile.NamedTemporaryFile('a', suffix='.yml') as config_fd: config_fd.write(CLI_CONFIG) config_fd.seek(0) get_scheduler_mock.return_value = scheduler result = runner.invoke(cli, ['-C' + config_fd.name] + args) if not catch_exceptions and result.exception: print(result.output) raise result.exception return result def test_mapping_list(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list', ]) expected_output = '\n'.join([ 'codemeta', 'gemspec', 'maven', 'npm', 'pkg-info', '', ]) assert result.exit_code == 0, result.output assert result.output == expected_output def test_mapping_list_terms(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert re.search( r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) def test_mapping_list_terms_exclude(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', '--exclude-mapping', 'codemeta' ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert not re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert not re.search( r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_empty_db( indexer_scheduler, idx_storage, storage): result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) expected_output = ( 'Nothing to do (no origin metadata matched the criteria).\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_divisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 9 _assert_tasks_for_origins(tasks, range(90)) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_dry_run( 
indexer_scheduler, idx_storage, storage): """Tests the re-indexing in dry-run mode: the expected tasks are reported, but none is actually scheduled.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', '--dry-run', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_nondivisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when neither origin_batch_size nor task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 70) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--batch-size', '20', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (60 origins).\n' 'Scheduled 4 tasks (70 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 4 _assert_tasks_for_origins(tasks, range(70)) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_mapping( indexer_scheduler, idx_storage, storage): """Tests the re-indexing filtered on a single mapping.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', ]) # Check the output expected_output = ( 'Scheduled 2 tasks (11 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 2 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101]) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_two_mappings( indexer_scheduler, idx_storage, storage): """Tests the re-indexing filtered on two mappings.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', '--mapping', 'mapping2', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (22 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 3 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102]) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_tool( indexer_scheduler, idx_storage, storage): """Tests the re-indexing filtered on a single tool.""" tool_ids = fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--tool-id', str(tool_ids[0]), ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n'
'Scheduled 6 tasks (55 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 6 _assert_tasks_for_origins( tasks, [x*2 for x in range(55)]) diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py deleted file mode 100644 index dc4e0c0..0000000 --- a/swh/indexer/tests/test_language.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (C) 2017-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest -import pytest - -from swh.indexer import language -from swh.indexer.language import LanguageIndexer -from swh.indexer.tests.utils import ( - CommonContentIndexerTest, - BASE_TEST_CONFIG, fill_storage, fill_obj_storage, filter_dict, -) - - -CONFIG = { - **BASE_TEST_CONFIG, - 'tools': { - 'name': 'pygments', - 'version': '2.0.1+dfsg-1.1+deb8u1', - 'configuration': { - 'type': 'library', - 'debian-package': 'python3-pygments', - 'max_content_size': 10240, - }, - } -} - - -class Language(unittest.TestCase): - """Tests pygments tool for language detection - - """ - def test_compute_language_none(self): - # given - self.content = "" - self.declared_language = { - 'lang': None - } - # when - result = language.compute_language(self.content) - # then - self.assertEqual(self.declared_language, result) - - -class TestLanguageIndexer(CommonContentIndexerTest, unittest.TestCase): - """Language indexer test scenarios: - - - Known sha1s in the input list have their data indexed - - Unknown sha1 in the input list are not indexed - - """ - - legacy_get_format = True - - def get_indexer_results(self, ids): - yield from self.indexer.idx_storage.content_language_get(ids) - - def setUp(self): - self.indexer = LanguageIndexer(config=CONFIG) - self.indexer.catch_exceptions = False - fill_storage(self.indexer.storage) - fill_obj_storage(self.indexer.objstorage) - - self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069' - self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6' - self.id2 = 'd4c647f0fc257591cc9ba1722484229780d1c607' - - tool = {k.replace('tool_', ''): v - for (k, v) in self.indexer.tool.items()} - - self.expected_results = { - self.id0: { - 'id': self.id0, - 'tool': tool, - 'lang': 'python', - }, - self.id1: { - 'id': self.id1, - 'tool': tool, - 'lang': 'c' - }, - self.id2: { - 'id': self.id2, - 'tool': tool, - 'lang': 'text-only' - } - } - - -def test_language_w_no_tool(): - with pytest.raises(ValueError): - LanguageIndexer(config=filter_dict(CONFIG, 'tools')) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index a5be367..321cc1f 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,217 +1,263 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch from swh.model.hashutil import hash_to_bytes from swh.indexer.metadata import OriginMetadataIndexer from .utils import YARN_PARSER_METADATA from .test_metadata import REVISION_METADATA_CONFIG def test_origin_metadata_indexer( idx_storage, storage, obj_storage): indexer = 
OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'id': origin['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) for result in results: del result['tool'] assert results == [origin_metadata] def test_origin_metadata_indexer_duplicate_origin( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["git+https://github.com/librariesio/yarn-parser"]) indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert len(results) == 1 def test_origin_metadata_indexer_missing_head( idx_storage, storage, obj_storage): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://example.com"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( idx_storage, storage, obj_storage): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://example.com", "git+https://github.com/librariesio/yarn-parser"]) origin1 = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) origin2 = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'id': origin2['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin1['id'], origin2['id']])) for result in results: del result['tool'] assert results == [origin_metadata] def test_origin_metadata_indexer_duplicate_revision( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["git+https://github.com/librariesio/yarn-parser", "git+https://github.com/librariesio/yarn-parser.git"]) origin1 = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) origin2 = storage.origin_get({ 'type': 'git', 
'url': 'https://github.com/librariesio/yarn-parser.git'}) assert origin1['id'] != origin2['id'] rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin1['id'], origin2['id']])) assert len(results) == 2 -def test_origin_metadata_indexer_no_metadata( +def test_origin_metadata_indexer_no_metadata_file( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == [] +def test_origin_metadata_indexer_no_metadata( + idx_storage, storage, obj_storage): + + indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + with patch('swh.indexer.metadata.RevisionMetadataIndexer' + '.translate_revision_intrinsic_metadata', + return_value=(['npm'], {'@context': 'foo'})): + indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + + origin = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') + + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) + assert results == [] + + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin['id']])) + assert results == [] + + +def test_origin_metadata_indexer_error( + idx_storage, storage, obj_storage): + + indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + with patch('swh.indexer.metadata.RevisionMetadataIndexer' + '.translate_revision_intrinsic_metadata', + return_value=None): + indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + + origin = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') + + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) + assert results == [] + + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin['id']])) + assert results == [] + + def test_origin_metadata_indexer_delete_metadata( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results != [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results != [] with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): indexer.run(["git+https://github.com/librariesio/yarn-parser"]) results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == 
[] diff --git a/version.txt b/version.txt index 94717a4..f4e725e 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.146-0-g669998e \ No newline at end of file +v0.0.147-0-gde4744f \ No newline at end of file
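The add__update_in_place_deadlock tests above (both the generic endpoint variants and the origin_intrinsic_metadata one) rely on a single technique: two threads upsert overlapping batches in opposite orders with conflict_update=True, which deadlocks in Postgres unless the storage orders rows consistently before updating. A standalone sketch of that harness, assuming only some add(batch, conflict_update=True) callable with upsert semantics:

    import threading

    def check_concurrent_upserts(add, rows):
        # Drop one distinct row from each batch, so that both writers must
        # succeed for every row to end up in the database, and reverse one
        # batch so the two lock-acquisition orders collide.
        batch_a = rows[1:]
        batch_b = list(reversed(rows[:-1]))
        threads = [
            threading.Thread(target=add, args=(batch,),
                             kwargs={'conflict_update': True})
            for batch in (batch_a, batch_b)
        ]
        for t in threads:
            t.start()
        for t in threads:
            # join() does not re-raise; a deadlock surfaces as an exception
            # printed from the worker thread, or as a hang/timeout.
            t.join()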