diff --git a/PKG-INFO b/PKG-INFO index e279eb8..68bb5a5 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.145 +Version: 0.0.146 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive a batch of ids - retrieve the associated data depending on the object type - compute an index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute ctags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate the file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata, and either retrieves their translated_metadata from the content_metadata table in storage or runs the content indexer to translate the files. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements.txt b/requirements.txt index 3a7428c..a578b91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ vcversioner pygments click chardet -file_magic +file-magic pyld xmltodict diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index e279eb8..68bb5a5 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.145 +Version: 0.0.146 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive a batch of ids - retrieve the associated data depending on the object type - compute an index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute ctags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate the file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata, and either retrieves their translated_metadata from the content_metadata table in storage or runs the content indexer to translate the files. 
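The four-step indexation procedure described above amounts to a small driver loop. A minimal sketch, assuming hypothetical `retrieve`, `compute` and `store` callables (illustrative names only, not part of the actual swh.indexer API):

```python
def index_batch(ids, retrieve, compute, store):
    """Receive a batch of ids, fetch each object, index it, store the results."""
    results = []
    for id_ in ids:
        data = retrieve(id_)            # look the object up (content/revision/origin)
        results.append(compute(data))   # compute the index (mimetype, license, ...)
    store(results)                      # persist into the indexer storage
```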
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 2bc85f8..80cb6e8 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,85 +1,91 @@ MANIFEST.in Makefile README.md requirements-swh.txt requirements.txt setup.py version.txt sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py swh/indexer/language.py swh/indexer/metadata.py swh/indexer/metadata_detector.py -swh/indexer/metadata_dictionary.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv +swh/indexer/metadata_dictionary/__init__.py +swh/indexer/metadata_dictionary/base.py +swh/indexer/metadata_dictionary/codemeta.py +swh/indexer/metadata_dictionary/maven.py +swh/indexer/metadata_dictionary/npm.py +swh/indexer/metadata_dictionary/python.py +swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-swh-init.sql swh/indexer/sql/20-swh-enums.sql swh/indexer/sql/30-swh-schema.sql swh/indexer/sql/40-swh-func.sql swh/indexer/sql/50-swh-data.sql swh/indexer/sql/60-swh-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/in_memory.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py swh/indexer/storage/api/wsgi.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py swh/indexer/tests/test_cli.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_language.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index 
9d81572..cc485e1 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,18 +1,18 @@ vcversioner pygments click chardet -file_magic +file-magic pyld xmltodict swh.core>=0.0.53 swh.model>=0.0.15 swh.objstorage>=0.0.28 swh.scheduler>=0.0.47 swh.storage>=0.0.123 swh.journal>=0.0.6 [testing] pytest<4 pytest-postgresql hypothesis>=3.11.0 diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index 56c7f88..c5244be 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -1,208 +1,177 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click from swh.core import config from swh.scheduler import get_scheduler -from swh.scheduler.utils import create_task_dict +from swh.scheduler.cli_utils import schedule_origin_batches from swh.storage import get_storage from swh.indexer import metadata_dictionary from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.api.server import load_and_check_config, app CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) -TASK_BATCH_SIZE = 1000 # Number of tasks per query to the scheduler - @click.group(context_settings=CONTEXT_SETTINGS) @click.option('--config-file', '-C', default=None, type=click.Path(exists=True, dir_okay=False,), help="Configuration file.") @click.pass_context def cli(ctx, config_file): """Software Heritage Indexer CLI interface """ ctx.ensure_object(dict) conf = config.read(config_file) ctx.obj['config'] = conf def _get_api(getter, config, config_key, url): if url: config[config_key] = { 'cls': 'remote', 'args': {'url': url} } elif config_key not in config: raise click.ClickException( 'Missing configuration for {}'.format(config_key)) return getter(**config[config_key]) @cli.group('mapping') def mapping(): pass @mapping.command('list') def mapping_list(): """Prints the list of known mappings.""" mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) @mapping.command('list-terms') @click.option('--exclude-mapping', multiple=True, help='Exclude the given mapping from the output') @click.option('--concise', is_flag=True, default=False, help='Don\'t print the list of mappings supporting each term.') def mapping_list_terms(concise, exclude_mapping): """Prints the list of known CodeMeta terms, and which mappings support them.""" properties = metadata_dictionary.list_terms() for (property_name, supported_mappings) in sorted(properties.items()): supported_mappings = {m.name for m in supported_mappings} supported_mappings -= set(exclude_mapping) if supported_mappings: if concise: click.echo(property_name) else: click.echo('{}:'.format(property_name)) click.echo('\t' + ', '.join(sorted(supported_mappings))) @cli.group('schedule') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") @click.option('--indexer-storage-url', '-i', default=None, help="URL of the indexer storage API") @click.option('--storage-url', '-g', default=None, help="URL of the (graph) storage API") @click.option('--dry-run/--no-dry-run', is_flag=True, default=False, - help='Default to list only what would be scheduled.') + help='List only what would be scheduled.') @click.pass_context def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run): """Manipulate indexer 
tasks via SWH Scheduler's API.""" ctx.obj['indexer_storage'] = _get_api( get_indexer_storage, ctx.obj['config'], 'indexer_storage', indexer_storage_url ) ctx.obj['storage'] = _get_api( get_storage, ctx.obj['config'], 'storage', storage_url ) ctx.obj['scheduler'] = _get_api( get_scheduler, ctx.obj['config'], 'scheduler', scheduler_url ) if dry_run: ctx.obj['scheduler'] = None def list_origins_by_producer(idx_storage, mappings, tool_ids): start = 0 limit = 10000 while True: origins = list( idx_storage.origin_intrinsic_metadata_search_by_producer( start=start, limit=limit, ids_only=True, mappings=mappings or None, tool_ids=tool_ids or None)) if not origins: break start = origins[-1]+1 yield from origins @schedule.command('reindex_origin_metadata') @click.option('--batch-size', '-b', 'origin_batch_size', default=10, show_default=True, type=int, help="Number of origins per task") @click.option('--tool-id', '-t', 'tool_ids', type=int, multiple=True, help="Restrict search of old metadata to this/these tool ids.") @click.option('--mapping', '-m', 'mappings', multiple=True, help="Mapping(s) that should be re-scheduled (eg. 'npm', " "'gemspec', 'maven')") @click.option('--task-type', default='indexer_origin_metadata', show_default=True, help="Name of the task type to schedule.") @click.pass_context def schedule_origin_metadata_reindex( - ctx, origin_batch_size, mappings, tool_ids, task_type): + ctx, origin_batch_size, tool_ids, mappings, task_type): """Schedules indexing tasks for origins that were already indexed.""" idx_storage = ctx.obj['indexer_storage'] scheduler = ctx.obj['scheduler'] origins = list_origins_by_producer(idx_storage, mappings, tool_ids) - kwargs = {"policy_update": "update-dups", "parse_ids": False} - nb_origins = 0 - nb_tasks = 0 - while True: - task_batch = [] - for _ in range(TASK_BATCH_SIZE): - # Group origins - origin_batch = [] - for (_, origin) in zip(range(origin_batch_size), origins): - origin_batch.append(origin) - nb_origins += len(origin_batch) - if not origin_batch: - break - - # Create a task for these origins - args = [origin_batch] - task_dict = create_task_dict(task_type, 'oneshot', *args, **kwargs) - task_batch.append(task_dict) - - # Schedule a batch of tasks - if not task_batch: - break - nb_tasks += len(task_batch) - if scheduler: - scheduler.create_tasks(task_batch) - click.echo('Scheduled %d tasks (%d origins).' % (nb_tasks, nb_origins)) - - # Print final status. 
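`list_origins_by_producer` above pages through results keyset-style: each query resumes just past the last id returned, rather than tracking an offset. The same pattern in isolation, assuming a hypothetical `fetch_page(start, limit)` returning a sorted list of integer ids:

```python
def iter_all_ids(fetch_page, limit=10000):
    # Keyset pagination: restart each query after the largest id seen so far.
    start = 0
    while True:
        page = fetch_page(start=start, limit=limit)
        if not page:
            break
        start = page[-1] + 1  # next page begins after the last id returned
        yield from page
```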
- if nb_tasks: - click.echo('Done.') - else: - click.echo('Nothing to do (no origin metadata matched the criteria).') + kwargs = {"policy_update": "update-dups", "parse_ids": False} + schedule_origin_batches( + scheduler, task_type, origins, origin_batch_size, kwargs) @cli.command('api-server') @click.argument('config-path', required=1) @click.option('--host', default='0.0.0.0', help="Host to run the server") @click.option('--port', default=5007, type=click.INT, help="Binding port of the server") @click.option('--debug/--nodebug', default=True, help="Indicates if the server should run in debug mode") def api_server(config_path, host, port, debug): api_cfg = load_and_check_config(config_path, type='any') app.config.update(api_cfg) app.run(host, port=int(port), debug=bool(debug)) def main(): return cli(auto_envvar_prefix='SWH_INDEXER') if __name__ == '__main__': main() diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py deleted file mode 100644 index 02b4626..0000000 --- a/swh/indexer/metadata_dictionary.py +++ /dev/null @@ -1,733 +0,0 @@ -# Copyright (C) 2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import os -import re -import abc -import ast -import json -import logging -import itertools -import collections -import email.parser -import email.policy -import xml.parsers.expat - -import click -import xmltodict - -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI, CODEMETA_TERMS -from swh.indexer.codemeta import compact, expand - - -MAPPINGS = {} - - -def register_mapping(cls): - MAPPINGS[cls.__name__] = cls - return cls - - -def list_terms(): - """Returns a dictionary with all supported CodeMeta terms as keys, - and the mappings that support each of them as values.""" - d = collections.defaultdict(set) - for mapping in MAPPINGS.values(): - for term in mapping.supported_terms(): - d[term].add(mapping) - return d - - -def merge_values(v1, v2): - """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`, - returns `{"@list": l1 + l2}`. - Otherwise, make them lists (if they are not already) and concatenate - them. 
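The hand-rolled batching loop removed above from cli.py is now delegated to `swh.scheduler.cli_utils.schedule_origin_batches`. The chunking it replaces is ordinary iterator grouping; a rough sketch of the equivalent logic, based on the deleted code rather than on the helper's actual implementation:

```python
import itertools

def iter_chunks(iterable, size):
    """Yield successive lists of at most `size` items from an iterator."""
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk

# The deleted loop then created one oneshot task per chunk, roughly:
# for origin_batch in iter_chunks(origins, origin_batch_size):
#     scheduler.create_tasks(
#         [create_task_dict(task_type, 'oneshot', origin_batch, **kwargs)])
```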
- - >>> merge_values('a', 'b') - ['a', 'b'] - >>> merge_values(['a', 'b'], 'c') - ['a', 'b', 'c'] - >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']}) - {'@list': ['a', 'b', 'c']} - """ - if v1 is None: - return v2 - elif v2 is None: - return v1 - elif isinstance(v1, dict) and set(v1) == {'@list'}: - assert isinstance(v1['@list'], list) - if isinstance(v2, dict) and set(v2) == {'@list'}: - assert isinstance(v2['@list'], list) - return {'@list': v1['@list'] + v2['@list']} - else: - raise ValueError('Cannot merge %r and %r' % (v1, v2)) - else: - if isinstance(v2, dict) and '@list' in v2: - raise ValueError('Cannot merge %r and %r' % (v1, v2)) - if not isinstance(v1, list): - v1 = [v1] - if not isinstance(v2, list): - v2 = [v2] - return v1 + v2 - - -class BaseMapping(metaclass=abc.ABCMeta): - """Base class for mappings to inherit from - - To implement a new mapping: - - - inherit this class - - override translate function - """ - def __init__(self, log_suffix=''): - self.log_suffix = log_suffix - self.log = logging.getLogger('%s.%s' % ( - self.__class__.__module__, - self.__class__.__name__)) - - @property - @abc.abstractmethod - def name(self): - """A name of this mapping, used as an identifier in the - indexer storage.""" - pass - - @classmethod - @abc.abstractmethod - def detect_metadata_files(cls, files): - """ - Detects files potentially containing metadata - - Args: - file_entries (list): list of files - - Returns: - list: list of sha1 (possibly empty) - """ - pass - - @abc.abstractmethod - def translate(self, file_content): - pass - - def normalize_translation(self, metadata): - return compact(metadata) - - -class SingleFileMapping(BaseMapping): - """Base class for all mappings that use a single file as input.""" - - @property - @abc.abstractmethod - def filename(self): - """The .json file to extract metadata from.""" - pass - - @classmethod - def detect_metadata_files(cls, file_entries): - for entry in file_entries: - if entry['name'] == cls.filename: - return [entry['sha1']] - return [] - - -class DictMapping(BaseMapping): - """Base class for mappings that take as input a file that is mostly - a key-value store (eg. 
a shallow JSON dict).""" - - string_fields = [] - '''List of fields that are simple strings, and don't need any - normalization.''' - - @property - @abc.abstractmethod - def mapping(self): - """A translation dict to map dict keys into a canonical name.""" - pass - - @staticmethod - def _normalize_method_name(name): - return name.replace('-', '_') - - @classmethod - def supported_terms(cls): - return { - term for (key, term) in cls.mapping.items() - if key in cls.string_fields - or hasattr(cls, 'translate_' + cls._normalize_method_name(key)) - or hasattr(cls, 'normalize_' + cls._normalize_method_name(key))} - - def _translate_dict(self, content_dict, *, normalize=True): - """ - Translates content by parsing content from a dict object - and translating with the appropriate mapping - - Args: - content_dict (dict): content dict to translate - - Returns: - dict: translated metadata in json-friendly form needed for - the indexer - - """ - translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} - for k, v in content_dict.items(): - # First, check if there is a specific translation - # method for this key - translation_method = getattr( - self, 'translate_' + self._normalize_method_name(k), None) - if translation_method: - translation_method(translated_metadata, v) - elif k in self.mapping: - # if there is no method, but the key is known from the - # crosswalk table - codemeta_key = self.mapping[k] - - # if there is a normalization method, use it on the value - normalization_method = getattr( - self, 'normalize_' + self._normalize_method_name(k), None) - if normalization_method: - v = normalization_method(v) - elif k in self.string_fields and isinstance(v, str): - pass - elif k in self.string_fields and isinstance(v, list): - v = [x for x in v if isinstance(x, str)] - else: - continue - - # set the translation metadata with the normalized value - if codemeta_key in translated_metadata: - translated_metadata[codemeta_key] = merge_values( - translated_metadata[codemeta_key], v) - else: - translated_metadata[codemeta_key] = v - if normalize: - return self.normalize_translation(translated_metadata) - else: - return translated_metadata - - -class JsonMapping(DictMapping, SingleFileMapping): - """Base class for all mappings that use a JSON file as input.""" - - def translate(self, raw_content): - """ - Translates content by parsing content from a bytestring containing - json data and translating with the appropriate mapping - - Args: - raw_content (bytes): raw content to translate - - Returns: - dict: translated metadata in json-friendly form needed for - the indexer - - """ - try: - raw_content = raw_content.decode() - except UnicodeDecodeError: - self.log.warning('Error unidecoding from %s', self.log_suffix) - return - try: - content_dict = json.loads(raw_content) - except json.JSONDecodeError: - self.log.warning('Error unjsoning from %s', self.log_suffix) - return - if isinstance(content_dict, dict): - return self._translate_dict(content_dict) - - -@register_mapping -class NpmMapping(JsonMapping): - """ - dedicated class for NPM (package.json) mapping and translation - """ - name = 'npm' - mapping = CROSSWALK_TABLE['NodeJS'] - filename = b'package.json' - string_fields = ['name', 'version', 'homepage', 'description', 'email'] - - _schema_shortcuts = { - 'github': 'git+https://github.com/%s.git', - 'gist': 'git+https://gist.github.com/%s.git', - 'gitlab': 'git+https://gitlab.com/%s.git', - # Bitbucket supports both hg and git, and the shortcut does not - # tell which one to use. 
- # 'bitbucket': 'https://bitbucket.org/', - } - - def normalize_repository(self, d): - """https://docs.npmjs.com/files/package.json#repository - - >>> NpmMapping().normalize_repository({ - ... 'type': 'git', - ... 'url': 'https://example.org/foo.git' - ... }) - {'@id': 'git+https://example.org/foo.git'} - >>> NpmMapping().normalize_repository( - ... 'gitlab:foo/bar') - {'@id': 'git+https://gitlab.com/foo/bar.git'} - >>> NpmMapping().normalize_repository( - ... 'foo/bar') - {'@id': 'git+https://github.com/foo/bar.git'} - """ - if isinstance(d, dict) and isinstance(d.get('type'), str) \ - and isinstance(d.get('url'), str): - url = '{type}+{url}'.format(**d) - elif isinstance(d, str): - if '://' in d: - url = d - elif ':' in d: - (schema, rest) = d.split(':', 1) - if schema in self._schema_shortcuts: - url = self._schema_shortcuts[schema] % rest - else: - return None - else: - url = self._schema_shortcuts['github'] % d - - else: - return None - - return {'@id': url} - - def normalize_bugs(self, d): - """https://docs.npmjs.com/files/package.json#bugs - - >>> NpmMapping().normalize_bugs({ - ... 'url': 'https://example.org/bugs/', - ... 'email': 'bugs@example.org' - ... }) - {'@id': 'https://example.org/bugs/'} - >>> NpmMapping().normalize_bugs( - ... 'https://example.org/bugs/') - {'@id': 'https://example.org/bugs/'} - """ - if isinstance(d, dict) and isinstance(d.get('url'), str): - return {'@id': d['url']} - elif isinstance(d, str): - return {'@id': d} - else: - return None - - _parse_author = re.compile(r'^ *' - r'(?P.*?)' - r'( +<(?P.*)>)?' - r'( +\((?P.*)\))?' - r' *$') - - def normalize_author(self, d): - """https://docs.npmjs.com/files/package.json#people-fields-author-contributors' - - >>> from pprint import pprint - >>> pprint(NpmMapping().normalize_author({ - ... 'name': 'John Doe', - ... 'email': 'john.doe@example.org', - ... 'url': 'https://example.org/~john.doe', - ... })) - {'@list': [{'@type': 'http://schema.org/Person', - 'http://schema.org/email': 'john.doe@example.org', - 'http://schema.org/name': 'John Doe', - 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} - >>> pprint(NpmMapping().normalize_author( - ... 'John Doe (https://example.org/~john.doe)' - ... 
)) - {'@list': [{'@type': 'http://schema.org/Person', - 'http://schema.org/email': 'john.doe@example.org', - 'http://schema.org/name': 'John Doe', - 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} - """ # noqa - author = {'@type': SCHEMA_URI+'Person'} - if isinstance(d, dict): - name = d.get('name', None) - email = d.get('email', None) - url = d.get('url', None) - elif isinstance(d, str): - match = self._parse_author.match(d) - name = match.group('name') - email = match.group('email') - url = match.group('url') - else: - return None - if name and isinstance(name, str): - author[SCHEMA_URI+'name'] = name - if email and isinstance(email, str): - author[SCHEMA_URI+'email'] = email - if url and isinstance(url, str): - author[SCHEMA_URI+'url'] = {'@id': url} - return {"@list": [author]} - - def normalize_license(self, s): - """https://docs.npmjs.com/files/package.json#license - - >>> NpmMapping().normalize_license('MIT') - {'@id': 'https://spdx.org/licenses/MIT'} - """ - if isinstance(s, str): - return {"@id": "https://spdx.org/licenses/" + s} - else: - return None - - def normalize_homepage(self, s): - """https://docs.npmjs.com/files/package.json#homepage - - >>> NpmMapping().normalize_homepage('https://example.org/~john.doe') - {'@id': 'https://example.org/~john.doe'} - """ - if isinstance(s, str): - return {"@id": s} - - def normalize_keywords(self, l): - """https://docs.npmjs.com/files/package.json#homepage - - >>> NpmMapping().normalize_keywords(['foo', 'bar']) - ['foo', 'bar'] - """ - if isinstance(l, list): - return [x for x in l if isinstance(x, str)] - - -@register_mapping -class CodemetaMapping(SingleFileMapping): - """ - dedicated class for CodeMeta (codemeta.json) mapping and translation - """ - name = 'codemeta' - filename = b'codemeta.json' - string_fields = None - - @classmethod - def supported_terms(cls): - return [term for term in CODEMETA_TERMS if not term.startswith('@')] - - def translate(self, content): - try: - return self.normalize_translation(expand( - json.loads(content.decode()))) - except Exception: - return None - - -@register_mapping -class MavenMapping(DictMapping, SingleFileMapping): - """ - dedicated class for Maven (pom.xml) mapping and translation - """ - name = 'maven' - filename = b'pom.xml' - mapping = CROSSWALK_TABLE['Java (Maven)'] - string_fields = ['name', 'version', 'description', 'email'] - - def translate(self, content): - try: - d = xmltodict.parse(content).get('project') or {} - except xml.parsers.expat.ExpatError: - self.log.warning('Error parsing XML from %s', self.log_suffix) - return None - except UnicodeDecodeError: - self.log.warning('Error unidecoding XML from %s', self.log_suffix) - return None - except (LookupError, ValueError): - # unknown encoding or multi-byte encoding - self.log.warning('Error detecting XML encoding from %s', - self.log_suffix) - return None - metadata = self._translate_dict(d, normalize=False) - metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) - metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) - return self.normalize_translation(metadata) - - _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} - - def parse_repositories(self, d): - """https://maven.apache.org/pom.html#Repositories - - >>> import xmltodict - >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... codehausSnapshots - ... Codehaus Snapshots - ... http://snapshots.maven.codehaus.org/maven2 - ... default - ... - ... - ... 
''') - >>> MavenMapping().parse_repositories(d) - """ - repositories = d.get('repositories') - if not repositories: - results = [self.parse_repository(d, self._default_repository)] - elif isinstance(repositories, dict): - repositories = repositories.get('repository') or [] - if not isinstance(repositories, list): - repositories = [repositories] - results = [self.parse_repository(d, repo) - for repo in repositories] - else: - results = [] - return [res for res in results if res] or None - - def parse_repository(self, d, repo): - if not isinstance(repo, dict): - return - if repo.get('layout', 'default') != 'default': - return # TODO ? - url = repo.get('url') - group_id = d.get('groupId') - artifact_id = d.get('artifactId') - if (isinstance(url, str) and isinstance(group_id, str) - and isinstance(artifact_id, str)): - repo = os.path.join(url, *group_id.split('.'), artifact_id) - return {"@id": repo} - - def normalize_groupId(self, id_): - """https://maven.apache.org/pom.html#Maven_Coordinates - - >>> MavenMapping().normalize_groupId('org.example') - {'@id': 'org.example'} - """ - if isinstance(id_, str): - return {"@id": id_} - - def parse_licenses(self, d): - """https://maven.apache.org/pom.html#Licenses - - >>> import xmltodict - >>> import json - >>> d = xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... - ... ''') - >>> print(json.dumps(d, indent=4)) - { - "licenses": { - "license": { - "name": "Apache License, Version 2.0", - "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" - } - } - } - >>> MavenMapping().parse_licenses(d) - [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] - - or, if there are more than one license: - - >>> import xmltodict - >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... - ... MIT License - ... https://opensource.org/licenses/MIT - ... - ... - ... ''') - >>> pprint(MavenMapping().parse_licenses(d)) - [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, - {'@id': 'https://opensource.org/licenses/MIT'}] - """ - - licenses = d.get('licenses') - if not isinstance(licenses, dict): - return - licenses = licenses.get('license') - if isinstance(licenses, dict): - licenses = [licenses] - elif not isinstance(licenses, list): - return - return [{"@id": license['url']} - for license in licenses - if isinstance(license, dict) - and isinstance(license.get('url'), str)] or None - - -_normalize_pkginfo_key = str.lower - - -class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy): - def header_fetch_parse(self, name, value): - if hasattr(value, 'name'): - return value - value = value.replace('\n ', '\n') - return self.header_factory(name, value) - - -@register_mapping -class PythonPkginfoMapping(DictMapping, SingleFileMapping): - """Dedicated class for Python's PKG-INFO mapping and translation. 
- - https://www.python.org/dev/peps/pep-0314/""" - name = 'pkg-info' - filename = b'PKG-INFO' - mapping = {_normalize_pkginfo_key(k): v - for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} - string_fields = ['name', 'version', 'description', 'summary', - 'author', 'author-email'] - - _parser = email.parser.BytesHeaderParser( - policy=LinebreakPreservingEmailPolicy()) - - def translate(self, content): - msg = self._parser.parsebytes(content) - d = {} - for (key, value) in msg.items(): - key = _normalize_pkginfo_key(key) - if value != 'UNKNOWN': - d.setdefault(key, []).append(value) - metadata = self._translate_dict(d, normalize=False) - if SCHEMA_URI+'author' in metadata or SCHEMA_URI+'email' in metadata: - metadata[SCHEMA_URI+'author'] = { - '@list': [{ - '@type': SCHEMA_URI+'Person', - SCHEMA_URI+'name': - metadata.pop(SCHEMA_URI+'author', [None])[0], - SCHEMA_URI+'email': - metadata.pop(SCHEMA_URI+'email', [None])[0], - }] - } - return self.normalize_translation(metadata) - - def normalize_home_page(self, urls): - return [{'@id': url} for url in urls] - - def normalize_keywords(self, keywords): - return list(itertools.chain.from_iterable( - s.split(' ') for s in keywords)) - - def normalize_license(self, licenses): - return [{'@id': license} for license in licenses] - - -@register_mapping -class GemspecMapping(DictMapping): - name = 'gemspec' - mapping = CROSSWALK_TABLE['Ruby Gem'] - string_fields = ['name', 'version', 'description', 'summary', 'email'] - - _re_spec_new = re.compile(r'.*Gem::Specification.new +(do|\{) +\|.*\|.*') - _re_spec_entry = re.compile(r'\s*\w+\.(?P\w+)\s*=\s*(?P.*)') - - @classmethod - def detect_metadata_files(cls, file_entries): - for entry in file_entries: - if entry['name'].endswith(b'.gemspec'): - return [entry['sha1']] - return [] - - def translate(self, raw_content): - try: - raw_content = raw_content.decode() - except UnicodeDecodeError: - self.log.warning('Error unidecoding from %s', self.log_suffix) - return - - # Skip lines before 'Gem::Specification.new' - lines = itertools.dropwhile( - lambda x: not self._re_spec_new.match(x), - raw_content.split('\n')) - - try: - next(lines) # Consume 'Gem::Specification.new' - except StopIteration: - self.log.warning('Could not find Gem::Specification in %s', - self.log_suffix) - return - - content_dict = {} - for line in lines: - match = self._re_spec_entry.match(line) - if match: - value = self.eval_ruby_expression(match.group('expr')) - if value: - content_dict[match.group('key')] = value - return self._translate_dict(content_dict) - - def eval_ruby_expression(self, expr): - """Very simple evaluator of Ruby expressions. - - >>> GemspecMapping().eval_ruby_expression('"Foo bar"') - 'Foo bar' - >>> GemspecMapping().eval_ruby_expression("'Foo bar'") - 'Foo bar' - >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']") - ['Foo', 'bar'] - >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze") - 'Foo bar' - >>> GemspecMapping().eval_ruby_expression( \ - "['Foo'.freeze, 'bar'.freeze]") - ['Foo', 'bar'] - """ - def evaluator(node): - if isinstance(node, ast.Str): - return node.s - elif isinstance(node, ast.List): - res = [] - for element in node.elts: - val = evaluator(element) - if not val: - return - res.append(val) - return res - - expr = expr.replace('.freeze', '') - try: - # We're parsing Ruby expressions here, but Python's - # ast.parse works for very simple Ruby expressions - # (mainly strings delimited with " or ', and lists - # of such strings). 
- tree = ast.parse(expr, mode='eval') - except (SyntaxError, ValueError): - return - if isinstance(tree, ast.Expression): - return evaluator(tree.body) - - def normalize_homepage(self, s): - if isinstance(s, str): - return {"@id": s} - - def normalize_license(self, s): - if isinstance(s, str): - return [{"@id": "https://spdx.org/licenses/" + s}] - - def normalize_licenses(self, licenses): - if isinstance(licenses, list): - return [{"@id": "https://spdx.org/licenses/" + license} - for license in licenses - if isinstance(license, str)] - - def normalize_author(self, author): - if isinstance(author, str): - return {"@list": [author]} - - def normalize_authors(self, authors): - if isinstance(authors, list): - return {"@list": [author for author in authors - if isinstance(author, str)]} - - -@click.command() -@click.argument('mapping_name') -@click.argument('file_name') -def main(mapping_name, file_name): - from pprint import pprint - with open(file_name, 'rb') as fd: - file_content = fd.read() - res = MAPPINGS[mapping_name]().translate(file_content) - pprint(res) - - -if __name__ == '__main__': - main() diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py new file mode 100644 index 0000000..107a8b3 --- /dev/null +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -0,0 +1,38 @@ +import collections + +import click + +from . import maven, npm, codemeta, python, ruby + +MAPPINGS = { + 'CodemetaMapping': codemeta.CodemetaMapping, + 'MavenMapping': maven.MavenMapping, + 'NpmMapping': npm.NpmMapping, + 'PythonPkginfoMapping': python.PythonPkginfoMapping, + 'GemspecMapping': ruby.GemspecMapping, +} + + +def list_terms(): + """Returns a dictionary with all supported CodeMeta terms as keys, + and the mappings that support each of them as values.""" + d = collections.defaultdict(set) + for mapping in MAPPINGS.values(): + for term in mapping.supported_terms(): + d[term].add(mapping) + return d + + +@click.command() +@click.argument('mapping_name') +@click.argument('file_name') +def main(mapping_name, file_name): + from pprint import pprint + with open(file_name, 'rb') as fd: + file_content = fd.read() + res = MAPPINGS[mapping_name]().translate(file_content) + pprint(res) + + +if __name__ == '__main__': + main() diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py new file mode 100644 index 0000000..9bc0ef5 --- /dev/null +++ b/swh/indexer/metadata_dictionary/base.py @@ -0,0 +1,211 @@ +# Copyright (C) 2017-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import abc +import json +import logging + +from swh.indexer.codemeta import SCHEMA_URI +from swh.indexer.codemeta import compact + + +def merge_values(v1, v2): + """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`, + returns `{"@list": l1 + l2}`. + Otherwise, make them lists (if they are not already) and concatenate + them. 
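Besides the doctests that follow, `merge_values` has an error path worth knowing: mixing a `{'@list': ...}` wrapper with a plain value is rejected by the `ValueError` branches in the body below. For example:

```python
from swh.indexer.metadata_dictionary.base import merge_values

merge_values({'@list': ['a']}, 'b')
# ValueError: Cannot merge {'@list': ['a']} and 'b'
```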
+ + >>> merge_values('a', 'b') + ['a', 'b'] + >>> merge_values(['a', 'b'], 'c') + ['a', 'b', 'c'] + >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']}) + {'@list': ['a', 'b', 'c']} + """ + if v1 is None: + return v2 + elif v2 is None: + return v1 + elif isinstance(v1, dict) and set(v1) == {'@list'}: + assert isinstance(v1['@list'], list) + if isinstance(v2, dict) and set(v2) == {'@list'}: + assert isinstance(v2['@list'], list) + return {'@list': v1['@list'] + v2['@list']} + else: + raise ValueError('Cannot merge %r and %r' % (v1, v2)) + else: + if isinstance(v2, dict) and '@list' in v2: + raise ValueError('Cannot merge %r and %r' % (v1, v2)) + if not isinstance(v1, list): + v1 = [v1] + if not isinstance(v2, list): + v2 = [v2] + return v1 + v2 + + +class BaseMapping(metaclass=abc.ABCMeta): + """Base class for mappings to inherit from + + To implement a new mapping: + + - inherit this class + - override translate function + """ + def __init__(self, log_suffix=''): + self.log_suffix = log_suffix + self.log = logging.getLogger('%s.%s' % ( + self.__class__.__module__, + self.__class__.__name__)) + + @property + @abc.abstractmethod + def name(self): + """A name of this mapping, used as an identifier in the + indexer storage.""" + pass + + @classmethod + @abc.abstractmethod + def detect_metadata_files(cls, files): + """ + Detects files potentially containing metadata + + Args: + file_entries (list): list of files + + Returns: + list: list of sha1 (possibly empty) + """ + pass + + @abc.abstractmethod + def translate(self, file_content): + pass + + def normalize_translation(self, metadata): + return compact(metadata) + + +class SingleFileMapping(BaseMapping): + """Base class for all mappings that use a single file as input.""" + + @property + @abc.abstractmethod + def filename(self): + """The .json file to extract metadata from.""" + pass + + @classmethod + def detect_metadata_files(cls, file_entries): + for entry in file_entries: + if entry['name'] == cls.filename: + return [entry['sha1']] + return [] + + +class DictMapping(BaseMapping): + """Base class for mappings that take as input a file that is mostly + a key-value store (eg. 
a shallow JSON dict).""" + + string_fields = [] + '''List of fields that are simple strings, and don't need any + normalization.''' + + @property + @abc.abstractmethod + def mapping(self): + """A translation dict to map dict keys into a canonical name.""" + pass + + @staticmethod + def _normalize_method_name(name): + return name.replace('-', '_') + + @classmethod + def supported_terms(cls): + return { + term for (key, term) in cls.mapping.items() + if key in cls.string_fields + or hasattr(cls, 'translate_' + cls._normalize_method_name(key)) + or hasattr(cls, 'normalize_' + cls._normalize_method_name(key))} + + def _translate_dict(self, content_dict, *, normalize=True): + """ + Translates content by parsing content from a dict object + and translating with the appropriate mapping + + Args: + content_dict (dict): content dict to translate + + Returns: + dict: translated metadata in json-friendly form needed for + the indexer + + """ + translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} + for k, v in content_dict.items(): + # First, check if there is a specific translation + # method for this key + translation_method = getattr( + self, 'translate_' + self._normalize_method_name(k), None) + if translation_method: + translation_method(translated_metadata, v) + elif k in self.mapping: + # if there is no method, but the key is known from the + # crosswalk table + codemeta_key = self.mapping[k] + + # if there is a normalization method, use it on the value + normalization_method = getattr( + self, 'normalize_' + self._normalize_method_name(k), None) + if normalization_method: + v = normalization_method(v) + elif k in self.string_fields and isinstance(v, str): + pass + elif k in self.string_fields and isinstance(v, list): + v = [x for x in v if isinstance(x, str)] + else: + continue + + # set the translation metadata with the normalized value + if codemeta_key in translated_metadata: + translated_metadata[codemeta_key] = merge_values( + translated_metadata[codemeta_key], v) + else: + translated_metadata[codemeta_key] = v + if normalize: + return self.normalize_translation(translated_metadata) + else: + return translated_metadata + + +class JsonMapping(DictMapping, SingleFileMapping): + """Base class for all mappings that use a JSON file as input.""" + + def translate(self, raw_content): + """ + Translates content by parsing content from a bytestring containing + json data and translating with the appropriate mapping + + Args: + raw_content (bytes): raw content to translate + + Returns: + dict: translated metadata in json-friendly form needed for + the indexer + + """ + try: + raw_content = raw_content.decode() + except UnicodeDecodeError: + self.log.warning('Error unidecoding from %s', self.log_suffix) + return + try: + content_dict = json.loads(raw_content) + except json.JSONDecodeError: + self.log.warning('Error unjsoning from %s', self.log_suffix) + return + if isinstance(content_dict, dict): + return self._translate_dict(content_dict) diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py new file mode 100644 index 0000000..bfb336c --- /dev/null +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -0,0 +1,30 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json + +from swh.indexer.codemeta import CODEMETA_TERMS +from 
swh.indexer.codemeta import expand +from .base import SingleFileMapping + + +class CodemetaMapping(SingleFileMapping): + """ + dedicated class for CodeMeta (codemeta.json) mapping and translation + """ + name = 'codemeta' + filename = b'codemeta.json' + string_fields = None + + @classmethod + def supported_terms(cls): + return [term for term in CODEMETA_TERMS if not term.startswith('@')] + + def translate(self, content): + try: + return self.normalize_translation(expand( + json.loads(content.decode()))) + except Exception: + return None diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py new file mode 100644 index 0000000..38592ba --- /dev/null +++ b/swh/indexer/metadata_dictionary/maven.py @@ -0,0 +1,154 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import xml.parsers.expat + +import xmltodict + +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from .base import DictMapping, SingleFileMapping + + +class MavenMapping(DictMapping, SingleFileMapping): + """ + dedicated class for Maven (pom.xml) mapping and translation + """ + name = 'maven' + filename = b'pom.xml' + mapping = CROSSWALK_TABLE['Java (Maven)'] + string_fields = ['name', 'version', 'description', 'email'] + + def translate(self, content): + try: + d = xmltodict.parse(content).get('project') or {} + except xml.parsers.expat.ExpatError: + self.log.warning('Error parsing XML from %s', self.log_suffix) + return None + except UnicodeDecodeError: + self.log.warning('Error unidecoding XML from %s', self.log_suffix) + return None + except (LookupError, ValueError): + # unknown encoding or multi-byte encoding + self.log.warning('Error detecting XML encoding from %s', + self.log_suffix) + return None + metadata = self._translate_dict(d, normalize=False) + metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) + metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) + return self.normalize_translation(metadata) + + _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} + + def parse_repositories(self, d): + """https://maven.apache.org/pom.html#Repositories + + >>> import xmltodict + >>> from pprint import pprint + >>> d = xmltodict.parse(''' + ... + ... + ... codehausSnapshots + ... Codehaus Snapshots + ... http://snapshots.maven.codehaus.org/maven2 + ... default + ... + ... + ... ''') + >>> MavenMapping().parse_repositories(d) + """ + repositories = d.get('repositories') + if not repositories: + results = [self.parse_repository(d, self._default_repository)] + elif isinstance(repositories, dict): + repositories = repositories.get('repository') or [] + if not isinstance(repositories, list): + repositories = [repositories] + results = [self.parse_repository(d, repo) + for repo in repositories] + else: + results = [] + return [res for res in results if res] or None + + def parse_repository(self, d, repo): + if not isinstance(repo, dict): + return + if repo.get('layout', 'default') != 'default': + return # TODO ? 
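# Illustration with hypothetical Maven coordinates: given the default repository
# url 'https://repo.maven.apache.org/maven2/', a groupId of 'org.example' and an
# artifactId of 'foo', the os.path.join below derives
# {'@id': 'https://repo.maven.apache.org/maven2/org/example/foo'}.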
+ url = repo.get('url') + group_id = d.get('groupId') + artifact_id = d.get('artifactId') + if (isinstance(url, str) and isinstance(group_id, str) + and isinstance(artifact_id, str)): + repo = os.path.join(url, *group_id.split('.'), artifact_id) + return {"@id": repo} + + def normalize_groupId(self, id_): + """https://maven.apache.org/pom.html#Maven_Coordinates + + >>> MavenMapping().normalize_groupId('org.example') + {'@id': 'org.example'} + """ + if isinstance(id_, str): + return {"@id": id_} + + def parse_licenses(self, d): + """https://maven.apache.org/pom.html#Licenses + + >>> import xmltodict + >>> import json + >>> d = xmltodict.parse(''' + ... + ... + ... Apache License, Version 2.0 + ... https://www.apache.org/licenses/LICENSE-2.0.txt + ... + ... + ... ''') + >>> print(json.dumps(d, indent=4)) + { + "licenses": { + "license": { + "name": "Apache License, Version 2.0", + "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" + } + } + } + >>> MavenMapping().parse_licenses(d) + [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] + + or, if there are more than one license: + + >>> import xmltodict + >>> from pprint import pprint + >>> d = xmltodict.parse(''' + ... + ... + ... Apache License, Version 2.0 + ... https://www.apache.org/licenses/LICENSE-2.0.txt + ... + ... + ... MIT License + ... https://opensource.org/licenses/MIT + ... + ... + ... ''') + >>> pprint(MavenMapping().parse_licenses(d)) + [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, + {'@id': 'https://opensource.org/licenses/MIT'}] + """ + + licenses = d.get('licenses') + if not isinstance(licenses, dict): + return + licenses = licenses.get('license') + if isinstance(licenses, dict): + licenses = [licenses] + elif not isinstance(licenses, list): + return + return [{"@id": license['url']} + for license in licenses + if isinstance(license, dict) + and isinstance(license.get('url'), str)] or None diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py new file mode 100644 index 0000000..659fe77 --- /dev/null +++ b/swh/indexer/metadata_dictionary/npm.py @@ -0,0 +1,156 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import re + +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from .base import JsonMapping + + +class NpmMapping(JsonMapping): + """ + dedicated class for NPM (package.json) mapping and translation + """ + name = 'npm' + mapping = CROSSWALK_TABLE['NodeJS'] + filename = b'package.json' + string_fields = ['name', 'version', 'homepage', 'description', 'email'] + + _schema_shortcuts = { + 'github': 'git+https://github.com/%s.git', + 'gist': 'git+https://gist.github.com/%s.git', + 'gitlab': 'git+https://gitlab.com/%s.git', + # Bitbucket supports both hg and git, and the shortcut does not + # tell which one to use. + # 'bitbucket': 'https://bitbucket.org/', + } + + def normalize_repository(self, d): + """https://docs.npmjs.com/files/package.json#repository + + >>> NpmMapping().normalize_repository({ + ... 'type': 'git', + ... 'url': 'https://example.org/foo.git' + ... }) + {'@id': 'git+https://example.org/foo.git'} + >>> NpmMapping().normalize_repository( + ... 'gitlab:foo/bar') + {'@id': 'git+https://gitlab.com/foo/bar.git'} + >>> NpmMapping().normalize_repository( + ... 
'foo/bar') + {'@id': 'git+https://github.com/foo/bar.git'} + """ + if isinstance(d, dict) and isinstance(d.get('type'), str) \ + and isinstance(d.get('url'), str): + url = '{type}+{url}'.format(**d) + elif isinstance(d, str): + if '://' in d: + url = d + elif ':' in d: + (schema, rest) = d.split(':', 1) + if schema in self._schema_shortcuts: + url = self._schema_shortcuts[schema] % rest + else: + return None + else: + url = self._schema_shortcuts['github'] % d + + else: + return None + + return {'@id': url} + + def normalize_bugs(self, d): + """https://docs.npmjs.com/files/package.json#bugs + + >>> NpmMapping().normalize_bugs({ + ... 'url': 'https://example.org/bugs/', + ... 'email': 'bugs@example.org' + ... }) + {'@id': 'https://example.org/bugs/'} + >>> NpmMapping().normalize_bugs( + ... 'https://example.org/bugs/') + {'@id': 'https://example.org/bugs/'} + """ + if isinstance(d, dict) and isinstance(d.get('url'), str): + return {'@id': d['url']} + elif isinstance(d, str): + return {'@id': d} + else: + return None + + _parse_author = re.compile(r'^ *' + r'(?P.*?)' + r'( +<(?P.*)>)?' + r'( +\((?P.*)\))?' + r' *$') + + def normalize_author(self, d): + """https://docs.npmjs.com/files/package.json#people-fields-author-contributors' + + >>> from pprint import pprint + >>> pprint(NpmMapping().normalize_author({ + ... 'name': 'John Doe', + ... 'email': 'john.doe@example.org', + ... 'url': 'https://example.org/~john.doe', + ... })) + {'@list': [{'@type': 'http://schema.org/Person', + 'http://schema.org/email': 'john.doe@example.org', + 'http://schema.org/name': 'John Doe', + 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} + >>> pprint(NpmMapping().normalize_author( + ... 'John Doe (https://example.org/~john.doe)' + ... )) + {'@list': [{'@type': 'http://schema.org/Person', + 'http://schema.org/email': 'john.doe@example.org', + 'http://schema.org/name': 'John Doe', + 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} + """ # noqa + author = {'@type': SCHEMA_URI+'Person'} + if isinstance(d, dict): + name = d.get('name', None) + email = d.get('email', None) + url = d.get('url', None) + elif isinstance(d, str): + match = self._parse_author.match(d) + name = match.group('name') + email = match.group('email') + url = match.group('url') + else: + return None + if name and isinstance(name, str): + author[SCHEMA_URI+'name'] = name + if email and isinstance(email, str): + author[SCHEMA_URI+'email'] = email + if url and isinstance(url, str): + author[SCHEMA_URI+'url'] = {'@id': url} + return {"@list": [author]} + + def normalize_license(self, s): + """https://docs.npmjs.com/files/package.json#license + + >>> NpmMapping().normalize_license('MIT') + {'@id': 'https://spdx.org/licenses/MIT'} + """ + if isinstance(s, str): + return {"@id": "https://spdx.org/licenses/" + s} + + def normalize_homepage(self, s): + """https://docs.npmjs.com/files/package.json#homepage + + >>> NpmMapping().normalize_homepage('https://example.org/~john.doe') + {'@id': 'https://example.org/~john.doe'} + """ + if isinstance(s, str): + return {"@id": s} + + def normalize_keywords(self, l): + """https://docs.npmjs.com/files/package.json#homepage + + >>> NpmMapping().normalize_keywords(['foo', 'bar']) + ['foo', 'bar'] + """ + if isinstance(l, list): + return [x for x in l if isinstance(x, str)] diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py new file mode 100644 index 0000000..4bcb81b --- /dev/null +++ b/swh/indexer/metadata_dictionary/python.py @@ 
-0,0 +1,67 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import email.parser +import email.policy +import itertools + +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from .base import DictMapping, SingleFileMapping + + +_normalize_pkginfo_key = str.lower + + +class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy): + def header_fetch_parse(self, name, value): + if hasattr(value, 'name'): + return value + value = value.replace('\n ', '\n') + return self.header_factory(name, value) + + +class PythonPkginfoMapping(DictMapping, SingleFileMapping): + """Dedicated class for Python's PKG-INFO mapping and translation. + + https://www.python.org/dev/peps/pep-0314/""" + name = 'pkg-info' + filename = b'PKG-INFO' + mapping = {_normalize_pkginfo_key(k): v + for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} + string_fields = ['name', 'version', 'description', 'summary', + 'author', 'author-email'] + + _parser = email.parser.BytesHeaderParser( + policy=LinebreakPreservingEmailPolicy()) + + def translate(self, content): + msg = self._parser.parsebytes(content) + d = {} + for (key, value) in msg.items(): + key = _normalize_pkginfo_key(key) + if value != 'UNKNOWN': + d.setdefault(key, []).append(value) + metadata = self._translate_dict(d, normalize=False) + if SCHEMA_URI+'author' in metadata or SCHEMA_URI+'email' in metadata: + metadata[SCHEMA_URI+'author'] = { + '@list': [{ + '@type': SCHEMA_URI+'Person', + SCHEMA_URI+'name': + metadata.pop(SCHEMA_URI+'author', [None])[0], + SCHEMA_URI+'email': + metadata.pop(SCHEMA_URI+'email', [None])[0], + }] + } + return self.normalize_translation(metadata) + + def normalize_home_page(self, urls): + return [{'@id': url} for url in urls] + + def normalize_keywords(self, keywords): + return list(itertools.chain.from_iterable( + s.split(' ') for s in keywords)) + + def normalize_license(self, licenses): + return [{'@id': license} for license in licenses] diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py new file mode 100644 index 0000000..8d5b4a7 --- /dev/null +++ b/swh/indexer/metadata_dictionary/ruby.py @@ -0,0 +1,117 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import ast +import itertools +import re + +from swh.indexer.codemeta import CROSSWALK_TABLE +from .base import DictMapping + + +class GemspecMapping(DictMapping): + name = 'gemspec' + mapping = CROSSWALK_TABLE['Ruby Gem'] + string_fields = ['name', 'version', 'description', 'summary', 'email'] + + _re_spec_new = re.compile(r'.*Gem::Specification.new +(do|\{) +\|.*\|.*') + _re_spec_entry = re.compile(r'\s*\w+\.(?P\w+)\s*=\s*(?P.*)') + + @classmethod + def detect_metadata_files(cls, file_entries): + for entry in file_entries: + if entry['name'].endswith(b'.gemspec'): + return [entry['sha1']] + return [] + + def translate(self, raw_content): + try: + raw_content = raw_content.decode() + except UnicodeDecodeError: + self.log.warning('Error unidecoding from %s', self.log_suffix) + return + + # Skip lines before 'Gem::Specification.new' + lines = itertools.dropwhile( + lambda x: not self._re_spec_new.match(x), + 
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
new file mode 100644
index 0000000..8d5b4a7
--- /dev/null
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -0,0 +1,117 @@
+# Copyright (C) 2018-2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import ast
+import itertools
+import re
+
+from swh.indexer.codemeta import CROSSWALK_TABLE
+from .base import DictMapping
+
+
+class GemspecMapping(DictMapping):
+    name = 'gemspec'
+    mapping = CROSSWALK_TABLE['Ruby Gem']
+    string_fields = ['name', 'version', 'description', 'summary', 'email']
+
+    _re_spec_new = re.compile(r'.*Gem::Specification.new +(do|\{) +\|.*\|.*')
+    _re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)')
+
+    @classmethod
+    def detect_metadata_files(cls, file_entries):
+        for entry in file_entries:
+            if entry['name'].endswith(b'.gemspec'):
+                return [entry['sha1']]
+        return []
+
+    def translate(self, raw_content):
+        try:
+            raw_content = raw_content.decode()
+        except UnicodeDecodeError:
+            self.log.warning('Error unidecoding from %s', self.log_suffix)
+            return
+
+        # Skip lines before 'Gem::Specification.new'
+        lines = itertools.dropwhile(
+            lambda x: not self._re_spec_new.match(x),
+            raw_content.split('\n'))
+
+        try:
+            next(lines)  # Consume 'Gem::Specification.new'
+        except StopIteration:
+            self.log.warning('Could not find Gem::Specification in %s',
+                             self.log_suffix)
+            return
+
+        content_dict = {}
+        for line in lines:
+            match = self._re_spec_entry.match(line)
+            if match:
+                value = self.eval_ruby_expression(match.group('expr'))
+                if value:
+                    content_dict[match.group('key')] = value
+        return self._translate_dict(content_dict)
+
+    def eval_ruby_expression(self, expr):
+        """Very simple evaluator of Ruby expressions.
+
+        >>> GemspecMapping().eval_ruby_expression('"Foo bar"')
+        'Foo bar'
+        >>> GemspecMapping().eval_ruby_expression("'Foo bar'")
+        'Foo bar'
+        >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']")
+        ['Foo', 'bar']
+        >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze")
+        'Foo bar'
+        >>> GemspecMapping().eval_ruby_expression( \
+                "['Foo'.freeze, 'bar'.freeze]")
+        ['Foo', 'bar']
+        """
+        def evaluator(node):
+            if isinstance(node, ast.Str):
+                return node.s
+            elif isinstance(node, ast.List):
+                res = []
+                for element in node.elts:
+                    val = evaluator(element)
+                    if not val:
+                        return
+                    res.append(val)
+                return res
+
+        expr = expr.replace('.freeze', '')
+        try:
+            # We're parsing Ruby expressions here, but Python's
+            # ast.parse works for very simple Ruby expressions
+            # (mainly strings delimited with " or ', and lists
+            # of such strings).
+            tree = ast.parse(expr, mode='eval')
+        except (SyntaxError, ValueError):
+            return
+        if isinstance(tree, ast.Expression):
+            return evaluator(tree.body)
+
+    def normalize_homepage(self, s):
+        if isinstance(s, str):
+            return {"@id": s}
+
+    def normalize_license(self, s):
+        if isinstance(s, str):
+            return [{"@id": "https://spdx.org/licenses/" + s}]
+
+    def normalize_licenses(self, licenses):
+        if isinstance(licenses, list):
+            return [{"@id": "https://spdx.org/licenses/" + license}
+                    for license in licenses
+                    if isinstance(license, str)]
+
+    def normalize_author(self, author):
+        if isinstance(author, str):
+            return {"@list": [author]}
+
+    def normalize_authors(self, authors):
+        if isinstance(authors, list):
+            return {"@list": [author for author in authors
+                              if isinstance(author, str)]}
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
index cd8c2f5..841c17e 100644
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -1,915 +1,917 @@
 # Copyright (C) 2015-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import psycopg2
 
 from collections import defaultdict
 
 from swh.core.api import remote_api_endpoint
 from swh.storage.common import db_transaction_generator, db_transaction
 from swh.storage.exc import StorageDBError
 
 from .db import Db
 from . import converters
 
 
 INDEXER_CFG_KEY = 'indexer_storage'
 
 MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info']
 
 
 def get_indexer_storage(cls, args):
     """Get an indexer storage object of class `cls` with arguments `args`.
 
     Args:
         cls (str): storage's class, either 'local' or 'remote'
         args (dict): dictionary of arguments passed to the
             storage class constructor
 
     Returns:
         an instance of swh.indexer's storage (either local or remote)
 
     Raises:
         ValueError if passed an unknown storage class.
 
     """
     if cls == 'remote':
         from .api.client import RemoteStorage as IndexerStorage
     elif cls == 'local':
         from .
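    # Illustrative usage of this factory, kept as comments since it sits
    # inside the module source. A minimal sketch; only the classes
    # dispatched here ('remote', 'local', 'memory') are valid inputs:
    #
    #     from swh.indexer.storage import get_indexer_storage
    #     idx_storage = get_indexer_storage('memory', {})
    #     get_indexer_storage('foo', {})   # raises ValueError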
import IndexerStorage elif cls == 'memory': from .in_memory import IndexerStorage else: raise ValueError('Unknown indexer storage class `%s`' % cls) return IndexerStorage(**args) def _check_id_duplicates(data): """ If any two dictionaries in `data` have the same id, raises a `ValueError`. Values associated to the key must be hashable. Args: data (List[dict]): List of dictionaries to be inserted >>> _check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'bar', 'data': 'egg'}, ... ]) >>> _check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'foo', 'data': 'egg'}, ... ]) Traceback (most recent call last): ... ValueError: The same id is present more than once. """ if len({item['id'] for item in data}) < len(data): raise ValueError('The same id is present more than once.') class IndexerStorage: """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection """ try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) + def put_db(self, db): + if db is not self._db: + db.put_conn() + @remote_api_endpoint('check_config') - def check_config(self, *, check_write): + @db_transaction() + def check_config(self, *, check_write, db=None, cur=None): """Check that the storage is configured and ready to go.""" # Check permissions on one of the tables - with self.get_db().transaction() as cur: - if check_write: - check = 'INSERT' - else: - check = 'SELECT' - - cur.execute( - "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa - (check,) - ) - return cur.fetchone()[0] + if check_write: + check = 'INSERT' + else: + check = 'SELECT' - return True + cur.execute( + "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa + (check,) + ) + return cur.fetchone()[0] @remote_api_endpoint('content_mimetype/missing') @db_transaction_generator() def content_mimetype_missing(self, mimetypes, db=None, cur=None): """Generate mimetypes missing from storage. Args: mimetypes (iterable): iterable of dict with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: tuple (id, indexer_configuration_id): missing id """ for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] def _content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, with_textual_data=False, db=None, cur=None): """Retrieve ids of type content_type within range [start, end] bound by limit. Args: **content_type** (str): content's type (mimetype, language, etc...) **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) **with_textual_data** (bool): Deal with only textual content (True) or all content (all contents by defaults, False) Raises: ValueError for; - limit to None - wrong content_type provided Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. 
- **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ if limit is None: raise ValueError('Development error: limit should not be None') if content_type not in db.content_indexer_names: err = 'Development error: Wrong type. Should be one of [%s]' % ( ','.join(db.content_indexer_names)) raise ValueError(err) ids = [] next_id = None for counter, obj in enumerate(db.content_get_range( content_type, start, end, indexer_configuration_id, limit=limit+1, with_textual_data=with_textual_data, cur=cur)): _id = obj[0] if counter >= limit: next_id = _id break ids.append(_id) return { 'ids': ids, 'next': next_id } @remote_api_endpoint('content_mimetype/range') @db_transaction() def content_mimetype_get_range(self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): """Retrieve mimetypes within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self._content_get_range('mimetype', start, end, indexer_configuration_id, limit=limit, db=db, cur=cur) @remote_api_endpoint('content_mimetype/add') @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): """Add mimetypes not present in storage. Args: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **indexer_configuration_id** (int): tool's id used to compute the results - **conflict_update** (bool): Flag to determine if we want to overwrite (``True``) or skip duplicates (``False``, the default) """ _check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m['id']) db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], cur) db.content_mimetype_add_from_temp(conflict_update, cur) @remote_api_endpoint('content_mimetype') @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): """Retrieve full content mimetype per ids. Args: ids (iterable): sha1 identifier Yields: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **tool** (dict): Tool used to compute the language """ for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) @remote_api_endpoint('content_language/missing') @db_transaction_generator() def content_language_missing(self, languages, db=None, cur=None): """List languages missing from storage. Args: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] @remote_api_endpoint('content_language') @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): """Retrieve full content language per ids. 
Args: ids (iterable): sha1 identifier Yields: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **lang** (bytes): raw content's language - **tool** (dict): Tool used to compute the language """ for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) @remote_api_endpoint('content_language/add') @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): """Add languages not present in storage. Args: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 - **lang** (bytes): language detected conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ _check_id_duplicates(languages) languages.sort(key=lambda m: m['id']) db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ({ 'id': l['id'], 'lang': 'unknown' if not l['lang'] else l['lang'], 'indexer_configuration_id': l['indexer_configuration_id'], } for l in languages), 'tmp_content_language', ['id', 'lang', 'indexer_configuration_id'], cur) db.content_language_add_from_temp(conflict_update, cur) @remote_api_endpoint('content/ctags/missing') @db_transaction_generator() def content_ctags_missing(self, ctags, db=None, cur=None): """List ctags missing from storage. Args: ctags (iterable): dicts with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ for obj in db.content_ctags_missing_from_list(ctags, cur): yield obj[0] @remote_api_endpoint('content/ctags') @db_transaction_generator() def content_ctags_get(self, ids, db=None, cur=None): """Retrieve ctags per id. Args: ids (iterable): sha1 checksums Yields: Dictionaries with keys: - **id** (bytes): content's identifier - **name** (str): symbol's name - **kind** (str): symbol's kind - **lang** (str): language for that content - **tool** (dict): tool used to compute the ctags' info """ for c in db.content_ctags_get_from_list(ids, cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) @remote_api_endpoint('content/ctags/add') @db_transaction() def content_ctags_add(self, ctags, conflict_update=False, db=None, cur=None): """Add ctags not present in storage Args: ctags (iterable): dictionaries with keys: - **id** (bytes): sha1 - **ctags** ([list): List of dictionary with keys: name, kind, line, lang """ _check_id_duplicates(ctags) ctags.sort(key=lambda m: m['id']) def _convert_ctags(__ctags): """Convert ctags dict to list of ctags. """ for ctags in __ctags: yield from converters.ctags_to_db(ctags) db.mktemp_content_ctags(cur) db.copy_to(list(_convert_ctags(ctags)), tblname='tmp_content_ctags', columns=['id', 'name', 'kind', 'line', 'lang', 'indexer_configuration_id'], cur=cur) db.content_ctags_add_from_temp(conflict_update, cur) @remote_api_endpoint('content/ctags/search') @db_transaction_generator() def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): """Search through content's raw ctags symbols. Args: expression (str): Expression to search for limit (int): Number of rows to return (default to 10). last_sha1 (str): Offset from which retrieving data (default to ''). Yields: rows of ctags including id, name, lang, kind, line, etc... 
""" for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) @remote_api_endpoint('content/fossology_license') @db_transaction_generator() def content_fossology_license_get(self, ids, db=None, cur=None): """Retrieve licenses per id. Args: ids (iterable): sha1 checksums Yields: dict: ``{id: facts}`` where ``facts`` is a dict with the following keys: - **licenses** ([str]): associated licenses for that content - **tool** (dict): Tool used to compute the license """ d = defaultdict(list) for c in db.content_fossology_license_get_from_list(ids, cur): license = dict(zip(db.content_fossology_license_cols, c)) id_ = license['id'] d[id_].append(converters.db_to_fossology_license(license)) for id_, facts in d.items(): yield {id_: facts} @remote_api_endpoint('content/fossology_license/add') @db_transaction() def content_fossology_license_add(self, licenses, conflict_update=False, db=None, cur=None): """Add licenses not present in storage. Args: licenses (iterable): dictionaries with keys: - **id**: sha1 - **licenses** ([bytes]): List of licenses associated to sha1 - **tool** (str): nomossa conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) Returns: list: content_license entries which failed due to unknown licenses """ _check_id_duplicates(licenses) licenses.sort(key=lambda m: m['id']) db.mktemp_content_fossology_license(cur) db.copy_to( ({ 'id': sha1['id'], 'indexer_configuration_id': sha1['indexer_configuration_id'], 'license': license, } for sha1 in licenses for license in sha1['licenses']), tblname='tmp_content_fossology_license', columns=['id', 'license', 'indexer_configuration_id'], cur=cur) db.content_fossology_license_add_from_temp(conflict_update, cur) @remote_api_endpoint('content/fossology_license/range') @db_transaction() def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): """Retrieve licenses within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self._content_get_range('fossology_license', start, end, indexer_configuration_id, limit=limit, with_textual_data=True, db=db, cur=cur) @remote_api_endpoint('content_metadata/missing') @db_transaction_generator() def content_metadata_missing(self, metadata, db=None, cur=None): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ for obj in db.content_metadata_missing_from_list(metadata, cur): yield obj[0] @remote_api_endpoint('content_metadata') @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): """Retrieve metadata per id. 
Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: id (bytes) metadata (str): associated metadata tool (dict): tool used to compute metadata """ for c in db.content_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) @remote_api_endpoint('content_metadata/add') @db_transaction() def content_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1 - **metadata**: arbitrary dict conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_content_metadata(cur) db.copy_to(metadata, 'tmp_content_metadata', ['id', 'metadata', 'indexer_configuration_id'], cur) db.content_metadata_add_from_temp(conflict_update, cur) @remote_api_endpoint('revision_intrinsic_metadata/missing') @db_transaction_generator() def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1_git revision identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing ids """ for obj in db.revision_intrinsic_metadata_missing_from_list( metadata, cur): yield obj[0] @remote_api_endpoint('revision_intrinsic_metadata') @db_transaction_generator() def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): """Retrieve revision metadata per id. Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - **id** (bytes) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.revision_intrinsic_metadata_cols, c))) @remote_api_endpoint('revision_intrinsic_metadata/add') @db_transaction() def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_revision_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_revision_intrinsic_metadata', ['id', 'metadata', 'mappings', 'indexer_configuration_id'], cur) db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) @remote_api_endpoint('revision_intrinsic_metadata/delete') @db_transaction() def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): """Remove revision metadata from the storage. Args: entries (dict): dictionaries with the following keys: - **id** (bytes): revision identifier - **indexer_configuration_id** (int): tool used to compute metadata """ db.revision_intrinsic_metadata_delete(entries, cur) @remote_api_endpoint('origin_intrinsic_metadata') @db_transaction_generator() def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): """Retrieve origin metadata per id. 
Args: ids (iterable): origin identifiers Yields: list: dictionaries with the following keys: - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @remote_api_endpoint('origin_intrinsic_metadata/add') @db_transaction() def origin_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): """Add origin metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: origin identifier - **from_revision**: sha1 id of the revision used to generate these metadata. - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', ['id', 'metadata', 'indexer_configuration_id', 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) @remote_api_endpoint('origin_intrinsic_metadata/delete') @db_transaction() def origin_intrinsic_metadata_delete( self, entries, db=None, cur=None): """Remove origin metadata from the storage. Args: entries (dict): dictionaries with the following keys: - **id** (int): origin identifier - **indexer_configuration_id** (int): tool used to compute metadata """ db.origin_intrinsic_metadata_delete(entries, cur) @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext') @db_transaction_generator() def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100, db=None, cur=None): """Returns the list of origins whose metadata contain all the terms. Args: conjunction (List[str]): List of terms to be searched for. limit (int): The maximum number of results to return Yields: list: dictionaries with the following keys: - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ for c in db.origin_intrinsic_metadata_search_fulltext( conjunction, limit=limit, cur=cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer') @db_transaction_generator() def origin_intrinsic_metadata_search_by_producer( self, start=0, end=None, limit=100, ids_only=False, mappings=None, tool_ids=None, db=None, cur=None): """Returns the list of origins whose metadata contain all the terms. Args: start (int): The minimum origin id to return end (int): The maximum origin id to return limit (int): The maximum number of results to return ids_only (bool): Determines whether only origin ids are returned or the content as well mappings (List[str]): Returns origins whose intrinsic metadata were generated using at least one of these mappings. 
Yields: list: list of origin ids (int) if `ids_only=True`, else dictionaries with the following keys: - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ res = db.origin_intrinsic_metadata_search_by_producer( start, end, limit, ids_only, mappings, tool_ids, cur) if ids_only: for (origin_id,) in res: yield origin_id else: for c in res: yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @remote_api_endpoint('origin_intrinsic_metadata/stats') @db_transaction() def origin_intrinsic_metadata_stats( self, db=None, cur=None): """Returns counts of indexed metadata per origins, broken down into metadata types. Returns: dict: dictionary with keys: - total (int): total number of origins that were indexed (possibly yielding an empty metadata dictionary) - non_empty (int): total number of origins that we extracted a non-empty metadata dictionary from - per_mapping (dict): a dictionary with mapping names as keys and number of origins whose indexing used this mapping. Note that indexing a given origin may use 0, 1, or many mappings. """ mapping_names = [m for m in MAPPING_NAMES] select_parts = [] # Count rows for each mapping for mapping_name in mapping_names: select_parts.append(( "sum(case when (mappings @> ARRAY['%s']) " " then 1 else 0 end)" ) % mapping_name) # Total select_parts.append("sum(1)") # Rows whose metadata has at least one key that is not '@context' select_parts.append( "sum(case when ('{}'::jsonb @> (metadata - '@context')) " " then 0 else 1 end)") cur.execute('select ' + ', '.join(select_parts) + ' from origin_intrinsic_metadata') results = dict(zip(mapping_names + ['total', 'non_empty'], cur.fetchone())) return { 'total': results.pop('total'), 'non_empty': results.pop('non_empty'), 'per_mapping': results, } @remote_api_endpoint('indexer_configuration/add') @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to insert in the db. Dictionary with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: List of dict inserted in the db (holding the id key as well). The order of the list is not guaranteed to match the order of the initial list. """ db.mktemp_indexer_configuration(cur) db.copy_to(tools, 'tmp_indexer_configuration', ['tool_name', 'tool_version', 'tool_configuration'], cur) tools = db.indexer_configuration_add_from_temp(cur) for line in tools: yield dict(zip(db.indexer_configuration_cols, line)) @remote_api_endpoint('indexer_configuration/data') @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): """Retrieve tool information. Args: tool (dict): Dictionary representing a tool with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: The same dictionary with an `id` key, None otherwise. 
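To make the dynamically built statistics query of origin_intrinsic_metadata_stats (earlier in this file) concrete: for the five MAPPING_NAMES, the string assembly expands as sketched below; `sql` is a name introduced for the example.

    MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info']

    # One aggregate column per mapping name...
    select_parts = [
        "sum(case when (mappings @> ARRAY['%s']) then 1 else 0 end)" % name
        for name in MAPPING_NAMES
    ]
    select_parts.append("sum(1)")  # ...then the total row count...
    select_parts.append(           # ...then rows with non-empty metadata.
        "sum(case when ('{}'::jsonb @> (metadata - '@context')) "
        "then 0 else 1 end)")

    sql = ('select ' + ', '.join(select_parts)
           + ' from origin_intrinsic_metadata')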
""" tool_conf = tool['tool_configuration'] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get(tool['tool_name'], tool['tool_version'], tool_conf) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py index 7dc616d..085c8cd 100644 --- a/swh/indexer/storage/api/client.py +++ b/swh/indexer/storage/api/client.py @@ -1,20 +1,17 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.core.api import SWHRemoteAPI from swh.storage.exc import StorageAPIError from .. import IndexerStorage class RemoteStorage(SWHRemoteAPI): """Proxy to a remote storage API""" backend_class = IndexerStorage - - def __init__(self, url, timeout=None): - super().__init__( - api_exception=StorageAPIError, url=url, timeout=timeout) + api_exception = StorageAPIError diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py index e2ae3eb..5f651bf 100644 --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -1,71 +1,71 @@ from datetime import timedelta from unittest.mock import patch import pytest -from swh.objstorage.objstorage_in_memory import InMemoryObjStorage +from swh.objstorage import get_objstorage from swh.scheduler.tests.conftest import * # noqa from swh.storage.in_memory import Storage from swh.indexer.storage.in_memory import IndexerStorage from .utils import fill_storage, fill_obj_storage TASK_NAMES = ['revision_intrinsic_metadata', 'origin_intrinsic_metadata'] @pytest.fixture def indexer_scheduler(swh_scheduler): for taskname in TASK_NAMES: swh_scheduler.create_task_type({ 'type': taskname, 'description': 'The {} indexer testing task'.format(taskname), 'backend_name': 'swh.indexer.tests.tasks.{}'.format(taskname), 'default_interval': timedelta(days=1), 'min_interval': timedelta(hours=6), 'max_interval': timedelta(days=12), 'num_retries': 3, }) return swh_scheduler @pytest.fixture def idx_storage(): """An instance of swh.indexer.storage.in_memory.IndexerStorage that gets injected into all indexers classes.""" idx_storage = IndexerStorage() with patch('swh.indexer.storage.in_memory.IndexerStorage') \ as idx_storage_mock: idx_storage_mock.return_value = idx_storage yield idx_storage @pytest.fixture def storage(): """An instance of swh.storage.in_memory.Storage that gets injected into all indexers classes.""" storage = Storage() fill_storage(storage) with patch('swh.storage.in_memory.Storage') as storage_mock: storage_mock.return_value = storage yield storage @pytest.fixture def obj_storage(): """An instance of swh.objstorage.objstorage_in_memory.InMemoryObjStorage that gets injected into all indexers classes.""" - objstorage = InMemoryObjStorage() + objstorage = get_objstorage('memory', {}) fill_obj_storage(objstorage) with patch.dict('swh.objstorage._STORAGE_CLASSES', {'memory': lambda: objstorage}): yield objstorage @pytest.fixture(scope='session') def celery_includes(): return [ 'swh.indexer.tests.tasks', 'swh.indexer.tasks', ] diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index d58dd3d..d6aaf02 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -1,315 +1,315 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level 
directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from functools import reduce import re import tempfile from unittest.mock import patch from click.testing import CliRunner from swh.model.hashutil import hash_to_bytes from swh.indexer.cli import cli CLI_CONFIG = ''' scheduler: cls: foo args: {} storage: cls: memory args: {} indexer_storage: cls: memory args: {} ''' def fill_idx_storage(idx_storage, nb_rows): tools = [ { 'tool_name': 'tool %d' % i, 'tool_version': '0.0.1', 'tool_configuration': {}, } for i in range(2) ] tools = idx_storage.indexer_configuration_add(tools) origin_metadata = [ { 'id': origin_id, 'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] revision_metadata = [ { 'id': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] idx_storage.revision_intrinsic_metadata_add(revision_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool['id'] for tool in tools] def _origins_in_task_args(tasks): """Returns the set of origins contained in the arguments of the provided tasks (assumed to be of type indexer_origin_metadata).""" return reduce( set.union, (set(task['arguments']['args'][0]) for task in tasks), set() ) def _assert_tasks_for_origins(tasks, origins): expected_kwargs = {"policy_update": "update-dups", "parse_ids": False} assert {task['type'] for task in tasks} == {'indexer_origin_metadata'} assert all(len(task['arguments']['args']) == 1 for task in tasks) assert all(task['arguments']['kwargs'] == expected_kwargs for task in tasks) assert _origins_in_task_args(tasks) == set(origins) def invoke(scheduler, catch_exceptions, args): runner = CliRunner() with patch('swh.indexer.cli.get_scheduler') as get_scheduler_mock, \ tempfile.NamedTemporaryFile('a', suffix='.yml') as config_fd: config_fd.write(CLI_CONFIG) config_fd.seek(0) get_scheduler_mock.return_value = scheduler result = runner.invoke(cli, ['-C' + config_fd.name] + args) if not catch_exceptions and result.exception: print(result.output) raise result.exception return result def test_mapping_list(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list', ]) expected_output = '\n'.join([ 'codemeta', 'gemspec', 'maven', 'npm', 'pkg-info', '', ]) assert result.exit_code == 0, result.output assert result.output == expected_output def test_mapping_list_terms(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert re.search( r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) def test_mapping_list_terms_exclude(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', '--exclude-mapping', 'codemeta' ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert not re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert not re.search( 
r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_empty_db( indexer_scheduler, idx_storage, storage): result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) expected_output = ( 'Nothing to do (no origin metadata matched the criteria).\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_divisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 9 _assert_tasks_for_origins(tasks, range(90)) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_dry_run( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', '--dry-run', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_nondivisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when neither origin_batch_size or task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 70) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--batch-size', '20', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (60 origins).\n' 'Scheduled 4 tasks (70 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 4 _assert_tasks_for_origins(tasks, range(70)) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_mapping( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', ]) # Check the output expected_output = ( 'Scheduled 2 tasks (11 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert 
len(tasks) == 2 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101]) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_two_mappings( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', '--mapping', 'mapping2', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (22 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 3 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102]) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_tool( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" tool_ids = fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--tool-id', str(tool_ids[0]), ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (55 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 6 _assert_tasks_for_origins( tasks, [x*2 for x in range(55)]) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 2b8d651..e20e1e9 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,1209 +1,1209 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import unittest from hypothesis import given, strategies, settings, HealthCheck import xmltodict from swh.model.hashutil import hash_to_bytes -from swh.indexer.codemeta import CODEMETA_TERMS -from swh.indexer.metadata_dictionary import ( - CROSSWALK_TABLE, MAPPINGS, merge_values) +from swh.indexer.codemeta import CODEMETA_TERMS, CROSSWALK_TABLE +from swh.indexer.metadata_dictionary import MAPPINGS +from swh.indexer.metadata_dictionary.base import merge_values from swh.indexer.metadata_detector import ( detect_metadata, extract_minimal_metadata_dict ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer ) from .utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage, YARN_PARSER_METADATA, json_document_strategy ) TRANSLATOR_TOOL = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, 'should not be called; the rev indexer configures it.' 
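The metadata tests that follow all drive the mappings through their translate() entry point: pick a mapping class from MAPPINGS by name, instantiate it, and hand it raw file bytes. A minimal sketch of that call pattern (the package.json input is invented; the output shape follows the npm test fixtures below):

    from swh.indexer.metadata_dictionary import MAPPINGS

    npm = MAPPINGS['NpmMapping']()
    doc = npm.translate(b'{"name": "foo", "license": "MIT"}')
    # -> a compacted codemeta document along the lines of:
    # {'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
    #  'type': 'SoftwareSourceCode',
    #  'name': 'foo',
    #  'license': 'https://spdx.org/licenses/MIT'}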
REVISION_METADATA_CONFIG = { **BASE_TEST_CONFIG, 'tools': TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.npm_mapping = MAPPINGS['NpmMapping']() self.codemeta_mapping = MAPPINGS['CodemetaMapping']() self.maven_mapping = MAPPINGS['MavenMapping']() self.pkginfo_mapping = MAPPINGS['PythonPkginfoMapping']() self.gemspec_mapping = MAPPINGS['GemspecMapping']() def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { 'repository': 'http://schema.org/codeRepository', 'os': 'http://schema.org/operatingSystem', 'cpu': 'http://schema.org/processorRequirements', 'engines': 'http://schema.org/processorRequirements', 'author': 'http://schema.org/author', 'author.email': 'http://schema.org/email', 'author.name': 'http://schema.org/name', 'contributor': 'http://schema.org/contributor', 'keywords': 'http://schema.org/keywords', 'license': 'http://schema.org/license', 'version': 'http://schema.org/version', 'description': 'http://schema.org/description', 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', 'homepage': 'http://schema.org/url' }) def test_merge_values(self): self.assertEqual( merge_values('a', 'b'), ['a', 'b']) self.assertEqual( merge_values(['a', 'b'], 'c'), ['a', 'b', 'c']) self.assertEqual( merge_values('a', ['b', 'c']), ['a', 'b', 'c']) self.assertEqual( merge_values({'@list': ['a']}, {'@list': ['b']}), {'@list': ['a', 'b']}) self.assertEqual( merge_values({'@list': ['a', 'b']}, {'@list': ['c']}), {'@list': ['a', 'b', 'c']}) with self.assertRaises(ValueError): merge_values({'@list': ['a']}, 'b') with self.assertRaises(ValueError): merge_values('a', {'@list': ['b']}) with self.assertRaises(ValueError): merge_values({'@list': ['a']}, ['b']) with self.assertRaises(ValueError): merge_values(['a'], {'@list': ['b']}) self.assertEqual( merge_values('a', None), 'a') self.assertEqual( merge_values(['a', 'b'], None), ['a', 'b']) self.assertEqual( merge_values(None, ['b', 'c']), ['b', 'c']) self.assertEqual( merge_values({'@list': ['a']}, None), {'@list': ['a']}) self.assertEqual( merge_values(None, {'@list': ['a']}), {'@list': ['a']}) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'author': [{ 'type': 'Person', 'name': 'Morane G', 'email': 'moranegg@example.com', }], } # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given 
metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', 'author': 'moranegg', }] # when results = extract_minimal_metadata_dict(metadata_list) # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', "version": '0.0.2', "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], "author": ['moranegg'], "codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config['tools'] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = list(metadata_indexer.idx_storage.content_metadata_get( sha1s)) expected_results = [{ 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), }, { 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/npm/npm/issues', 'author': [{ 'type': 'Person', 'name': 'Isaac Z. 
Schlueter', 'email': 'i@izs.me', 'url': 'http://blog.izs.me', }], 'codeRepository': 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', 'license': 'https://spdx.org/licenses/Artistic-2.0', 'version': '5.0.3', 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') }] for result in results: del result['tool'] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', }) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = self.npm_mapping.translate(package_json) expected_result = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://gitlab.com/user/repo.git', 'type': 'SoftwareSourceCode', }) def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 
'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = ( b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation" }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": [ "metadata", "software" ], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_codemeta_alternate_context(self): 
raw_content = ( b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'codeRepository': 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( - 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error parsing XML from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_unknown_encoding(self): expected_warning = ( - 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error detecting XML encoding from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_invalid_encoding(self): expected_warning = ( - 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error unidecoding XML from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def 
    def test_compute_metadata_maven_minimal(self):
        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

    def test_compute_metadata_maven_empty_nodes(self):
        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
          <repositories>
          </repositories>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version></version>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

        raw_content = b"""
        <project>
          <name></name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
          <licenses>
          </licenses>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

        raw_content = b"""
        <project>
          <version>1.2.3</version>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'version': '1.2.3',
        })

    def test_compute_metadata_maven_invalid_licenses(self):
        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
          <licenses>
            foo
          </licenses>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

    def test_compute_metadata_maven_multiple(self):
        '''Tests when there are multiple code repos and licenses.'''
        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
          <repositories>
            <repository>
              <id>central</id>
              <name>Maven Repository Switchboard</name>
              <layout>default</layout>
              <url>http://repo1.maven.org/maven2</url>
              <snapshots>
                <enabled>false</enabled>
              </snapshots>
            </repository>
            <repository>
              <id>example</id>
              <name>Example Maven Repo</name>
              <layout>default</layout>
              <url>http://example.org/maven2</url>
            </repository>
          </repositories>
          <licenses>
            <license>
              <name>Apache License, Version 2.0</name>
              <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
              <distribution>repo</distribution>
              <comments>A business-friendly OSS license</comments>
            </license>
            <license>
              <name>MIT license</name>
              <url>https://opensource.org/licenses/MIT</url>
            </license>
          </licenses>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier':
                'com.mycompany.app',
            'version': '1.2.3',
            'license': [
                'https://www.apache.org/licenses/LICENSE-2.0.txt',
                'https://opensource.org/licenses/MIT',
            ],
            'codeRepository': [
                'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
                'http://example.org/maven2/com/mycompany/app/my-app',
            ]
        })

    def test_compute_metadata_pkginfo(self):
        raw_content = (b"""\
Metadata-Version: 2.1
Name: swh.core
Version: 0.0.49
Summary: Software Heritage core utilities
Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
Description: swh-core
        ========
       \x20
        core library for swh's modules:
        - config parser
        - hash computations
        - serialization
        - logging mechanism
       \x20
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
""")  # noqa
        result = self.pkginfo_mapping.translate(raw_content)
        self.assertCountEqual(result['description'], [
            'Software Heritage core utilities',  # note the comma here
            'swh-core\n'
            '========\n'
            '\n'
            "core library for swh's modules:\n"
            '- config parser\n'
            '- hash computations\n'
            '- serialization\n'
            '- logging mechanism\n'
            ''],
            result)
        del result['description']
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'url': 'https://forge.softwareheritage.org/diffusion/DCORE/',
            'name': 'swh.core',
            'author': [{
                'type': 'Person',
                'name': 'Software Heritage developers',
                'email': 'swh-devel@inria.fr',
            }],
            'version': '0.0.49',
        })

    def test_compute_metadata_pkginfo_utf8(self):
        raw_content = (b'''\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
        Hydrology N\xc2\xb083
''')  # noqa
        result = self.pkginfo_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'snowpyt',
            'description': 'foo\nHydrology N°83',
        })

    def test_compute_metadata_pkginfo_keywords(self):
        raw_content = (b"""\
Metadata-Version: 2.1
Name: foo
Keywords: foo bar baz
""")  # noqa
        result = self.pkginfo_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'foo',
            'keywords': ['foo', 'bar', 'baz'],
        })

    def test_compute_metadata_pkginfo_license(self):
        raw_content = (b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
""")  # noqa
        result = self.pkginfo_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'foo',
            'license': 'MIT',
        })

    def test_gemspec_base(self):
        raw_content = b"""
Gem::Specification.new do |s|
  s.name = 'example'
  s.version = '0.1.0'
  s.licenses = ['MIT']
  s.summary = "This is an example!"
  s.description = "Much longer explanation of the example!"
  s.authors = ["Ruby Coder"]
  s.email = 'rubycoder@example.com'
  s.files = ["lib/example.rb"]
  s.homepage = 'https://rubygems.org/gems/example'
  s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertCountEqual(result.pop('description'), [
            "This is an example!",
            "Much longer explanation of the example!"
        ])
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'author': ['Ruby Coder'],
            'name': 'example',
            'license': 'https://spdx.org/licenses/MIT',
            'codeRepository': 'https://rubygems.org/gems/example',
            'email': 'rubycoder@example.com',
            'version': '0.1.0',
        })

    def test_gemspec_two_author_fields(self):
        raw_content = b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1"]
  s.author = "Ruby Coder2"
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertCountEqual(result.pop('author'), [
            'Ruby Coder1', 'Ruby Coder2'])
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
        })

    def test_gemspec_invalid_author(self):
        raw_content = b"""
Gem::Specification.new do |s|
  s.author = ["Ruby Coder"]
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
        })

        raw_content = b"""
Gem::Specification.new do |s|
  s.author = "Ruby Coder1",
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
        })

        raw_content = b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'author': ['Ruby Coder1'],
        })

    def test_gemspec_alternative_header(self):
        raw_content = b"""
require './lib/version'

Gem::Specification.new { |s|
  s.name = 'rb-system-with-aliases'
  s.summary = 'execute system commands with aliases'
}
"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'rb-system-with-aliases',
            'description': 'execute system commands with aliases',
        })

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(json_document_strategy(
        keys=list(MAPPINGS['NpmMapping'].mapping)))
    def test_npm_adversarial(self, doc):
        raw = json.dumps(doc).encode()
        self.npm_mapping.translate(raw)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(json_document_strategy(keys=CODEMETA_TERMS))
    def test_codemeta_adversarial(self, doc):
        raw = json.dumps(doc).encode()
        self.codemeta_mapping.translate(raw)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(json_document_strategy(
        keys=list(MAPPINGS['MavenMapping'].mapping)))
    def test_maven_adversarial(self, doc):
        raw = xmltodict.unparse({'project': doc}, pretty=True)
        self.maven_mapping.translate(raw)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(strategies.dictionaries(
        # keys
        strategies.one_of(
            strategies.text(),
            *map(strategies.just, MAPPINGS['GemspecMapping'].mapping)
        ),
        # values
        strategies.recursive(
            strategies.characters(),
            lambda children: strategies.lists(children, 1)
        )
    ))
    def test_gemspec_adversarial(self, doc):
        parts = [b'Gem::Specification.new do |s|\n']
        for (k, v) in doc.items():
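            # emit one "s.<key> = <value>" assignment line per generated item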
            parts.append(' s.{} = {}\n'.format(k, repr(v)).encode())
        parts.append(b'end\n')
        self.gemspec_mapping.translate(b''.join(parts))

    def test_revision_metadata_indexer(self):
        metadata_indexer = RevisionMetadataIndexer(
            config=REVISION_METADATA_CONFIG)
        fill_obj_storage(metadata_indexer.objstorage)
        fill_storage(metadata_indexer.storage)

        tool = metadata_indexer.idx_storage.indexer_configuration_get(
            {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()})
        assert tool is not None

        metadata_indexer.idx_storage.content_metadata_add([{
            'indexer_configuration_id': tool['id'],
            'id': b'cde',
            'metadata': YARN_PARSER_METADATA,
        }])

        sha1_gits = [
            hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
        ]
        metadata_indexer.run(sha1_gits, 'update-dups')

        results = list(
            metadata_indexer.idx_storage.
            revision_intrinsic_metadata_get(sha1_gits))

        expected_results = [{
            'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
            'tool': TRANSLATOR_TOOL,
            'metadata': YARN_PARSER_METADATA,
            'mappings': ['npm'],
        }]

        for result in results:
            del result['tool']['id']

        # then
        self.assertEqual(expected_results, results)

    def test_revision_metadata_indexer_single_root_dir(self):
        metadata_indexer = RevisionMetadataIndexer(
            config=REVISION_METADATA_CONFIG)
        fill_obj_storage(metadata_indexer.objstorage)
        fill_storage(metadata_indexer.storage)

        # Add a parent directory, that is the only directory at the root
        # of the revision
        rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
        subdir_id = metadata_indexer.storage._revisions[rev_id]['directory']
        metadata_indexer.storage._revisions[rev_id]['directory'] = b'123456'
        metadata_indexer.storage.directory_add([{
            'id': b'123456',
            'entries': [{
                'target': subdir_id,
                'type': 'dir',
                'length': None,
                'name': b'foobar-1.0.0',
                'sha1': None,
                'perms': 16384,
                'sha1_git': None,
                'status': None,
                'sha256': None
            }],
        }])

        tool = metadata_indexer.idx_storage.indexer_configuration_get(
            {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()})
        assert tool is not None

        metadata_indexer.idx_storage.content_metadata_add([{
            'indexer_configuration_id': tool['id'],
            'id': b'cde',
            'metadata': YARN_PARSER_METADATA,
        }])

        sha1_gits = [
            hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
        ]
        metadata_indexer.run(sha1_gits, 'update-dups')

        results = list(
            metadata_indexer.idx_storage.
            revision_intrinsic_metadata_get(sha1_gits))

        expected_results = [{
            'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
            'tool': TRANSLATOR_TOOL,
            'metadata': YARN_PARSER_METADATA,
            'mappings': ['npm'],
        }]

        for result in results:
            del result['tool']['id']

        # then
        self.assertEqual(expected_results, results)

diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 3971fdb..a5be367 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,217 +1,217 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from unittest.mock import patch

from swh.model.hashutil import hash_to_bytes

from swh.indexer.metadata import OriginMetadataIndexer

from .utils import YARN_PARSER_METADATA
from .test_metadata import REVISION_METADATA_CONFIG


def test_origin_metadata_indexer(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(["git+https://github.com/librariesio/yarn-parser"])

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    rev_metadata = {
        'id': rev_id,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }
    origin_metadata = {
        'id': origin['id'],
        'from_revision': rev_id,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    for result in results:
        del result['tool']
    assert results == [rev_metadata]

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    for result in results:
        del result['tool']
    assert results == [origin_metadata]


def test_origin_metadata_indexer_duplicate_origin(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.storage = storage
    indexer.idx_storage = idx_storage
    indexer.run(["git+https://github.com/librariesio/yarn-parser"])
    indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2)

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert len(results) == 1

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert len(results) == 1


def test_origin_metadata_indexer_missing_head(
        idx_storage, storage, obj_storage):
    storage.origin_add([{
        'type': 'git',
        'url': 'https://example.com'
    }])

    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(["git+https://example.com"])

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://example.com'})

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert results == []


def test_origin_metadata_indexer_partial_missing_head(
        idx_storage, storage, obj_storage):
    storage.origin_add([{
        'type': 'git',
        'url': 'https://example.com'
    }])

    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(["git+https://example.com",
                 "git+https://github.com/librariesio/yarn-parser"])

    origin1 = storage.origin_get({
        'type': 'git',
        'url': 'https://example.com'})
    origin2 = storage.origin_get({
        'type': 'git',
        'url':
            'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    rev_metadata = {
        'id': rev_id,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }
    origin_metadata = {
        'id': origin2['id'],
        'from_revision': rev_id,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    for result in results:
        del result['tool']
    assert results == [rev_metadata]

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin1['id'], origin2['id']]))
    for result in results:
        del result['tool']
    assert results == [origin_metadata]


def test_origin_metadata_indexer_duplicate_revision(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.storage = storage
    indexer.idx_storage = idx_storage
    indexer.run(["git+https://github.com/librariesio/yarn-parser",
                 "git+https://github.com/librariesio/yarn-parser.git"])

    origin1 = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    origin2 = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser.git'})
    assert origin1['id'] != origin2['id']
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert len(results) == 1

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin1['id'], origin2['id']]))
    assert len(results) == 2


def test_origin_metadata_indexer_no_metadata(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
-    with patch('swh.indexer.metadata_dictionary.NpmMapping.filename',
+    with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename',
               b'foo.json'):
        indexer.run(["git+https://github.com/librariesio/yarn-parser"])

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert results == []

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert results == []


def test_origin_metadata_indexer_delete_metadata(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(["git+https://github.com/librariesio/yarn-parser"])

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert results != []

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert results != []

-    with patch('swh.indexer.metadata_dictionary.NpmMapping.filename',
+    with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename',
               b'foo.json'):
        indexer.run(["git+https://github.com/librariesio/yarn-parser"])

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert results == []

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert results == []

diff --git a/version.txt b/version.txt
index 0f33698..94717a4 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-v0.0.145-0-g645f08e
\ No newline at end of file
+v0.0.146-0-g669998e
\ No newline at end of file