diff --git a/PKG-INFO b/PKG-INFO index 2ce3fdd..a126486 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.152 +Version: 0.0.153 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive a batch of ids - retrieve the associated data depending on the object type - compute an index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detects the encoding and mimetype - language (queue swh_indexer_content_language): detects the programming language - ctags (queue swh_indexer_content_ctags): computes tags information - fossology-license (queue swh_indexer_fossology_license): computes the license - metadata: translates a file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata, then retrieves their translated_metadata from the content_metadata table in storage or runs the content indexer to translate the files. 
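To make the indexation procedure described above concrete, here is a minimal sketch of a content indexer run. It is illustrative only, under assumed names: run_content_indexer, compute and the tool dict are hypothetical stand-ins rather than the actual ContentIndexer API from swh/indexer/indexer.py; only objstorage.get and idx_storage.content_metadata_add mirror calls that appear elsewhere in this patch.

    # Illustrative sketch only (hypothetical helper, not the real ContentIndexer).
    def run_content_indexer(idx_storage, objstorage, tool, compute, sha1_batch):
        """Receive a batch of content ids, fetch the raw data, compute an
        index for each object, and store the results in the indexer storage."""
        results = []
        for sha1 in sha1_batch:
            data = objstorage.get(sha1)   # retrieve the associated raw content
            metadata = compute(data)      # compute the index for that object
            if metadata is not None:
                results.append({
                    'id': sha1,
                    'metadata': metadata,
                    'indexer_configuration_id': tool['id'],
                })
        # store the results, skipping objects that are already indexed
        idx_storage.content_metadata_add(results, conflict_update=False)

In the real indexers this loop is split across filter(), index() and persist_index_computations(), as can be seen in the RevisionMetadataIndexer and OriginMetadataIndexer changes further down in this diff.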
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements-swh.txt b/requirements-swh.txt index 6f6371e..0ca12b4 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,6 +1,6 @@ -swh.core[db,http] >= 0.0.61 +swh.core[db,http] >= 0.0.65 swh.model >= 0.0.15 swh.objstorage >= 0.0.28 swh.scheduler >= 0.0.47 swh.storage >= 0.0.143 swh.journal >= 0.0.11 diff --git a/sql/upgrades/126.sql b/sql/upgrades/126.sql new file mode 100644 index 0000000..89d63c2 --- /dev/null +++ b/sql/upgrades/126.sql @@ -0,0 +1,42 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 125 +-- to_version: 126 +-- description: Make swh_origin_intrinsic_metadata_add update all fields + +insert into dbversion(version, release, description) +values(126, now(), 'Work In Progress'); + + +create or replace function swh_origin_intrinsic_metadata_add( + conflict_update boolean) + returns void + language plpgsql +as $$ +begin + perform swh_origin_intrinsic_metadata_compute_tsvector(); + if conflict_update then + insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, origin_url, metadata, indexer_configuration_id, from_revision, + metadata_tsvector, mappings + from tmp_origin_intrinsic_metadata + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + origin_url = excluded.origin_url, + from_revision = excluded.from_revision; + + else + insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, origin_url, metadata, indexer_configuration_id, from_revision, + metadata_tsvector, mappings + from tmp_origin_intrinsic_metadata + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index 2ce3fdd..a126486 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.152 +Version: 0.0.153 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers 
Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index ae60662..7640b31 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,92 +1,93 @@ MANIFEST.in Makefile README.md requirements-swh.txt requirements.txt setup.py version.txt sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql sql/upgrades/124.sql sql/upgrades/125.sql +sql/upgrades/126.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/metadata_dictionary/__init__.py swh/indexer/metadata_dictionary/base.py swh/indexer/metadata_dictionary/codemeta.py swh/indexer/metadata_dictionary/maven.py swh/indexer/metadata_dictionary/npm.py swh/indexer/metadata_dictionary/python.py swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-swh-init.sql swh/indexer/sql/20-swh-enums.sql swh/indexer/sql/30-swh-schema.sql swh/indexer/sql/40-swh-func.sql swh/indexer/sql/50-swh-data.sql swh/indexer/sql/60-swh-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/in_memory.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py swh/indexer/storage/api/wsgi.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py 
swh/indexer/tests/tasks.py swh/indexer/tests/test_cli.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_journal_client.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index dded71d..2880880 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,16 +1,16 @@ vcversioner click file-magic pyld xmltodict -swh.core[db,http]>=0.0.61 +swh.core[db,http]>=0.0.65 swh.model>=0.0.15 swh.objstorage>=0.0.28 swh.scheduler>=0.0.47 swh.storage>=0.0.143 swh.journal>=0.0.11 [testing] pytest<4 pytest-postgresql hypothesis>=3.11.0 diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py index 7cc316f..d02cdf2 100644 --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -1,129 +1,147 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import collections import csv +import itertools import json import os.path import re import swh.indexer from pyld import jsonld _DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), 'data') CROSSWALK_TABLE_PATH = os.path.join(_DATA_DIR, 'codemeta', 'crosswalk.csv') CODEMETA_CONTEXT_PATH = os.path.join(_DATA_DIR, 'codemeta', 'codemeta.jsonld') with open(CODEMETA_CONTEXT_PATH) as fd: CODEMETA_CONTEXT = json.load(fd) CODEMETA_CONTEXT_URL = 'https://doi.org/10.5063/schema/codemeta-2.0' CODEMETA_ALTERNATE_CONTEXT_URLS = { ('https://raw.githubusercontent.com/codemeta/codemeta/' 'master/codemeta.jsonld') } CODEMETA_URI = 'https://codemeta.github.io/terms/' SCHEMA_URI = 'http://schema.org/' PROPERTY_BLACKLIST = { # CodeMeta properties that we cannot properly represent. 
SCHEMA_URI + 'softwareRequirements', CODEMETA_URI + 'softwareSuggestions', # Duplicate of 'author' SCHEMA_URI + 'creator', } _codemeta_field_separator = re.compile(r'\s*[,/]\s*') def make_absolute_uri(local_name): definition = CODEMETA_CONTEXT['@context'][local_name] if isinstance(definition, str): return definition elif isinstance(definition, dict): prefixed_name = definition['@id'] (prefix, local_name) = prefixed_name.split(':') if prefix == 'schema': canonical_name = SCHEMA_URI + local_name elif prefix == 'codemeta': canonical_name = CODEMETA_URI + local_name else: assert False, prefix return canonical_name else: assert False, definition def _read_crosstable(fd): reader = csv.reader(fd) try: header = next(reader) except StopIteration: raise ValueError('empty file') data_sources = set(header) - {'Parent Type', 'Property', 'Type', 'Description'} assert 'codemeta-V1' in data_sources codemeta_translation = {data_source: {} for data_source in data_sources} terms = set() for line in reader: # For each canonical name local_name = dict(zip(header, line))['Property'] if not local_name: continue canonical_name = make_absolute_uri(local_name) if canonical_name in PROPERTY_BLACKLIST: continue terms.add(canonical_name) for (col, value) in zip(header, line): # For each cell in the row if col in data_sources: # If that's not the parentType/property/type/description for local_name in _codemeta_field_separator.split(value): # For each of the data source's properties that maps # to this canonical name if local_name.strip(): codemeta_translation[col][local_name.strip()] = \ canonical_name return (terms, codemeta_translation) with open(CROSSWALK_TABLE_PATH) as fd: (CODEMETA_TERMS, CROSSWALK_TABLE) = _read_crosstable(fd) def _document_loader(url): """Document loader for pyld. Reads the local codemeta.jsonld file instead of fetching it from the Internet every single time.""" if url == CODEMETA_CONTEXT_URL or url in CODEMETA_ALTERNATE_CONTEXT_URLS: return { 'contextUrl': None, 'documentUrl': url, 'document': CODEMETA_CONTEXT, } elif url == CODEMETA_URI: raise Exception('{} is CodeMeta\'s URI, use {} as context url'.format( CODEMETA_URI, CODEMETA_CONTEXT_URL)) else: raise Exception(url) def compact(doc): """Same as `pyld.jsonld.compact`, but in the context of CodeMeta.""" return jsonld.compact(doc, CODEMETA_CONTEXT_URL, options={'documentLoader': _document_loader}) def expand(doc): """Same as `pyld.jsonld.expand`, but in the context of CodeMeta.""" return jsonld.expand(doc, options={'documentLoader': _document_loader}) + + +def merge_documents(documents): + """Takes a list of metadata dicts, each generated from a different + metadata file, and merges them. 
+ + Removes duplicates, if any.""" + documents = list(itertools.chain.from_iterable(map(expand, documents))) + merged_document = collections.defaultdict(list) + for document in documents: + for (key, values) in document.items(): + for value in values: + if value not in merged_document[key]: + merged_document[key].append(value) + + return compact(merged_document) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 83331f2..ecb097a 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,348 +1,356 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy from swh.core.utils import grouper +from swh.indexer.codemeta import merge_documents from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata -from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil REVISION_GET_BATCH_SIZE = 10 +ORIGIN_GET_BATCH_SIZE = 10 + + +def call_with_batches(f, args, batch_size): + """Calls a function with batches of args, and concatenates the results. + """ + groups = grouper(args, batch_size) + for group in groups: + yield from f(list(group)) class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data, log_suffix='unknown revision'): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'metadata': None } try: mapping_name = self.tool['tool_configuration']['context'] log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id) result['metadata'] = \ MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id)) if result['metadata'] is None: return None return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_intrinsic_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { }, }), } def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_intrinsic_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (dict): revision artifact from storage Returns: dict: dictionary representing a revision_intrinsic_metadata, with keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'mappings': None, 'metadata': None } try: root_dir = rev['directory'] dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) if [entry['type'] for entry in dir_ls] == ['dir']: # If the root is just a single directory, recurse into it # eg. PyPI packages, GNU tarballs subdir = dir_ls[0]['target'] dir_ls = self.storage.directory_ls(subdir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) (mappings, metadata) = self.translate_revision_intrinsic_metadata( detected_files, log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id'])) result['mappings'] = mappings result['metadata'] = metadata except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in # revision_intrinsic_metadata self.idx_storage.revision_intrinsic_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_intrinsic_metadata( self, detected_files, log_suffix): """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ used_mappings = [MAPPINGS[context].name for context in detected_files] metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] } config['tools'] = [tool] for context in detected_files.keys(): cfg = deepcopy(config) cfg['tools'][0]['configuration']['context'] = context c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: # extracting metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['metadata'] # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # content indexing try: c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups', log_suffix=log_suffix) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result['metadata'] metadata.append(local_metadata) except Exception: self.log.exception( "Exception while indexing metadata on contents") - # transform metadata into min set with swh-metadata-detector - min_metadata = extract_minimal_metadata_dict(metadata) - return (used_mappings, min_metadata) + metadata = merge_documents(metadata) + return (used_mappings, metadata) class OriginMetadataIndexer(OriginIndexer): ADDITIONAL_CONFIG = RevisionMetadataIndexer.ADDITIONAL_CONFIG USE_TOOLS = False def __init__(self, config=None, **kwargs): super().__init__(config=config, **kwargs) self.origin_head_indexer = OriginHeadIndexer(config=config) self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) def index_list(self, origin_urls): head_rev_ids = [] origins_with_head = [] - origins = self.storage.origin_get( - [{'url': url} for url in origin_urls]) + origins = list(call_with_batches( + self.storage.origin_get, + [{'url': url} for url in origin_urls], ORIGIN_GET_BATCH_SIZE)) for origin in origins: head_result = self.origin_head_indexer.index(origin['url']) if head_result: head_result['origin_id'] = origin['id'] origins_with_head.append(origin) head_rev_ids.append(head_result['revision_id']) - head_revs = [] - groups = grouper(head_rev_ids, REVISION_GET_BATCH_SIZE) - for group in groups: - head_revs.extend(self.storage.revision_get(group)) + head_revs = list(call_with_batches( + self.storage.revision_get, 
+ head_rev_ids, REVISION_GET_BATCH_SIZE)) assert len(head_revs) == len(head_rev_ids) results = [] for (origin, rev) in zip(origins_with_head, head_revs): if not rev: self.log.warning('Missing head revision of origin %r', origin['url']) continue rev_metadata = self.revision_metadata_indexer.index(rev) orig_metadata = { 'from_revision': rev_metadata['id'], 'id': origin['id'], 'origin_url': origin['url'], 'metadata': rev_metadata['metadata'], 'mappings': rev_metadata['mappings'], 'indexer_configuration_id': rev_metadata['indexer_configuration_id'], } results.append((orig_metadata, rev_metadata)) return results def persist_index_computations(self, results, policy_update): conflict_update = (policy_update == 'update-dups') # Deduplicate revisions rev_metadata = [] orig_metadata = [] revs_to_delete = [] origs_to_delete = [] for (orig_item, rev_item) in results: assert rev_item['metadata'] == orig_item['metadata'] if not rev_item['metadata'] or \ rev_item['metadata'].keys() <= {'@context'}: # If we didn't find any metadata, don't store a DB record # (and delete existing ones, if any) if rev_item not in revs_to_delete: revs_to_delete.append(rev_item) if orig_item not in origs_to_delete: origs_to_delete.append(orig_item) else: if rev_item not in rev_metadata: rev_metadata.append(rev_item) if orig_item not in orig_metadata: orig_metadata.append(orig_item) if rev_metadata: self.idx_storage.revision_intrinsic_metadata_add( rev_metadata, conflict_update=conflict_update) if orig_metadata: self.idx_storage.origin_intrinsic_metadata_add( orig_metadata, conflict_update=conflict_update) # revs_to_delete should always be empty unless we changed a mapping # to detect less files or less content. # However, origs_to_delete may be empty whenever an upstream deletes # a metadata file. if origs_to_delete: self.idx_storage.origin_intrinsic_metadata_delete(origs_to_delete) if revs_to_delete: self.idx_storage.revision_intrinsic_metadata_delete(revs_to_delete) diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py index fb7fc3f..b8e99b5 100644 --- a/swh/indexer/metadata_detector.py +++ b/swh/indexer/metadata_detector.py @@ -1,62 +1,24 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.indexer.codemeta import compact, expand -from swh.indexer.codemeta import make_absolute_uri from swh.indexer.metadata_dictionary import MAPPINGS def detect_metadata(files): """ Detects files potentially containing metadata Args: file_entries (list): list of files Returns: dict: {mapping_filenames[name]:f['sha1']} (may be empty) """ results = {} for (mapping_name, mapping) in MAPPINGS.items(): matches = mapping.detect_metadata_files(files) if matches: results[mapping_name] = matches return results - - -_MINIMAL_PROPERTY_SET = { - "developmentStatus", "version", "operatingSystem", "description", - "keywords", "issueTracker", "name", "author", "relatedLink", - "url", "license", "maintainer", "email", "identifier", - "codeRepository"} - -MINIMAL_METADATA_SET = {make_absolute_uri(prop) - for prop in _MINIMAL_PROPERTY_SET} - - -def extract_minimal_metadata_dict(metadata_list): - """ - Every item in the metadata_list is a dict of translated_metadata in the - CodeMeta vocabulary. - - We wish to extract a minimal set of terms and keep all values corresponding - to this term without duplication. 
- - Args: - metadata_list (list): list of dicts of translated_metadata - - Returns: - dict: minimal_dict; dict with selected values of metadata - """ - minimal_dict = {} - for document in metadata_list: - for metadata_item in expand(document): - for (term, value) in metadata_item.items(): - if term in MINIMAL_METADATA_SET: - if term not in minimal_dict: - minimal_dict[term] = [value] - elif value not in minimal_dict[term]: - minimal_dict[term].append(value) - return compact(minimal_dict) diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py index 659fe77..9ac73eb 100644 --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -1,156 +1,158 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI from .base import JsonMapping class NpmMapping(JsonMapping): """ dedicated class for NPM (package.json) mapping and translation """ name = 'npm' mapping = CROSSWALK_TABLE['NodeJS'] filename = b'package.json' string_fields = ['name', 'version', 'homepage', 'description', 'email'] _schema_shortcuts = { 'github': 'git+https://github.com/%s.git', 'gist': 'git+https://gist.github.com/%s.git', 'gitlab': 'git+https://gitlab.com/%s.git', # Bitbucket supports both hg and git, and the shortcut does not # tell which one to use. # 'bitbucket': 'https://bitbucket.org/', } def normalize_repository(self, d): """https://docs.npmjs.com/files/package.json#repository >>> NpmMapping().normalize_repository({ ... 'type': 'git', ... 'url': 'https://example.org/foo.git' ... }) {'@id': 'git+https://example.org/foo.git'} >>> NpmMapping().normalize_repository( ... 'gitlab:foo/bar') {'@id': 'git+https://gitlab.com/foo/bar.git'} >>> NpmMapping().normalize_repository( ... 'foo/bar') {'@id': 'git+https://github.com/foo/bar.git'} """ if isinstance(d, dict) and isinstance(d.get('type'), str) \ and isinstance(d.get('url'), str): url = '{type}+{url}'.format(**d) elif isinstance(d, str): if '://' in d: url = d elif ':' in d: (schema, rest) = d.split(':', 1) if schema in self._schema_shortcuts: url = self._schema_shortcuts[schema] % rest else: return None else: url = self._schema_shortcuts['github'] % d else: return None return {'@id': url} def normalize_bugs(self, d): """https://docs.npmjs.com/files/package.json#bugs >>> NpmMapping().normalize_bugs({ ... 'url': 'https://example.org/bugs/', ... 'email': 'bugs@example.org' ... }) {'@id': 'https://example.org/bugs/'} >>> NpmMapping().normalize_bugs( ... 'https://example.org/bugs/') {'@id': 'https://example.org/bugs/'} """ if isinstance(d, dict) and isinstance(d.get('url'), str): return {'@id': d['url']} elif isinstance(d, str): return {'@id': d} else: return None _parse_author = re.compile(r'^ *' r'(?P.*?)' r'( +<(?P.*)>)?' r'( +\((?P.*)\))?' r' *$') def normalize_author(self, d): """https://docs.npmjs.com/files/package.json#people-fields-author-contributors' >>> from pprint import pprint >>> pprint(NpmMapping().normalize_author({ ... 'name': 'John Doe', ... 'email': 'john.doe@example.org', ... 'url': 'https://example.org/~john.doe', ... 
})) {'@list': [{'@type': 'http://schema.org/Person', 'http://schema.org/email': 'john.doe@example.org', 'http://schema.org/name': 'John Doe', 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} >>> pprint(NpmMapping().normalize_author( ... 'John Doe (https://example.org/~john.doe)' ... )) {'@list': [{'@type': 'http://schema.org/Person', 'http://schema.org/email': 'john.doe@example.org', 'http://schema.org/name': 'John Doe', 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} """ # noqa author = {'@type': SCHEMA_URI+'Person'} if isinstance(d, dict): name = d.get('name', None) email = d.get('email', None) url = d.get('url', None) elif isinstance(d, str): match = self._parse_author.match(d) + if not match: + return None name = match.group('name') email = match.group('email') url = match.group('url') else: return None if name and isinstance(name, str): author[SCHEMA_URI+'name'] = name if email and isinstance(email, str): author[SCHEMA_URI+'email'] = email if url and isinstance(url, str): author[SCHEMA_URI+'url'] = {'@id': url} return {"@list": [author]} def normalize_license(self, s): """https://docs.npmjs.com/files/package.json#license >>> NpmMapping().normalize_license('MIT') {'@id': 'https://spdx.org/licenses/MIT'} """ if isinstance(s, str): return {"@id": "https://spdx.org/licenses/" + s} def normalize_homepage(self, s): """https://docs.npmjs.com/files/package.json#homepage >>> NpmMapping().normalize_homepage('https://example.org/~john.doe') {'@id': 'https://example.org/~john.doe'} """ if isinstance(s, str): return {"@id": s} def normalize_keywords(self, l): """https://docs.npmjs.com/files/package.json#homepage >>> NpmMapping().normalize_keywords(['foo', 'bar']) ['foo', 'bar'] """ if isinstance(l, list): return [x for x in l if isinstance(x, str)] diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql index 54bce54..af05499 100644 --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -1,146 +1,146 @@ --- --- Software Heritage Indexers Data Model --- -- drop schema if exists swh cascade; -- create schema swh; -- set search_path to swh; create table dbversion ( version int primary key, release timestamptz, description text ); insert into dbversion(version, release, description) - values(125, now(), 'Work In Progress'); + values(126, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); create table indexer_configuration ( id serial not null, tool_name text not null, tool_version text not null, tool_configuration jsonb ); comment on table indexer_configuration is 'Indexer''s configuration version'; comment on column indexer_configuration.id is 'Tool identifier'; comment on column indexer_configuration.tool_version is 'Tool name'; comment on column indexer_configuration.tool_version is 'Tool version'; comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; -- Properties (mimetype, encoding, etc...) 
create table content_mimetype ( id sha1 not null, mimetype text not null, encoding text not null, indexer_configuration_id bigint not null ); comment on table content_mimetype is 'Metadata associated to a raw content'; comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; -- Language metadata create table content_language ( id sha1 not null, lang languages not null, indexer_configuration_id bigint not null ); comment on table content_language is 'Language information on a raw content'; comment on column content_language.lang is 'Language information'; comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; -- ctags information per content create table content_ctags ( id sha1 not null, name text not null, kind text not null, line bigint not null, lang ctags_languages not null, indexer_configuration_id bigint not null ); comment on table content_ctags is 'Ctags information on a raw content'; comment on column content_ctags.id is 'Content identifier'; comment on column content_ctags.name is 'Symbol name'; comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; comment on column content_ctags.line is 'Symbol line'; comment on column content_ctags.lang is 'Language information for that content'; comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; create table fossology_license( id smallserial, name text not null ); comment on table fossology_license is 'Possible license recognized by license indexer'; comment on column fossology_license.id is 'License identifier'; comment on column fossology_license.name is 'License name'; create table content_fossology_license ( id sha1 not null, license_id smallserial not null, indexer_configuration_id bigint not null ); comment on table content_fossology_license is 'license associated to a raw content'; comment on column content_fossology_license.id is 'Raw content identifier'; comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; -- The table content_metadata provides a translation to files -- identified as potentially containning metadata with a translation tool (indexer_configuration_id) create table content_metadata( id sha1 not null, metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table content_metadata is 'metadata semantically translated from a content file'; comment on column content_metadata.id is 'sha1 of content file'; comment on column content_metadata.metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; -- The table revision_intrinsic_metadata provides a minimal set of intrinsic -- metadata detected with the detection tool (indexer_configuration_id) and -- aggregated from the content_metadata translation. 
create table revision_intrinsic_metadata( id sha1_git not null, metadata jsonb not null, indexer_configuration_id bigint not null, mappings text array not null ); comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision'; comment on column revision_intrinsic_metadata.id is 'sha1_git of revision'; comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; create table origin_intrinsic_metadata( id bigserial not null, origin_url text, metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null, metadata_tsvector tsvector, mappings text array not null ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; comment on column origin_intrinsic_metadata.id is 'the entry id in origin'; comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql index acea556..69d0c05 100644 --- a/swh/indexer/sql/40-swh-func.sql +++ b/swh/indexer/sql/40-swh-func.sql @@ -1,453 +1,456 @@ -- Postgresql index helper function create or replace function hash_sha1(text) returns text language sql strict immutable as $$ select encode(public.digest($1, 'sha1'), 'hex') $$; comment on function hash_sha1(text) is 'Compute sha1 hash as text'; -- create a temporary table called tmp_TBLNAME, mimicking existing table -- TBLNAME -- -- Args: -- tblname: name of the table to mimick create or replace function swh_mktemp(tblname regclass) returns void language plpgsql as $$ begin execute format(' create temporary table tmp_%1$I (like %1$I including defaults) on commit drop; alter table tmp_%1$I drop column if exists object_id; ', tblname); return; end $$; -- create a temporary table for content_mimetype tmp_content_mimetype, create or replace function swh_mktemp_content_mimetype() returns void language sql as $$ create temporary table tmp_content_mimetype ( like content_mimetype including defaults ) on commit drop; $$; comment on function swh_mktemp_content_mimetype() IS 'Helper table to add mimetype information'; -- add tmp_content_mimetype entries to content_mimetype, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_content_mimetype_missing must take place before calling this -- function. -- -- -- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype, -- 2. 
call this function create or replace function swh_content_mimetype_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) select id, mimetype, encoding, indexer_configuration_id from tmp_content_mimetype tcm on conflict(id, indexer_configuration_id) do update set mimetype = excluded.mimetype, encoding = excluded.encoding; else insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) select id, mimetype, encoding, indexer_configuration_id from tmp_content_mimetype tcm on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes'; -- add tmp_content_language entries to content_language, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_content_language_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_content_language, 2. call this function create or replace function swh_content_language_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into content_language (id, lang, indexer_configuration_id) select id, lang, indexer_configuration_id from tmp_content_language tcl on conflict(id, indexer_configuration_id) do update set lang = excluded.lang; else insert into content_language (id, lang, indexer_configuration_id) select id, lang, indexer_configuration_id from tmp_content_language tcl on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_content_language_add(boolean) IS 'Add new content languages'; -- create a temporary table for retrieving content_language create or replace function swh_mktemp_content_language() returns void language sql as $$ create temporary table tmp_content_language ( like content_language including defaults ) on commit drop; $$; comment on function swh_mktemp_content_language() is 'Helper table to add content language'; -- create a temporary table for content_ctags tmp_content_ctags, create or replace function swh_mktemp_content_ctags() returns void language sql as $$ create temporary table tmp_content_ctags ( like content_ctags including defaults ) on commit drop; $$; comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; -- add tmp_content_ctags entries to content_ctags, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags, -- 2. 
call this function create or replace function swh_content_ctags_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then delete from content_ctags where id in (select tmp.id from tmp_content_ctags tmp inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); end if; insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) select id, name, kind, line, lang, indexer_configuration_id from tmp_content_ctags tct on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) do nothing; return; end $$; comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content'; create type content_ctags_signature as ( id sha1, name text, kind text, line bigint, lang ctags_languages, tool_id integer, tool_name text, tool_version text, tool_configuration jsonb ); -- Search within ctags content. -- create or replace function swh_content_ctags_search( expression text, l integer default 10, last_sha1 sha1 default '\x0000000000000000000000000000000000000000') returns setof content_ctags_signature language sql as $$ select c.id, name, kind, line, lang, i.id as tool_id, tool_name, tool_version, tool_configuration from content_ctags c inner join indexer_configuration i on i.id = c.indexer_configuration_id where hash_sha1(name) = hash_sha1(expression) and c.id > last_sha1 order by id limit l; $$; comment on function swh_content_ctags_search(text, integer, sha1) IS 'Equality search through ctags'' symbols'; -- create a temporary table for content_fossology_license tmp_content_fossology_license, create or replace function swh_mktemp_content_fossology_license() returns void language sql as $$ create temporary table tmp_content_fossology_license ( id sha1, license text, indexer_configuration_id integer ) on commit drop; $$; comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license'; -- add tmp_content_fossology_license entries to content_fossology_license, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to -- tmp_content_fossology_license, 2. 
call this function create or replace function swh_content_fossology_license_add(conflict_update boolean) returns void language plpgsql as $$ begin -- insert unknown licenses first insert into fossology_license (name) select distinct license from tmp_content_fossology_license tmp where not exists (select 1 from fossology_license where name=tmp.license) on conflict(name) do nothing; if conflict_update then -- delete from content_fossology_license c -- using tmp_content_fossology_license tmp, indexer_configuration i -- where c.id = tmp.id and i.id=tmp.indexer_configuration_id delete from content_fossology_license where id in (select tmp.id from tmp_content_fossology_license tmp inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); end if; insert into content_fossology_license (id, license_id, indexer_configuration_id) select tcl.id, (select id from fossology_license where name = tcl.license) as license, indexer_configuration_id from tmp_content_fossology_license tcl on conflict(id, license_id, indexer_configuration_id) do nothing; return; end $$; comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; -- content_metadata functions -- add tmp_content_metadata entries to content_metadata, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_content_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_content_metadata, 2. call this function create or replace function swh_content_metadata_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into content_metadata (id, metadata, indexer_configuration_id) select id, metadata, indexer_configuration_id from tmp_content_metadata tcm on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata; else insert into content_metadata (id, metadata, indexer_configuration_id) select id, metadata, indexer_configuration_id from tmp_content_metadata tcm on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata'; -- create a temporary table for retrieving content_metadata create or replace function swh_mktemp_content_metadata() returns void language sql as $$ create temporary table tmp_content_metadata ( like content_metadata including defaults ) on commit drop; $$; comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; -- end content_metadata functions -- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata, -- overwriting duplicates if conflict_update is true, skipping duplicates -- otherwise. -- -- If filtering duplicates is in order, the call to -- swh_revision_intrinsic_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_revision_intrinsic_metadata, 2. 
call this function create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) select id, metadata, mappings, indexer_configuration_id from tmp_revision_intrinsic_metadata tcm on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, mappings = excluded.mappings; else insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) select id, metadata, mappings, indexer_configuration_id from tmp_revision_intrinsic_metadata tcm on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_revision_intrinsic_metadata_add(boolean) IS 'Add new revision intrinsic metadata'; -- create a temporary table for retrieving revision_intrinsic_metadata create or replace function swh_mktemp_revision_intrinsic_metadata() returns void language sql as $$ create temporary table tmp_revision_intrinsic_metadata ( like revision_intrinsic_metadata including defaults ) on commit drop; $$; comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata'; -- create a temporary table for retrieving origin_intrinsic_metadata create or replace function swh_mktemp_origin_intrinsic_metadata() returns void language sql as $$ create temporary table tmp_origin_intrinsic_metadata ( like origin_intrinsic_metadata including defaults ) on commit drop; $$; comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata'; create or replace function swh_mktemp_indexer_configuration() returns void language sql as $$ create temporary table tmp_indexer_configuration ( like indexer_configuration including defaults ) on commit drop; alter table tmp_indexer_configuration drop column id; $$; -- add tmp_indexer_configuration entries to indexer_configuration, -- skipping duplicates if any. -- -- operates in bulk: 0. create temporary tmp_indexer_configuration, 1. COPY to -- it, 2. call this function to insert and filtering out duplicates create or replace function swh_indexer_configuration_add() returns setof indexer_configuration language plpgsql as $$ begin insert into indexer_configuration(tool_name, tool_version, tool_configuration) select tool_name, tool_version, tool_configuration from tmp_indexer_configuration tmp on conflict(tool_name, tool_version, tool_configuration) do nothing; return query select id, tool_name, tool_version, tool_configuration from tmp_indexer_configuration join indexer_configuration using(tool_name, tool_version, tool_configuration); return; end $$; -- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata, -- overwriting duplicates if conflict_update is true, skipping duplicates -- otherwise. -- -- If filtering duplicates is in order, the call to -- swh_origin_intrinsic_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_origin_intrinsic_metadata, 2. 
call this function create or replace function swh_origin_intrinsic_metadata_add( conflict_update boolean) returns void language plpgsql as $$ begin perform swh_origin_intrinsic_metadata_compute_tsvector(); if conflict_update then insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) select id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, - mappings = excluded.mappings; + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + origin_url = excluded.origin_url, + from_revision = excluded.from_revision; else insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) select id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; -- Compute the metadata_tsvector column in tmp_origin_intrinsic_metadata. -- -- It uses the "pg_catalog.simple" dictionary, as it has no stopword, -- so it should be suitable for proper names and non-English text. create or replace function swh_origin_intrinsic_metadata_compute_tsvector() returns void language plpgsql as $$ begin update tmp_origin_intrinsic_metadata set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata); end $$; diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py index 085c8cd..ec4c234 100644 --- a/swh/indexer/storage/api/client.py +++ b/swh/indexer/storage/api/client.py @@ -1,17 +1,17 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.core.api import SWHRemoteAPI +from swh.core.api import RPCClient from swh.storage.exc import StorageAPIError from .. 
import IndexerStorage -class RemoteStorage(SWHRemoteAPI): +class RemoteStorage(RPCClient): """Proxy to a remote storage API""" backend_class = IndexerStorage api_exception = StorageAPIError diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py index 04ea61a..7edfec6 100644 --- a/swh/indexer/storage/api/server.py +++ b/swh/indexer/storage/api/server.py @@ -1,106 +1,106 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import logging from swh.core import config -from swh.core.api import (SWHServerAPIApp, error_handler, +from swh.core.api import (RPCServerApp, error_handler, encode_data_server as encode_data) from swh.indexer.storage import ( get_indexer_storage, INDEXER_CFG_KEY, IndexerStorage ) def get_storage(): global storage if not storage: storage = get_indexer_storage(**app.config[INDEXER_CFG_KEY]) return storage -app = SWHServerAPIApp(__name__, - backend_class=IndexerStorage, - backend_factory=get_storage) +app = RPCServerApp(__name__, + backend_class=IndexerStorage, + backend_factory=get_storage) storage = None @app.errorhandler(Exception) def my_error_handler(exception): return error_handler(exception, encode_data) @app.route('/') def index(): return 'SWH Indexer Storage API server' api_cfg = None def load_and_check_config(config_file, type='local'): """Check the minimal configuration is set to run the api or raise an error explanation. Args: config_file (str): Path to the configuration file to load type (str): configuration type. For 'local' type, more checks are done. Raises: Error if the setup is not as expected Returns: configuration as a dict """ if not config_file: raise EnvironmentError('Configuration file must be defined') if not os.path.exists(config_file): raise FileNotFoundError('Configuration file %s does not exist' % ( config_file, )) cfg = config.read(config_file) if 'indexer_storage' not in cfg: raise KeyError("Missing '%indexer_storage' configuration") if type == 'local': vcfg = cfg['indexer_storage'] cls = vcfg.get('cls') if cls != 'local': raise ValueError( "The indexer_storage backend can only be started with a " "'local' configuration") args = vcfg['args'] if not args.get('db'): raise ValueError( "Invalid configuration; missing 'db' config entry") return cfg def make_app_from_configfile(): """Run the WSGI app from the webserver, loading the configuration from a configuration file. SWH_CONFIG_FILENAME environment variable defines the configuration path to load. """ global api_cfg if not api_cfg: config_file = os.environ.get('SWH_CONFIG_FILENAME') api_cfg = load_and_check_config(config_file) app.config.update(api_cfg) handler = logging.StreamHandler() app.logger.addHandler(handler) return app if __name__ == '__main__': print('Deprecated. 
Use swh-indexer') diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index ef5c6d3..b11806a 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1,1987 +1,1994 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import threading import unittest import pytest from hypothesis import given from swh.model.hashutil import hash_to_bytes from swh.indexer.storage import get_indexer_storage, MAPPING_NAMES from swh.core.db.tests.db_testing import SingleDbTestFixture from swh.indexer.tests.storage.generate_data_test import ( gen_content_mimetypes, gen_content_fossology_licenses ) from swh.indexer.tests.storage import SQL_DIR from swh.indexer.metadata_dictionary import MAPPINGS TOOLS = [ { 'tool_name': 'universal-ctags', 'tool_version': '~git7859817b', 'tool_configuration': { "command_line": "ctags --fields=+lnz --sort=no --links=no " "--output-format=json "} }, { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "NpmMapping"}, }, { 'tool_name': 'swh-metadata-detector', 'tool_version': '0.0.1', 'tool_configuration': { "type": "local", "context": ["NpmMapping", "CodemetaMapping"]}, }, { 'tool_name': 'swh-metadata-detector2', 'tool_version': '0.0.1', 'tool_configuration': { "type": "local", "context": ["NpmMapping", "CodemetaMapping"]}, }, { 'tool_name': 'file', 'tool_version': '5.22', 'tool_configuration': {"command_line": "file --mime "}, }, { 'tool_name': 'pygments', 'tool_version': '2.0.1+dfsg-1.1+deb8u1', 'tool_configuration': { "type": "library", "debian-package": "python3-pygments"}, }, { 'tool_name': 'pygments', 'tool_version': '2.0.1+dfsg-1.1+deb8u1', 'tool_configuration': { "type": "library", "debian-package": "python3-pygments", "max_content_size": 10240 }, }, { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } ] @pytest.mark.db class BasePgTestStorage(SingleDbTestFixture): """Base test class for most indexer tests. It adds support for Storage testing to the SingleDbTestFixture class. It will also build the database from the swh-indexed/sql/*.sql files. 
""" TEST_DB_NAME = 'softwareheritage-test-indexer' TEST_DB_DUMP = os.path.join(SQL_DIR, '*.sql') def setUp(self): super().setUp() self.storage_config = { 'cls': 'local', 'args': { 'db': 'dbname=%s' % self.TEST_DB_NAME, }, } def tearDown(self): self.reset_storage_tables() self.storage = None super().tearDown() def reset_storage_tables(self): excluded = {'indexer_configuration'} self.reset_db_tables(self.TEST_DB_NAME, excluded=excluded) db = self.test_db[self.TEST_DB_NAME] db.conn.commit() def gen_generic_endpoint_tests(endpoint_type, tool_name, example_data1, example_data2): def rename(f): f.__name__ = 'test_' + endpoint_type + f.__name__ return f def endpoint(self, endpoint_name): return getattr(self.storage, endpoint_type + '_' + endpoint_name) @rename def missing(self): # given tool_id = self.tools[tool_name]['id'] query = [ { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }] # when actual_missing = endpoint(self, 'missing')(query) # then self.assertEqual(list(actual_missing), [ self.sha1_1, self.sha1_2, ]) # given endpoint(self, 'add')([{ 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, }]) # when actual_missing = endpoint(self, 'missing')(query) # then self.assertEqual(list(actual_missing), [self.sha1_1]) @rename def add__drop_duplicate(self): # given tool_id = self.tools[tool_name]['id'] data_v1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # given endpoint(self, 'add')([data_v1]) # when actual_data = list(endpoint(self, 'get')([self.sha1_2])) # then expected_data_v1 = [{ 'id': self.sha1_2, **example_data1, 'tool': self.tools[tool_name], }] self.assertEqual(actual_data, expected_data_v1) # given data_v2 = data_v1.copy() data_v2.update(example_data2) endpoint(self, 'add')([data_v2]) actual_data = list(endpoint(self, 'get')([self.sha1_2])) # data did not change as the v2 was dropped. self.assertEqual(actual_data, expected_data_v1) @rename def add__update_in_place_duplicate(self): # given tool_id = self.tools[tool_name]['id'] data_v1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # given endpoint(self, 'add')([data_v1]) # when actual_data = list(endpoint(self, 'get')([self.sha1_2])) expected_data_v1 = [{ 'id': self.sha1_2, **example_data1, 'tool': self.tools[tool_name], }] # then self.assertEqual(actual_data, expected_data_v1) # given data_v2 = data_v1.copy() data_v2.update(example_data2) endpoint(self, 'add')([data_v2], conflict_update=True) actual_data = list(endpoint(self, 'get')([self.sha1_2])) expected_data_v2 = [{ 'id': self.sha1_2, **example_data2, 'tool': self.tools[tool_name], }] # data did change as the v2 was used to overwrite v1 self.assertEqual(actual_data, expected_data_v2) @rename def add__update_in_place_deadlock(self): # given tool_id = self.tools[tool_name]['id'] hashes = [ hash_to_bytes( '34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}'.format(i)) for i in range(1000)] data_v1 = [ { 'id': hash_, **example_data1, 'indexer_configuration_id': tool_id, } for hash_ in hashes ] data_v2 = [ { 'id': hash_, **example_data2, 'indexer_configuration_id': tool_id, } for hash_ in hashes ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. 
data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given endpoint(self, 'add')(data_v1) # when actual_data = list(endpoint(self, 'get')(hashes)) expected_data_v1 = [ { 'id': hash_, **example_data1, 'tool': self.tools[tool_name], } for hash_ in hashes ] # then self.assertEqual(actual_data, expected_data_v1) # given def f1(): endpoint(self, 'add')(data_v2a, conflict_update=True) def f2(): endpoint(self, 'add')(data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = list(endpoint(self, 'get')(hashes)) expected_data_v2 = [ { 'id': hash_, **example_data2, 'tool': self.tools[tool_name], } for hash_ in hashes ] self.assertCountEqual(actual_data, expected_data_v2) def add__duplicate_twice(self): # given tool_id = self.tools[tool_name]['id'] data_rev1 = { 'id': self.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id } data_rev2 = { 'id': self.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id } # when endpoint(self, 'add')([data_rev1]) with self.assertRaises(ValueError): endpoint(self, 'add')( [data_rev2, data_rev2], conflict_update=True) # then actual_data = list(endpoint(self, 'get')( [self.revision_id_2, self.revision_id_1])) expected_data = [{ 'id': self.revision_id_2, **example_data1, 'tool': self.tools[tool_name] }] self.assertEqual(actual_data, expected_data) @rename def get(self): # given tool_id = self.tools[tool_name]['id'] query = [self.sha1_2, self.sha1_1] data1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # when endpoint(self, 'add')([data1]) # then actual_data = list(endpoint(self, 'get')(query)) # then expected_data = [{ 'id': self.sha1_2, **example_data1, 'tool': self.tools[tool_name] }] self.assertEqual(actual_data, expected_data) @rename def delete(self): # given tool_id = self.tools[tool_name]['id'] query = [self.sha1_2, self.sha1_1] data1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # when endpoint(self, 'add')([data1]) endpoint(self, 'delete')([ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, } ]) # then actual_data = list(endpoint(self, 'get')(query)) # then self.assertEqual(actual_data, []) @rename def delete_nonexisting(self): tool_id = self.tools[tool_name]['id'] endpoint(self, 'delete')([ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, } ]) return ( missing, add__drop_duplicate, add__update_in_place_duplicate, add__update_in_place_deadlock, add__duplicate_twice, get, delete, delete_nonexisting, ) class CommonTestStorage: """Base class for Indexer Storage testing. 
""" def setUp(self): super().setUp() self.storage = get_indexer_storage(**self.storage_config) tools = self.storage.indexer_configuration_add(TOOLS) self.tools = {} for tool in tools: tool_name = tool['tool_name'] while tool_name in self.tools: tool_name += '_' self.tools[tool_name] = { 'id': tool['id'], 'name': tool['tool_name'], 'version': tool['tool_version'], 'configuration': tool['tool_configuration'], } self.sha1_1 = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689') self.sha1_2 = hash_to_bytes('61c2b3a30496d329e21af70dd2d7e097046d07b7') self.revision_id_1 = hash_to_bytes( '7026b7c1a2af56521e951c01ed20f255fa054238') self.revision_id_2 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904321') self.revision_id_3 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904320') self.origin_id_1 = 44434341 self.origin_id_2 = 44434342 self.origin_id_3 = 54974445 def test_check_config(self): self.assertTrue(self.storage.check_config(check_write=True)) self.assertTrue(self.storage.check_config(check_write=False)) # generate content_mimetype tests ( test_content_mimetype_missing, test_content_mimetype_add__drop_duplicate, test_content_mimetype_add__update_in_place_duplicate, test_content_mimetype_add__update_in_place_deadlock, test_content_mimetype_add__duplicate_twice, test_content_mimetype_get, _, # content_mimetype_detete, _, # content_mimetype_detete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_mimetype', tool_name='file', example_data1={ 'mimetype': 'text/plain', 'encoding': 'utf-8', }, example_data2={ 'mimetype': 'text/html', 'encoding': 'us-ascii', }, ) # content_language tests ( test_content_language_missing, test_content_language_add__drop_duplicate, test_content_language_add__update_in_place_duplicate, test_content_language_add__update_in_place_deadlock, test_content_language_add__duplicate_twice, test_content_language_get, _, # test_content_language_delete, _, # test_content_language_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_language', tool_name='pygments', example_data1={ 'lang': 'haskell', }, example_data2={ 'lang': 'common-lisp', }, ) # content_ctags tests ( test_content_ctags_missing, # the following tests are disabled because CTAGS behave differently _, # test_content_ctags_add__drop_duplicate, _, # test_content_ctags_add__update_in_place_duplicate, _, # test_content_ctags_add__update_in_place_deadlock, _, # test_content_ctags_add__duplicate_twice, _, # test_content_ctags_get, _, # test_content_ctags_delete, _, # test_content_ctags_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_ctags', tool_name='universal-ctags', example_data1={ 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 119, 'lang': 'OCaml', }] }, example_data2={ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Python', }, { 'name': 'main', 'kind': 'function', 'line': 119, 'lang': 'Python', }] }, ) def test_content_ctags_search(self): # 1. 
given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag1 = { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }, { 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, ] } ctag2 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, { 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }, ] } self.storage.content_ctags_add([ctag1, ctag2]) # 1. when actual_ctags = list(self.storage.content_ctags_search('hello', limit=1)) # 1. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', } ]) # 2. when actual_ctags = list(self.storage.content_ctags_search( 'hello', limit=1, last_sha1=ctag1['id'])) # 2. then self.assertEqual(actual_ctags, [ { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', } ]) # 3. when actual_ctags = list(self.storage.content_ctags_search('hello')) # 3. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, ]) # 4. when actual_ctags = list(self.storage.content_ctags_search('counter')) # then self.assertEqual(actual_ctags, [{ 'id': ctag1['id'], 'tool': tool, 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }]) # 5. 
when actual_ctags = list(self.storage.content_ctags_search('result', limit=1)) # then self.assertEqual(actual_ctags, [{ 'id': ctag2['id'], 'tool': tool, 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }]) def test_content_ctags_search_no_result(self): actual_ctags = list(self.storage.content_ctags_search('counter')) self.assertEqual(actual_ctags, []) def test_content_ctags_add__add_new_ctags_added(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) self.storage.content_ctags_add([ctag_v1]) # conflict does nothing # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [{ 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2]) expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) self.assertEqual(actual_ctags, expected_ctags) def test_content_ctags_add__update_in_place(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool } ] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }, { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2], conflict_update=True) actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # ctag did change as the v2 was used to overwrite v1 expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] self.assertEqual(actual_ctags, expected_ctags) # content_fossology_license tests ( _, # The endpoint content_fossology_license_missing does not exist # the following tests are disabled because fossology_license tests # behave differently _, # test_content_fossology_license_add__drop_duplicate, _, # test_content_fossology_license_add__update_in_place_duplicate, _, # test_content_fossology_license_add__update_in_place_deadlock, _, # test_content_metadata_add__duplicate_twice, _, # test_content_fossology_license_get, _, # test_content_fossology_license_delete, _, # test_content_fossology_license_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_fossology_license', tool_name='nomos', example_data1={ 'licenses': ['Apache-2.0'], }, example_data2={ 'licenses': ['BSD-2-Clause'], }, ) def 
test_content_fossology_license_add__new_license_added(self): # given tool = self.tools['nomos'] tool_id = tool['id'] license_v1 = { 'id': self.sha1_1, 'licenses': ['Apache-2.0'], 'indexer_configuration_id': tool_id, } # given self.storage.content_fossology_license_add([license_v1]) # conflict does nothing self.storage.content_fossology_license_add([license_v1]) # when actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) # then expected_license = { self.sha1_1: [{ 'licenses': ['Apache-2.0'], 'tool': tool, }] } self.assertEqual(actual_licenses, [expected_license]) # given license_v2 = license_v1.copy() license_v2.update({ 'licenses': ['BSD-2-Clause'], }) self.storage.content_fossology_license_add([license_v2]) actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) expected_license = { self.sha1_1: [{ 'licenses': ['Apache-2.0', 'BSD-2-Clause'], 'tool': tool }] } # license did not change as the v2 was dropped. self.assertEqual(actual_licenses, [expected_license]) # content_metadata tests ( test_content_metadata_missing, test_content_metadata_add__drop_duplicate, test_content_metadata_add__update_in_place_duplicate, test_content_metadata_add__update_in_place_deadlock, test_content_metadata_add__duplicate_twice, test_content_metadata_get, _, # test_content_metadata_delete, _, # test_content_metadata_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_metadata', tool_name='swh-metadata-detector', example_data1={ 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, }, example_data2={ 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, }, ) # revision_intrinsic_metadata tests ( test_revision_intrinsic_metadata_missing, test_revision_intrinsic_metadata_add__drop_duplicate, test_revision_intrinsic_metadata_add__update_in_place_duplicate, test_revision_intrinsic_metadata_add__update_in_place_deadlock, test_revision_intrinsic_metadata_add__duplicate_twice, test_revision_intrinsic_metadata_get, test_revision_intrinsic_metadata_delete, test_revision_intrinsic_metadata_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='revision_intrinsic_metadata', tool_name='swh-metadata-detector', example_data1={ 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping1'], }, example_data2={ 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping2'], }, ) def test_origin_intrinsic_metadata_get(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': self.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([metadata_origin]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, 42])) expected_metadata = [{ 'id': self.origin_id_1, 'origin_url': 
'file:///dev/zero', 'metadata': metadata, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, 'mappings': ['mapping1'], }] self.assertEqual(actual_metadata, expected_metadata) def test_origin_intrinsic_metadata_delete(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': self.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } metadata_origin2 = metadata_origin.copy() metadata_origin2['id'] = self.origin_id_2 # when self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin2]) self.storage.origin_intrinsic_metadata_delete([ { 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, self.origin_id_2, 42])) for item in actual_metadata: item['indexer_configuration_id'] = item.pop('tool')['id'] self.assertEqual(actual_metadata, [metadata_origin2]) def test_origin_intrinsic_metadata_delete_nonexisting(self): tool_id = self.tools['swh-metadata-detector']['id'] self.storage.origin_intrinsic_metadata_delete([ { 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) def test_origin_intrinsic_metadata_add_drop_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': self.revision_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1.copy(), 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': self.revision_id_1, } # given self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, 42])) expected_metadata_v1 = [{ 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_1, 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 self.storage.revision_intrinsic_metadata_add([metadata_rev_v2]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v2]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) # metadata did not change as the v2 was dropped. 
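# (origin_intrinsic_metadata_add was called here without conflict_update,
# so re-adding data for an id/tool pair that is already stored leaves the
# existing row untouched; the next test passes conflict_update=True to get
# the overwrite behaviour instead.)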
self.assertEqual(actual_metadata, expected_metadata_v1) def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': self.revision_id_2, 'metadata': metadata_v1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': self.revision_id_2, } # given self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) # then expected_metadata_v1 = [{ 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_update_duplicated_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 - metadata_origin_v2['metadata'] = metadata_v2 + metadata_origin_v2 = { + 'id': self.origin_id_1, + 'origin_url': 'file:///dev/null', + 'metadata': metadata_v2.copy(), + 'indexer_configuration_id': tool_id, + 'mappings': ['npm'], + 'from_revision': self.revision_id_1, + } self.storage.revision_intrinsic_metadata_add( [metadata_rev_v2], conflict_update=True) self.storage.origin_intrinsic_metadata_add( [metadata_origin_v2], conflict_update=True) actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) expected_metadata_v2 = [{ 'id': self.origin_id_1, - 'origin_url': 'file:///dev/zero', + 'origin_url': 'file:///dev/null', 'metadata': metadata_v2, 'tool': self.tools['swh-metadata-detector'], - 'from_revision': self.revision_id_2, - 'mappings': [], + 'from_revision': self.revision_id_1, + 'mappings': ['npm'], }] # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) def test_origin_intrinsic_metadata_add__update_in_place_deadlock(self): # given tool_id = self.tools['swh-metadata-detector']['id'] ids = list(range(10)) example_data1 = { 'metadata': { 'version': None, 'name': None, }, 'mappings': [], } example_data2 = { 'metadata': { 'version': 'v1.1.1', 'name': 'foo', }, 'mappings': [], } metadata_rev_v1 = { 'id': self.revision_id_2, 'metadata': { 'version': None, 'name': None, }, 'mappings': [], 'indexer_configuration_id': tool_id, } data_v1 = [ { 'id': id_, 'origin_url': 'file:///tmp/origin%d' % id_, 'from_revision': self.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id, } for id_ in ids ] data_v2 = [ { 'id': id_, 'origin_url': 'file:///tmp/origin%d' % id_, 'from_revision': self.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id, } for id_ in ids ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. 
data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add(data_v1) # when actual_data = list(self.storage.origin_intrinsic_metadata_get(ids)) expected_data_v1 = [ { 'id': id_, 'origin_url': 'file:///tmp/origin%d' % id_, 'from_revision': self.revision_id_2, **example_data1, 'tool': self.tools['swh-metadata-detector'], } for id_ in ids ] # then self.assertEqual(actual_data, expected_data_v1) # given def f1(): self.storage.origin_intrinsic_metadata_add( data_v2a, conflict_update=True) def f2(): self.storage.origin_intrinsic_metadata_add( data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = list(self.storage.origin_intrinsic_metadata_get(ids)) expected_data_v2 = [ { 'id': id_, 'origin_url': 'file:///tmp/origin%d' % id_, 'from_revision': self.revision_id_2, **example_data2, 'tool': self.tools['swh-metadata-detector'], } for id_ in ids ] self.maxDiff = None self.assertCountEqual(actual_data, expected_data_v2) def test_origin_intrinsic_metadata_add__duplicate_twice(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'developmentStatus': None, 'name': None, } metadata_rev = { 'id': self.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata_rev]) with self.assertRaises(ValueError): self.storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin]) def test_origin_intrinsic_metadata_search_fulltext(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata1 = { 'author': 'John Doe', } metadata1_rev = { 'id': self.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } metadata2 = { 'author': 'Jane Doe', } metadata2_rev = { 'id': self.revision_id_2, 'origin_url': 'file:///dev/zero', 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': self.origin_id_2, 'origin_url': 'file:///dev/zero', 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertCountEqual( [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( [res['id'] for res in search(['John', 'Doe'])], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John', 'Jane'])], []) def test_origin_intrinsic_metadata_search_fulltext_rank(self): # given tool_id = self.tools['swh-metadata-detector']['id'] # The following authors have "Random Person" to add some more content # 
to the JSON data, to work around normalization quirks when there # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words # for small values of nb_words). metadata1 = { 'author': [ 'Random Person', 'John Doe', 'Jane Doe', ] } metadata1_rev = { 'id': self.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } metadata2 = { 'author': [ 'Random Person', 'Jane Doe', ] } metadata2_rev = { 'id': self.revision_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': self.origin_id_2, 'origin_url': 'file:///dev/zero', 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertEqual( [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( [res['id'] for res in search(['Doe'], limit=1)], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['Jane'])], [self.origin_id_2, self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John', 'Jane'])], [self.origin_id_1]) def _fill_origin_intrinsic_metadata(self): tool1_id = self.tools['swh-metadata-detector']['id'] tool2_id = self.tools['swh-metadata-detector2']['id'] metadata1 = { '@context': 'foo', 'author': 'John Doe', } metadata1_rev = { 'id': self.revision_id_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, } metadata1_origin = { 'id': self.origin_id_1, 'origin_url': 'file:///dev/zero', 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, 'from_revision': self.revision_id_1, } metadata2 = { '@context': 'foo', 'author': 'Jane Doe', } metadata2_rev = { 'id': self.revision_id_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata2_origin = { 'id': self.origin_id_2, 'origin_url': 'file:///dev/zero', 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, 'from_revision': self.revision_id_2, } metadata3 = { '@context': 'foo', } metadata3_rev = { 'id': self.revision_id_3, 'metadata': metadata3, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata3_origin = { 'id': self.origin_id_3, 'origin_url': 'file:///dev/zero', 'metadata': metadata3, 'mappings': ['pkg-info'], 'indexer_configuration_id': tool2_id, 'from_revision': self.revision_id_3, } self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) self.storage.revision_intrinsic_metadata_add([metadata3_rev]) self.storage.origin_intrinsic_metadata_add([metadata3_origin]) def test_origin_intrinsic_metadata_search_by_producer(self): self._fill_origin_intrinsic_metadata() tool1 = self.tools['swh-metadata-detector'] tool2 = 
self.tools['swh-metadata-detector2'] endpoint = self.storage.origin_intrinsic_metadata_search_by_producer # test pagination self.assertCountEqual( endpoint(ids_only=True), [self.origin_id_1, self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(start=0, ids_only=True), [self.origin_id_1, self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(start=0, limit=2, ids_only=True), [self.origin_id_1, self.origin_id_2]) self.assertCountEqual( endpoint(start=self.origin_id_1+1, ids_only=True), [self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(start=self.origin_id_1+1, end=self.origin_id_3-1, ids_only=True), [self.origin_id_2]) # test mappings filtering self.assertCountEqual( endpoint(mappings=['npm'], ids_only=True), [self.origin_id_1, self.origin_id_2]) self.assertCountEqual( endpoint(mappings=['npm', 'gemspec'], ids_only=True), [self.origin_id_1, self.origin_id_2]) self.assertCountEqual( endpoint(mappings=['gemspec'], ids_only=True), [self.origin_id_2]) self.assertCountEqual( endpoint(mappings=['pkg-info'], ids_only=True), [self.origin_id_3]) self.assertCountEqual( endpoint(mappings=['foobar'], ids_only=True), []) # test pagination + mappings self.assertCountEqual( endpoint(mappings=['npm'], limit=1, ids_only=True), [self.origin_id_1]) # test tool filtering self.assertCountEqual( endpoint(tool_ids=[tool1['id']], ids_only=True), [self.origin_id_1]) self.assertCountEqual( endpoint(tool_ids=[tool2['id']], ids_only=True), [self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True), [self.origin_id_1, self.origin_id_2, self.origin_id_3]) # test ids_only=False self.assertEqual(list(endpoint(mappings=['gemspec'])), [{ 'id': self.origin_id_2, 'origin_url': 'file:///dev/zero', 'metadata': { '@context': 'foo', 'author': 'Jane Doe', }, 'mappings': ['npm', 'gemspec'], 'tool': tool2, 'from_revision': self.revision_id_2, }]) def test_origin_intrinsic_metadata_stats(self): self._fill_origin_intrinsic_metadata() result = self.storage.origin_intrinsic_metadata_stats() self.assertEqual(result, { 'per_mapping': { 'gemspec': 1, 'npm': 2, 'pkg-info': 1, 'codemeta': 0, 'maven': 0, }, 'total': 3, 'non_empty': 2, }) def test_indexer_configuration_add(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) # does not exist # add it actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEqual(len(actual_tools), 1) actual_tool = actual_tools[0] self.assertIsNotNone(actual_tool) # now it exists new_id = actual_tool.pop('id') self.assertEqual(actual_tool, tool) actual_tools2 = list(self.storage.indexer_configuration_add([tool])) actual_tool2 = actual_tools2[0] self.assertIsNotNone(actual_tool2) # now it exists new_id2 = actual_tool2.pop('id') self.assertEqual(new_id, new_id2) self.assertEqual(actual_tool, actual_tool2) def test_indexer_configuration_add_multiple(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEqual(len(actual_tools), 1) new_tools = [tool, { 'tool_name': 'yet-another-tool', 'tool_version': 'version', 'tool_configuration': {}, }] actual_tools = list(self.storage.indexer_configuration_add(new_tools)) self.assertEqual(len(actual_tools), 
2) # order not guaranteed, so we iterate over results to check for tool in actual_tools: _id = tool.pop('id') self.assertIsNotNone(_id) self.assertIn(tool, new_tools) def test_indexer_configuration_get_missing(self): tool = { 'tool_name': 'unknown-tool', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) def test_indexer_configuration_get(self): tool = { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } self.storage.indexer_configuration_add([tool]) actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() del actual_tool['id'] self.assertEqual(expected_tool, actual_tool) def test_indexer_configuration_metadata_get_missing_context(self): tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"context": "unknown-context"}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) def test_indexer_configuration_metadata_get(self): tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "NpmMapping"}, } self.storage.indexer_configuration_add([tool]) actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() expected_tool['id'] = actual_tool['id'] self.assertEqual(expected_tool, actual_tool) @pytest.mark.property_based def test_generate_content_mimetype_get_range_limit_none(self): """mimetype_get_range call with wrong limit input should fail""" with self.assertRaises(ValueError) as e: self.storage.content_mimetype_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) self.assertEqual(e.exception.args, ( 'Development error: limit should not be None',)) @pytest.mark.property_based @given(gen_content_mimetypes(min_size=1, max_size=4)) def test_generate_content_mimetype_get_range_no_limit(self, mimetypes): """mimetype_get_range returns mimetypes within range provided""" self.reset_storage_tables() # add mimetypes to storage self.storage.content_mimetype_add(mimetypes) # All ids from the db content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes tool_id = mimetypes[0]['indexer_configuration_id'] actual_result = self.storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(mimetypes), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_mimetypes(min_size=4, max_size=4)) def test_generate_content_mimetype_get_range_limit(self, mimetypes): """mimetype_get_range paginates results if limit exceeded""" self.reset_storage_tables() # add mimetypes to storage self.storage.content_mimetype_add(mimetypes) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes limited to 3 results limited_results = len(mimetypes) - 1 tool_id = mimetypes[0]['indexer_configuration_id'] actual_result = self.storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(limited_results, len(actual_ids)) 
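# 'next' is the id that did not fit in this page; the follow-up call below
# feeds it back as 'start' to retrieve the remaining entries.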
self.assertIsNotNone(actual_next) self.assertEqual(actual_next, content_ids[-1]) expected_mimetypes = content_ids[:-1] self.assertEqual(expected_mimetypes, actual_ids) # retrieve next part actual_results2 = self.storage.content_mimetype_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] self.assertIsNone(actual_next2) expected_mimetypes2 = [content_ids[-1]] self.assertEqual(expected_mimetypes2, actual_ids2) @pytest.mark.property_based def test_generate_content_fossology_license_get_range_limit_none(self): """license_get_range call with wrong limit input should fail""" with self.assertRaises(ValueError) as e: self.storage.content_fossology_license_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) self.assertEqual(e.exception.args, ( 'Development error: limit should not be None',)) @pytest.mark.property_based def prepare_mimetypes_from(self, fossology_licenses): """Fossology license needs some consistent data in db to run. """ mimetypes = [] for c in fossology_licenses: mimetypes.append({ 'id': c['id'], 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': c['indexer_configuration_id'], }) return mimetypes @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=1, max_size=4)) def test_generate_content_fossology_license_get_range_no_limit( self, fossology_licenses): """license_get_range returns licenses within range provided""" self.reset_storage_tables() # craft some consistent mimetypes mimetypes = self.prepare_mimetypes_from(fossology_licenses) self.storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage self.storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(fossology_licenses), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=1, max_size=4), gen_content_mimetypes(min_size=1, max_size=1)) def test_generate_content_fossology_license_get_range_no_limit_with_filter( self, fossology_licenses, mimetypes): """This filters non textual, then returns results within range""" self.reset_storage_tables() # craft some consistent mimetypes _mimetypes = self.prepare_mimetypes_from(fossology_licenses) # add binary mimetypes which will get filtered out in results for m in mimetypes: _mimetypes.append({ 'mimetype': 'binary', **m, }) self.storage.content_mimetype_add(_mimetypes) # add fossology_licenses to storage self.storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(fossology_licenses), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) 
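# Both *_get_range endpoints exercised above share the same paging contract:
# they return a dict with 'ids' (the current page) and 'next' (the id to
# pass back as 'start' for the following page, or None once the range is
# exhausted). A minimal client-side sketch of walking a full range; the
# helper name and the page_size default are illustrative and not part of
# the API under test:
def iter_mimetype_range(idx_storage, start, end, tool_id, page_size=100):
    """Yield every content id indexed by tool_id within [start, end]."""
    while start is not None:
        page = idx_storage.content_mimetype_get_range(
            start, end, indexer_configuration_id=tool_id, limit=page_size)
        yield from page['ids']
        start = page['next']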
@pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=4, max_size=4)) def test_generate_fossology_license_get_range_limit( self, fossology_licenses): """fossology_license_get_range paginates results if limit exceeded""" self.reset_storage_tables() # craft some consistent mimetypes mimetypes = self.prepare_mimetypes_from(fossology_licenses) # add fossology_licenses to storage self.storage.content_mimetype_add(mimetypes) self.storage.content_fossology_license_add(fossology_licenses) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses limited to 3 results limited_results = len(fossology_licenses) - 1 tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(limited_results, len(actual_ids)) self.assertIsNotNone(actual_next) self.assertEqual(actual_next, content_ids[-1]) expected_fossology_licenses = content_ids[:-1] self.assertEqual(expected_fossology_licenses, actual_ids) # retrieve next part actual_results2 = self.storage.content_fossology_license_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] self.assertIsNone(actual_next2) expected_fossology_licenses2 = [content_ids[-1]] self.assertEqual(expected_fossology_licenses2, actual_ids2) @pytest.mark.db class IndexerTestStorage(CommonTestStorage, BasePgTestStorage, unittest.TestCase): """Running the tests locally. For the client api tests (remote storage), see `class`:swh.indexer.storage.test_api_client:TestRemoteStorage class. """ pass def test_mapping_names(): assert set(MAPPING_NAMES) == {m.name for m in MAPPINGS.values()} diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 1f50746..045f4de 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,1210 +1,1207 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import unittest from hypothesis import given, strategies, settings, HealthCheck from swh.model.hashutil import hash_to_bytes from swh.indexer.codemeta import CODEMETA_TERMS, CROSSWALK_TABLE +from swh.indexer.codemeta import merge_documents from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_dictionary.base import merge_values from swh.indexer.metadata_detector import ( - detect_metadata, extract_minimal_metadata_dict + detect_metadata ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer ) from .utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage, YARN_PARSER_METADATA, json_document_strategy, xml_document_strategy, ) TRANSLATOR_TOOL = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, 'should not be called; the rev indexer configures it.' 
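# The mapping classes exercised by the tests below are used directly, with
# no indexer scaffolding: instantiate an entry of the MAPPINGS registry
# imported above and feed it raw file content. A small sketch (the literal
# content and the resulting fields are illustrative; translate() returns
# None when the content cannot be parsed, as test_compute_metadata_none
# below checks):
#
#   npm = MAPPINGS['NpmMapping']()
#   doc = npm.translate(b'{"name": "foo", "version": "0.0.2"}')
#   # doc is a CodeMeta dict such as:
#   # {'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
#   #  'type': 'SoftwareSourceCode', 'name': 'foo', 'version': '0.0.2'}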
REVISION_METADATA_CONFIG = { **BASE_TEST_CONFIG, 'tools': TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.npm_mapping = MAPPINGS['NpmMapping']() self.codemeta_mapping = MAPPINGS['CodemetaMapping']() self.maven_mapping = MAPPINGS['MavenMapping']() self.pkginfo_mapping = MAPPINGS['PythonPkginfoMapping']() self.gemspec_mapping = MAPPINGS['GemspecMapping']() def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { 'repository': 'http://schema.org/codeRepository', 'os': 'http://schema.org/operatingSystem', 'cpu': 'http://schema.org/processorRequirements', 'engines': 'http://schema.org/processorRequirements', 'author': 'http://schema.org/author', 'author.email': 'http://schema.org/email', 'author.name': 'http://schema.org/name', 'contributor': 'http://schema.org/contributor', 'keywords': 'http://schema.org/keywords', 'license': 'http://schema.org/license', 'version': 'http://schema.org/version', 'description': 'http://schema.org/description', 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', 'homepage': 'http://schema.org/url' }) def test_merge_values(self): self.assertEqual( merge_values('a', 'b'), ['a', 'b']) self.assertEqual( merge_values(['a', 'b'], 'c'), ['a', 'b', 'c']) self.assertEqual( merge_values('a', ['b', 'c']), ['a', 'b', 'c']) self.assertEqual( merge_values({'@list': ['a']}, {'@list': ['b']}), {'@list': ['a', 'b']}) self.assertEqual( merge_values({'@list': ['a', 'b']}, {'@list': ['c']}), {'@list': ['a', 'b', 'c']}) with self.assertRaises(ValueError): merge_values({'@list': ['a']}, 'b') with self.assertRaises(ValueError): merge_values('a', {'@list': ['b']}) with self.assertRaises(ValueError): merge_values({'@list': ['a']}, ['b']) with self.assertRaises(ValueError): merge_values(['a'], {'@list': ['b']}) self.assertEqual( merge_values('a', None), 'a') self.assertEqual( merge_values(['a', 'b'], None), ['a', 'b']) self.assertEqual( merge_values(None, ['b', 'c']), ['b', 'c']) self.assertEqual( merge_values({'@list': ['a']}, None), {'@list': ['a']}) self.assertEqual( merge_values(None, {'@list': ['a']}), {'@list': ['a']}) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'author': [{ 'type': 'Person', 'name': 'Morane G', 'email': 'moranegg@example.com', }], } # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) - def test_extract_minimal_metadata_dict(self): + def test_merge_documents(self): """ Test the creation of a coherent 
minimal metadata set """ # given metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', 'author': 'moranegg', }] # when - results = extract_minimal_metadata_dict(metadata_list) + results = merge_documents(metadata_list) # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', "version": '0.0.2', "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], "author": ['moranegg'], "codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config['tools'] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = list(metadata_indexer.idx_storage.content_metadata_get( sha1s)) expected_results = [{ 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), }, { 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/npm/npm/issues', 'author': [{ 'type': 'Person', 'name': 'Isaac Z. 
Schlueter', 'email': 'i@izs.me', 'url': 'http://blog.izs.me', }], 'codeRepository': 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', 'license': 'https://spdx.org/licenses/Artistic-2.0', 'version': '5.0.3', 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') }] for result in results: del result['tool'] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', }) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = self.npm_mapping.translate(package_json) expected_result = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://gitlab.com/user/repo.git', 'type': 'SoftwareSourceCode', }) def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 
'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = ( b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation" }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": [ "metadata", "software" ], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_codemeta_alternate_context(self): 
raw_content = ( b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'codeRepository': 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error parsing XML from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_unknown_encoding(self): expected_warning = ( 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error detecting XML encoding from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_invalid_encoding(self): expected_warning = ( 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error unidecoding XML from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) 
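        # The assertion just below expects 'codeRepository' to be the default
        # central repository URL joined with the groupId (dots turned into
        # slashes) and the artifactId.  A tiny standalone sketch of that
        # composition -- an illustration only, not the MavenMapping code, and
        # guess_code_repository is a made-up name:
        def guess_code_repository(group_id, artifact_id,
                                  repo='https://repo.maven.apache.org/maven2'):
            return '{}/{}/{}'.format(repo, group_id.replace('.', '/'),
                                     artifact_id)

        assert guess_code_repository('com.mycompany.app', 'my-app') == \
            'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app'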
self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_empty_nodes(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) raw_content = b""" 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) raw_content = b""" 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'version': '1.2.3', }) def test_compute_metadata_maven_invalid_licenses(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 foo """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_multiple(self): '''Tests when there are multiple code repos and licenses.''' raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': [ 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'https://opensource.org/licenses/MIT', ], 'codeRepository': [ 
'http://repo1.maven.org/maven2/com/mycompany/app/my-app', 'http://example.org/maven2/com/mycompany/app/my-app', ] }) def test_compute_metadata_pkginfo(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== \x20 core library for swh's modules: - config parser - hash computations - serialization - logging mechanism \x20 Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """) # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertCountEqual(result['description'], [ 'Software Heritage core utilities', # note the comma here 'swh-core\n' '========\n' '\n' "core library for swh's modules:\n" '- config parser\n' '- hash computations\n' '- serialization\n' '- logging mechanism\n' ''], result) del result['description'] self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'url': 'https://forge.softwareheritage.org/diffusion/DCORE/', 'name': 'swh.core', 'author': [{ 'type': 'Person', 'name': 'Software Heritage developers', 'email': 'swh-devel@inria.fr', }], 'version': '0.0.49', }) def test_compute_metadata_pkginfo_utf8(self): raw_content = (b'''\ Metadata-Version: 1.1 Name: snowpyt Description-Content-Type: UNKNOWN Description: foo Hydrology N\xc2\xb083 ''') # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'snowpyt', 'description': 'foo\nHydrology N°83', }) def test_compute_metadata_pkginfo_keywords(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: foo Keywords: foo bar baz """) # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'foo', 'keywords': ['foo', 'bar', 'baz'], }) def test_compute_metadata_pkginfo_license(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: foo License: MIT """) # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'foo', 'license': 'MIT', }) def test_gemspec_base(self): raw_content = b""" Gem::Specification.new do |s| s.name = 'example' s.version = '0.1.0' s.licenses = ['MIT'] s.summary = "This is an example!" s.description = "Much longer explanation of the example!" s.authors = ["Ruby Coder"] s.email = 'rubycoder@example.com' s.files = ["lib/example.rb"] s.homepage = 'https://rubygems.org/gems/example' s.metadata = { "source_code_uri" => "https://github.com/example/example" } end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual(result.pop('description'), [ "This is an example!", "Much longer explanation of the example!" 
]) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'author': ['Ruby Coder'], 'name': 'example', 'license': 'https://spdx.org/licenses/MIT', 'codeRepository': 'https://rubygems.org/gems/example', 'email': 'rubycoder@example.com', 'version': '0.1.0', }) def test_gemspec_two_author_fields(self): raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1"] s.author = "Ruby Coder2" end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual(result.pop('author'), [ 'Ruby Coder1', 'Ruby Coder2']) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_gemspec_invalid_author(self): raw_content = b""" Gem::Specification.new do |s| s.author = ["Ruby Coder"] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) raw_content = b""" Gem::Specification.new do |s| s.author = "Ruby Coder1", end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1", ["Ruby Coder2"]] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'author': ['Ruby Coder1'], }) def test_gemspec_alternative_header(self): raw_content = b""" require './lib/version' Gem::Specification.new { |s| s.name = 'rb-system-with-aliases' s.summary = 'execute system commands with aliases' } """ result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'rb-system-with-aliases', 'description': 'execute system commands with aliases', }) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy( keys=list(MAPPINGS['NpmMapping'].mapping))) def test_npm_adversarial(self, doc): raw = json.dumps(doc).encode() self.npm_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=CODEMETA_TERMS)) def test_codemeta_adversarial(self, doc): raw = json.dumps(doc).encode() self.codemeta_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(xml_document_strategy( keys=list(MAPPINGS['MavenMapping'].mapping), root='project', xmlns='http://maven.apache.org/POM/4.0.0')) def test_maven_adversarial(self, doc): self.maven_mapping.translate(doc) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(strategies.dictionaries( # keys strategies.one_of( strategies.text(), *map(strategies.just, MAPPINGS['GemspecMapping'].mapping) ), # values strategies.recursive( strategies.characters(), lambda children: strategies.lists(children, 1) ) )) def test_gemspec_adversarial(self, doc): parts = [b'Gem::Specification.new do |s|\n'] for (k, v) in doc.items(): parts.append(' s.{} = {}\n'.format(k, repr(v)).encode()) parts.append(b'end\n') self.gemspec_mapping.translate(b''.join(parts)) def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataIndexer( config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = 
metadata_indexer.idx_storage.indexer_configuration_get( {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) assert tool is not None metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', 'metadata': YARN_PARSER_METADATA, }]) sha1_gits = [ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), ] metadata_indexer.run(sha1_gits, 'update-dups') results = list( metadata_indexer.idx_storage. revision_intrinsic_metadata_get(sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], }] for result in results: del result['tool']['id'] # then self.assertEqual(expected_results, results) def test_revision_metadata_indexer_single_root_dir(self): metadata_indexer = RevisionMetadataIndexer( config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the revision rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') - subdir_id = metadata_indexer.storage._revisions[rev_id]['directory'] - metadata_indexer.storage._revisions[rev_id]['directory'] = b'123456' + rev = metadata_indexer.storage._revisions[rev_id] + subdir_id = rev.directory + rev.directory = b'123456' metadata_indexer.storage.directory_add([{ 'id': b'123456', 'entries': [{ - 'target': subdir_id, - 'type': 'dir', - 'length': None, 'name': b'foobar-1.0.0', - 'sha1': None, + 'type': 'dir', + 'target': subdir_id, 'perms': 16384, - 'sha1_git': None, - 'status': None, - 'sha256': None }], }]) tool = metadata_indexer.idx_storage.indexer_configuration_get( {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) assert tool is not None metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', 'metadata': YARN_PARSER_METADATA, }]) sha1_gits = [ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), ] metadata_indexer.run(sha1_gits, 'update-dups') results = list( metadata_indexer.idx_storage. 
revision_intrinsic_metadata_get(sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], }] for result in results: del result['tool']['id'] # then self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index a22de1b..f09927e 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,761 +1,749 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import datetime import functools import random import unittest from hypothesis import strategies from swh.model import hashutil from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.indexer.storage import INDEXER_CFG_KEY BASE_TEST_CONFIG = { 'storage': { 'cls': 'memory', 'args': { }, }, 'objstorage': { 'cls': 'memory', 'args': { }, }, INDEXER_CFG_KEY: { 'cls': 'memory', 'args': { }, }, } ORIGINS = [ { - 'lister': None, - 'project': None, 'type': 'git', 'url': 'https://github.com/SoftwareHeritage/swh-storage'}, { - 'lister': None, - 'project': None, 'type': 'ftp', 'url': 'rsync://ftp.gnu.org/gnu/3dldf'}, { - 'lister': None, - 'project': None, 'type': 'deposit', 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'}, { - 'lister': None, - 'project': None, 'type': 'pypi', 'url': 'https://pypi.org/project/limnoria/'}, { - 'lister': None, - 'project': None, 'type': 'svn', 'url': 'http://0-512-md.googlecode.com/svn/'}, { - 'lister': None, - 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}, { - 'lister': None, - 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser.git'}, ] SNAPSHOTS = [ { 'origin': 'https://github.com/SoftwareHeritage/swh-storage', 'branches': { b'refs/heads/add-revision-origin-cache': { 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' b's\xe7/\xe9l\x1e', 'target_type': 'revision'}, b'HEAD': { 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' b'\xac\xefrm', 'target_type': 'revision'}, b'refs/tags/v0.0.103': { 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b'\x0f\xdd', 'target_type': 'release'}, }}, { 'origin': 'rsync://ftp.gnu.org/gnu/3dldf', 'branches': { b'3DLDF-1.1.4.tar.gz': { 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' b'"G\x99\x11', 'target_type': 'revision'}, b'3DLDF-2.0.2.tar.gz': { 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', 'target_type': 'revision'}, b'3DLDF-2.0.3-examples.tar.gz': { 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' b'\xfe\xadZ\x80\x80\xc1\x83\xff', 'target_type': 'revision'}, b'3DLDF-2.0.3.tar.gz': { 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' b'\xcc\x1a\xb4`\x8c\x8by', 'target_type': 'revision'}, b'3DLDF-2.0.tar.gz': { 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' b'\xd3\xd1m', - b'target_type': 'revision'} + 'target_type': 'revision'} }}, { 'origin': 'https://forge.softwareheritage.org/source/jesuisgpl/', 'branches': { b'master': { 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', 'target_type': 'revision'} }, 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r "}, { 'origin': 'https://pypi.org/project/limnoria/', 
'branches': { b'HEAD': { 'target': b'releases/2018.09.09', 'target_type': 'alias'}, b'releases/2018.09.01': { 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' b'\xbb\xdfF\xfdw\xcf', 'target_type': 'revision'}, b'releases/2018.09.09': { 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', 'target_type': 'revision'}}, 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' b'\x12\x9e\xd6\xb3'}, { 'origin': 'http://0-512-md.googlecode.com/svn/', 'branches': { b'master': { 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', 'target_type': 'revision'}}, 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' b'\x05\xea\xb8\x1f\xc4H\xf4s'}, { 'origin': 'https://github.com/librariesio/yarn-parser', 'branches': { b'HEAD': { 'target': hash_to_bytes( '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'target_type': 'revision'}}}, { 'origin': 'https://github.com/librariesio/yarn-parser.git', 'branches': { b'HEAD': { 'target': hash_to_bytes( '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'target_type': 'revision'}}}, ] REVISIONS = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), + 'message': 'Improve search functionality', 'author': { - 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'committer': { - 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, + 'committer_date': { + 'negative_utc': None, + 'offset': 120, + 'timestamp': { + 'microseconds': 0, + 'seconds': 1380883849 + } + }, + 'type': 'git', 'synthetic': False, 'date': { 'negative_utc': False, 'timestamp': { 'seconds': 1487596456, 'microseconds': 0 }, 'offset': 0 }, 'directory': b'10' }] DIRECTORY_ID = b'10' -DIRECTORY = [{ - 'sha1_git': b'abc', +DIRECTORY_ENTRIES = [{ 'name': b'index.js', - 'target': b'abc', - 'length': 897, - 'status': 'visible', 'type': 'file', + 'target': b'abc', 'perms': 33188, - 'sha1': b'bcd' }, { - 'sha1_git': b'aab', 'name': b'package.json', - 'target': b'aab', - 'length': 712, - 'status': 'visible', 'type': 'file', + 'target': b'cde', 'perms': 33188, - 'sha1': b'cde' }, { - 'target': b'11', - 'type': 'dir', - 'length': None, 'name': b'.github', - 'sha1': None, + 'type': 'dir', + 'target': b'11', 'perms': 16384, - 'sha1_git': None, - 'status': None, - 'sha256': None } ] SHA1_TO_LICENSES = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'], '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'], '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'], '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'], 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [], } SHA1_TO_CTAGS = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{ 'name': 'foo', 'kind': 'str', 'line': 10, 'lang': 'bar', }], 'd4c647f0fc257591cc9ba1722484229780d1c607': [{ 'name': 'let', 'kind': 'int', 'line': 100, 'lang': 'haskell', }], '688a5ef812c53907562fe379d4b3851e69c7cb15': [{ 'name': 'symbol', 'kind': 'float', 'line': 99, 'lang': 'python', }], } OBJ_STORAGE_DATA = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def 
indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', - '636465': b""" + # 626364 + hash_to_hex(b'bcd'): b'unimportant content for bcd', + # 636465 + hash_to_hex(b'cde'): b""" { "name": "yarn-parser", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "main": "index.js", "scripts": { "start": "node index.js", "test": "mocha" }, "engines": { "node": "9.8.0" }, "repository": { "type": "git", "url": "git+https://github.com/librariesio/yarn-parser.git" }, "keywords": [ "yarn", "parse", "lock", "dependencies" ], "author": "Andrew Nesbitt", "license": "AGPL-3.0", "bugs": { "url": "https://github.com/librariesio/yarn-parser/issues" }, "homepage": "https://github.com/librariesio/yarn-parser#readme", "dependencies": { "@yarnpkg/lockfile": "^1.0.0", "body-parser": "^1.15.2", "express": "^4.14.0" }, "devDependencies": { "chai": "^4.1.2", "mocha": "^5.2.0", "request": "^2.87.0", "test": "^0.6.0" } } + """ } - YARN_PARSER_METADATA = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'codeRepository': 'git+git+https://github.com/librariesio/yarn-parser.git', 'author': [{ 'type': 'Person', 'name': 'Andrew Nesbitt' }], 'license': 'https://spdx.org/licenses/AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], + 'type': 'SoftwareSourceCode', } json_dict_keys = strategies.one_of( strategies.characters(), *map(strategies.just, ['type', 'url', 'name', 'email', '@id', '@context', 'repository', 'license', 'repositories', 'licenses' ]), ) """Hypothesis strategy that generates strings, with an emphasis on those that are often used as dictionary keys in metadata files.""" generic_json_document = strategies.recursive( strategies.none() | strategies.booleans() | strategies.floats() | strategies.characters(), lambda children: ( strategies.lists(children, 1) | strategies.dictionaries(json_dict_keys, children, min_size=1) ) ) """Hypothesis strategy that generates possible values for values of JSON metadata files.""" def 
json_document_strategy(keys=None):
    """Generates a hypothesis strategy that generates metadata files
    for a JSON-based format that uses the given keys."""
    if keys is None:
        keys = strategies.characters()
    else:
        keys = strategies.one_of(map(strategies.just, keys))

    return strategies.dictionaries(keys, generic_json_document, min_size=1)


def _tree_to_xml(root, xmlns, data):
    def encode(s):
        "Skips unpaired surrogates generated by json_document_strategy"
        return s.encode('utf8', 'replace')

    def to_xml(data, indent=b' '):
        if data is None:
            return b''
        elif isinstance(data, (bool, str, int, float)):
            return indent + encode(str(data))
        elif isinstance(data, list):
            return b'\n'.join(to_xml(v, indent=indent) for v in data)
        elif isinstance(data, dict):
            lines = []
            for (key, value) in data.items():
                lines.append(indent + encode('<{}>'.format(key)))
                lines.append(to_xml(value, indent=indent+b' '))
                lines.append(indent + encode('</{}>'.format(key)))
            return b'\n'.join(lines)
        else:
            raise TypeError(data)

    return b'\n'.join([
        '<{} xmlns="{}">'.format(root, xmlns).encode(),
        to_xml(data),
        '</{}>'.format(root).encode(),
    ])


class TreeToXmlTest(unittest.TestCase):
    def test_leaves(self):
        self.assertEqual(
            _tree_to_xml('root', 'http://example.com', None),
            b'<root xmlns="http://example.com">\n\n</root>'
        )
        self.assertEqual(
            _tree_to_xml('root', 'http://example.com', True),
            b'<root xmlns="http://example.com">\n True\n</root>'
        )
        self.assertEqual(
            _tree_to_xml('root', 'http://example.com', 'abc'),
            b'<root xmlns="http://example.com">\n abc\n</root>'
        )
        self.assertEqual(
            _tree_to_xml('root', 'http://example.com', 42),
            b'<root xmlns="http://example.com">\n 42\n</root>'
        )
        self.assertEqual(
            _tree_to_xml('root', 'http://example.com', 3.14),
            b'<root xmlns="http://example.com">\n 3.14\n</root>'
        )

    def test_dict(self):
        self.assertIn(
            _tree_to_xml('root', 'http://example.com', {
                'foo': 'bar',
                'baz': 'qux'
            }),
            [
                b'<root xmlns="http://example.com">\n'
                b' <foo>\n  bar\n </foo>\n'
                b' <baz>\n  qux\n </baz>\n'
                b'</root>',
                b'<root xmlns="http://example.com">\n'
                b' <baz>\n  qux\n </baz>\n'
                b' <foo>\n  bar\n </foo>\n'
                b'</root>'
            ]
        )

    def test_list(self):
        self.assertEqual(
            _tree_to_xml('root', 'http://example.com', [
                {'foo': 'bar'},
                {'foo': 'baz'},
            ]),
            b'<root xmlns="http://example.com">\n'
            b' <foo>\n  bar\n </foo>\n'
            b' <foo>\n  baz\n </foo>\n'
            b'</root>'
        )


def xml_document_strategy(keys, root, xmlns):
    """Generates a hypothesis strategy that generates metadata files
    for an XML format that uses the given keys."""
    return strategies.builds(
        functools.partial(_tree_to_xml, root, xmlns),
        json_document_strategy(keys))


def filter_dict(d, keys):
    'return a copy of the dict with keys deleted'
    if not isinstance(keys, (list, tuple)):
        keys = (keys, )
    return dict((k, v) for (k, v) in d.items() if k not in keys)


def fill_obj_storage(obj_storage):
    """Add some content in an object storage."""
    for (obj_id, content) in OBJ_STORAGE_DATA.items():
        obj_storage.add(content, obj_id=hash_to_bytes(obj_id))


def fill_storage(storage):
    for origin in ORIGINS:
        storage.origin_add_one(origin)
    for snap in SNAPSHOTS:
        origin_url = snap['origin']
        visit = storage.origin_visit_add(origin_url, datetime.datetime.now())
        snap_id = snap.get('id') or \
            bytes([random.randint(0, 255) for _ in range(32)])
        storage.snapshot_add([{
            'id': snap_id,
            'branches': snap['branches']
        }])
        storage.origin_visit_update(
            origin_url, visit['visit'], status='full', snapshot=snap_id)
    storage.revision_add(REVISIONS)
-    storage.directory_add([{
-        'id': DIRECTORY_ID,
-        'entries': DIRECTORY,
-    }])
+
+    contents = []
    for (obj_id, content) in OBJ_STORAGE_DATA.items():
        content_hashes = hashutil.MultiHash.from_data(content).digest()
-        storage.content_add([{
+        contents.append({
            'data': content,
            'length': len(content),
            'status': 'visible',
            'sha1': hash_to_bytes(obj_id),
            'sha1_git': hash_to_bytes(obj_id),
            'sha256': content_hashes['sha256'],
            'blake2s256': content_hashes['blake2s256']
-        }])
+        })
+    storage.content_add(contents)
+
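    # The contents are now accumulated and inserted through a single batched
    # content_add() call, and the test directory is only added afterwards
    # (see the directory_add call that follows).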
storage.directory_add([{ + 'id': DIRECTORY_ID, + 'entries': DIRECTORY_ENTRIES, + }]) class CommonContentIndexerTest(metaclass=abc.ABCMeta): legacy_get_format = False """True if and only if the tested indexer uses the legacy format. see: https://forge.softwareheritage.org/T1433 """ def get_indexer_results(self, ids): """Override this for indexers that don't have a mock storage.""" return self.indexer.idx_storage.state def assert_legacy_results_ok(self, sha1s, expected_results=None): # XXX old format, remove this when all endpoints are # updated to the new one # see: https://forge.softwareheritage.org/T1433 sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual(len(expected_results), len(actual_results), (expected_results, actual_results)) for indexed_data in actual_results: _id = indexed_data['id'] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data['id'] = _id self.assertEqual(indexed_data, expected_data) def assert_results_ok(self, sha1s, expected_results=None): if self.legacy_get_format: self.assert_legacy_results_ok(sha1s, expected_results) return sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual(len(expected_results), len(actual_results), (expected_results, actual_results)) for indexed_data in actual_results: (_id, indexed_data) = list(indexed_data.items())[0] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data = [expected_data] self.assertEqual(indexed_data, expected_data) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update='update-dups') self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update='ignore-dups') self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [self.id1, '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. 
""" def setUp(self): self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results actual_results = list(actual_results) for indexed_data in actual_results: _id = indexed_data['id'] assert isinstance(_id, bytes) indexed_data = indexed_data.copy() indexed_data['id'] = hash_to_hex(indexed_data['id']) self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)]) self.assertTrue(start <= _id <= end) _tool_id = indexed_data['indexer_configuration_id'] self.assertEqual(_tool_id, self.indexer.tool['id']) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = list(self.indexer._index_contents( start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( start, end, indexed=set(map(hash_to_bytes, data_indexed))) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok( start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing should result in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end) # then self.assertTrue(actual_results) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( # checks the bytes input this time start, end, skip_existing=False) # no already indexed data so same result as prior test # then self.assertTrue(actual_results) def test_generate_content_get_no_result(self): """No result indexed returns False""" _start, _end = ['0000000000000000000000000000000000000000', '0000000000000000000000000000000000000001'] start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( start, end, incremental=False) # then self.assertFalse(actual_results) diff --git a/version.txt b/version.txt index 155ff3a..bfd45df 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.152-0-g44234a2 \ No newline at end of file +v0.0.153-0-g12801f8 \ No newline at end of file