diff --git a/swh/indexer/data/package-json/CITATION b/swh/indexer/data/package-json/CITATION new file mode 100644 index 0000000..52a13c0 --- /dev/null +++ b/swh/indexer/data/package-json/CITATION @@ -0,0 +1 @@ +swh:1:dir:49dd6f75450a37243dfcc4b418ca5bf5e0010748;origin=https://github.com/Bartvds/package.json-schema diff --git a/swh/indexer/data/package-json/LICENSE b/swh/indexer/data/package-json/LICENSE new file mode 100644 index 0000000..3651abe --- /dev/null +++ b/swh/indexer/data/package-json/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2014 Bart van der Schoor + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/swh/indexer/data/package-json/schema.json b/swh/indexer/data/package-json/schema.json new file mode 100644 index 0000000..e5f799f --- /dev/null +++ b/swh/indexer/data/package-json/schema.json @@ -0,0 +1,377 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema", + "id": "lib://package.json", + "title": "package.json-schema", + "description": "JSON Schema for node/npm package.json", + "$ref": "lib://package.json#/definitions/standard", + "definitions": { + "minimal": { + "allOf": [ + { + "$ref": "lib://package.json#/definitions/structure" + }, + { + "required": [ + "name", + "version" + ] + } + ] + }, + "standard": { + "allOf": [ + { + "$ref": "lib://package.json#/definitions/structure" + }, + { + "required": [ + "name", + "version", + "description", + "keywords", + "author", + "homepage", + "repository", + "bugs", + "licenses", + "engines", + "main", + "scripts", + "dependencies", + "devDependencies" + ], + "properties": { + "scripts": { + "type": "object", + "properties": { + "test": { + "type" : "string", + "pattern": "[a-zA-Z]" + } + } + }, + "author": { + "$ref": "lib://package.json#/definitions/person-object" + }, + "contributors": { + "type": "array", + "items": { + "$ref": "lib://package.json#/definitions/person-object" + } + }, + "maintainers": { + "type": "array", + "items": { + "$ref": "lib://package.json#/definitions/person-object" + } + } + } + } + ] + }, + "structure": { + "type": "object", + "properties": { + "name": { + "$ref": "lib://package.json#/definitions/name" + }, + "version": { + "$ref": "lib://package.json#/definitions/semver" + }, + "description": { + "type": "string", + "minLength": 1 + }, + "keywords": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/name" + } + }, + "author": { + "$ref": "lib://package.json#/definitions/person" + }, + "contributors": { + "type": "array", + "uniqueItems": true, + "items": { + 
"$ref": "lib://package.json#/definitions/person" + } + }, + "maintainers": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/person" + } + }, + "homepage": { + "$ref": "lib://package.json#/definitions/uri-http" + }, + "repository": { + "$ref": "lib://package.json#/definitions/repository" + }, + "man": { + "oneOf": [ + { + "$ref": "lib://package.json#/definitions/path" + }, + { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/path" + } + } + ] + }, + "bugs": { + "oneOf": [ + { + "$ref": "lib://package.json#/definitions/uri-http" + }, + { + "type": "object", + "required": [ + "url" + ], + "properties": { + "url": { + "$ref": "lib://package.json#/definitions/uri-http" + }, + "email": { + "$ref": "lib://package.json#/definitions/email" + } + } + } + ] + }, + "license": { + "$ref": "lib://package.json#/definitions/licence" + }, + "licenses": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/licence" + } + }, + "private": { + "type": "boolean" + }, + "preferGlobal": { + "type": "boolean" + }, + "engines": { + "$ref": "lib://package.json#/definitions/string-map" + }, + "engineStrict": { + "type": "boolean" + }, + "main": { + "$ref": "lib://package.json#/definitions/path" + }, + "bin": { + "oneOf": [ + { + "$ref": "lib://package.json#/definitions/path" + }, + { + + "$ref": "lib://package.json#/definitions/path-map" + } + ] + }, + "files": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/path" + } + }, + "os": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/identifier" + } + }, + "cpu": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/identifier" + } + }, + "config": { + "type": "object" + }, + "publishConfig": { + "type": "object" + }, + "directories": { + "type": "object", + "properties": { + "lib": { + "$ref": "lib://package.json#/definitions/path" + }, + "bin": { + "$ref": "lib://package.json#/definitions/path" + }, + "man": { + "$ref": "lib://package.json#/definitions/path" + }, + "doc": { + "$ref": "lib://package.json#/definitions/path" + }, + "example": { + "$ref": "lib://package.json#/definitions/path" + } + } + }, + "scripts": { + "$ref": "lib://package.json#/definitions/string-map" + }, + "dependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "devDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "bundledDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "bundleDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "optionalDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "peerDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + } + } + }, + "uri-http": { + "type": "string", + "pattern": "^https?:\/\/" + }, + "email": { + "type": "string", + "pattern": "^([0-9a-zA-Z]([-\\.\\w]*[0-9a-zA-Z])*@([0-9a-zA-Z][-\\w]*[0-9a-zA-Z]\\.)+[a-zA-Z]{2,9})$" + }, + "path": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "pattern": "^[A-Za-z](?:[_\\.-]?[A-Za-z0-9]+)*$" + }, + "identifier": { + "type": "string", + "pattern": "^[A-Za-z](?:[_-]?[A-Za-z0-9]+)*$" + }, + "semver": { + "type": "string", + "pattern": "^\\d+\\.\\d+\\.\\d+(?:-[a-z]+(?:[_\\.-]*[a-z0-9]+)*)*$" + }, + "type-url": { + "type": 
"object", + "additionalProperties": false, + "required": [ + "type", + "url" + ], + "properties": { + "type": { + "type": "string", + "pattern": "[a-zA-Z]" + }, + "url": { + "$ref": "lib://package.json#/definitions/uri-http" + } + } + + }, + "repository": { + "$ref": "lib://package.json#/definitions/type-url" + }, + "licence": { + "oneOf": [ + { + "type": "string", + "pattern": "[a-zA-Z]" + }, + { + "$ref": "lib://package.json#/definitions/licence-object" + } + ] + }, + "licence-object": { + "type": "object", + "additionalProperties": false, + "properties": { + "type": { + "type": "string", + "pattern": "[a-zA-Z]" + }, + "url": { + "$ref": "lib://package.json#/definitions/uri-http" + } + } + }, + "person": { + "oneOf": [ + { + "type": "string", + "pattern": "[a-zA-Z]" + }, + { + "$ref": "lib://package.json#/definitions/person-object" + } + ] + }, + "person-object": { + "type": "object", + "required": [ + "name" + ], + "properties": { + "name": { + "type": "string", + "pattern": "[a-zA-Z]" + }, + "email": { + "$ref": "lib://package.json#/definitions/email" + }, + "url": { + "$ref": "lib://package.json#/definitions/uri-http" + } + } + }, + "string-map": { + "type": "object", + "additionalProperties": false, + "patternProperties": { + ".+": { + "type": "string" + } + } + }, + "path-map": { + "type": "object", + "additionalProperties": false, + "patternProperties": { + ".+": { + "$ref": "lib://package.json#/definitions/path", + "pattern": "[a-zA-Z]" + } + } + }, + "dependency-map": { + "$ref": "lib://package.json#/definitions/string-map" + } + } +} diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 836d77d..8b2df2d 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,335 +1,335 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import logging from copy import deepcopy from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ # Note: This used when the content metadata indexer is used alone # (not the case for example in the case of the RevisionMetadataIndexer) CONFIG_BASE_FILENAME = 'indexer/content_metadata' def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data, log_suffix='unknown revision'): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. 
            If the translation wasn't successful, None is returned
            instead.

        """
        result = {
            'id': id,
            'indexer_configuration_id': self.tool['id'],
            'translated_metadata': None
        }
        try:
            mapping_name = self.tool['tool_configuration']['context']
            log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id)
            result['translated_metadata'] = \
                MAPPINGS[mapping_name](log_suffix).translate(data)
        except Exception:
            self.log.exception(
                "Problem during metadata translation "
                "for content %s" % hashutil.hash_to_hex(id))
        if result['translated_metadata'] is None:
            return None
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_metadata, dict with the
              following keys:

              - id (bytes): content's identifier (sha1)
              - translated_metadata (jsonb): detected metadata

            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
              respectively update duplicates or ignore them

        """
        self.idx_storage.content_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))


class RevisionMetadataIndexer(RevisionIndexer):
    """Revision-level indexer

    This indexer is in charge of:

    - filtering out revisions already indexed in the revision_metadata table
      with the configured computation tool
    - retrieving all entry files in the root directory
    - running the metadata detector on file names that may contain metadata
    - computing the metadata translation if necessary and possible
      (depends on the tool)
    - sending sha1s to content indexing if possible
    - storing the results for the revision

    """
    CONFIG_BASE_FILENAME = 'indexer/revision_metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('dict', {
            'name': 'swh-metadata-detector',
            'version': '0.0.2',
            'configuration': {
                'type': 'local',
                'context': list(MAPPINGS),
            },
        }),
    }

    def filter(self, sha1_gits):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.idx_storage.revision_metadata_missing((
            {
                'id': sha1_git,
                'indexer_configuration_id': self.tool['id'],
            } for sha1_git in sha1_gits
        ))

    def index(self, rev):
        """Index rev by processing it and organizing result.

        Uses metadata_detector to iterate on filenames:

        - if one filename is detected -> the file is sent to the content
          indexer
        - if multiple files are detected -> translation is needed at the
          revision level

        Args:
            rev (dict): revision artifact from storage

        Returns:
            dict: dictionary representing a revision_metadata, with keys:

            - id (str): rev's identifier (sha1_git)
            - indexer_configuration_id (bytes): tool used
            - translated_metadata: dict of retrieved metadata

        """
        result = {
            'id': rev['id'],
            'indexer_configuration_id': self.tool['id'],
            'mappings': None,
            'translated_metadata': None
        }

        try:
            root_dir = rev['directory']
            dir_ls = self.storage.directory_ls(root_dir, recursive=False)
            files = [entry for entry in dir_ls if entry['type'] == 'file']
            detected_files = detect_metadata(files)
            (mappings, metadata) = self.translate_revision_metadata(
                detected_files,
                log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']))
            result['mappings'] = mappings
            result['translated_metadata'] = metadata
        except Exception as e:
            self.log.exception(
                'Problem when indexing rev: %r', e)
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of revision_metadata, dict with the
              following keys:

              - id (bytes): revision's identifier (sha1_git)
              - mappings ([str]): list of mappings used
              - translated_metadata (jsonb): detected metadata

            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
              respectively update duplicates or ignore them

        """
        # TODO: add functions in storage to keep data in revision_metadata
        self.idx_storage.revision_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))

    def translate_revision_metadata(self, detected_files, log_suffix):
        """
        Determine how to translate metadata when one or multiple
        metadata files are detected:

        Args:
            detected_files (dict): dictionary mapping context names (e.g.,
              "npm", "authors") to list of sha1

        Returns:
            (List[str], dict): list of mappings used and dict with
            translated metadata according to the CodeMeta vocabulary

        """
        used_mappings = [MAPPINGS[context].name for context in detected_files]
        translated_metadata = []
        tool = {
            'name': 'swh-metadata-translator',
            'version': '0.0.2',
            'configuration': {
                'type': 'local',
                'context': None
            },
        }
        # TODO: iterate on each context, on each file
        # -> get raw_contents
        # -> translate each content
        config = {
            k: self.config[k]
            for k in [INDEXER_CFG_KEY, 'objstorage', 'storage']
        }
        config['tools'] = [tool]
        for context in detected_files.keys():
            cfg = deepcopy(config)
            cfg['tools'][0]['configuration']['context'] = context
            c_metadata_indexer = ContentMetadataIndexer(config=cfg)
            # sha1s that are in content_metadata table
            sha1s_in_storage = []
            metadata_generator = self.idx_storage.content_metadata_get(
                detected_files[context])
            for c in metadata_generator:
                # extracting translated_metadata
                sha1 = c['id']
                sha1s_in_storage.append(sha1)
                local_metadata = c['translated_metadata']
                # local metadata is aggregated
                if local_metadata:
                    translated_metadata.append(local_metadata)

            sha1s_filtered = [item for item in detected_files[context]
                              if item not in sha1s_in_storage]

            if sha1s_filtered:
                # content indexing
                try:
                    c_metadata_indexer.run(sha1s_filtered,
                                           policy_update='ignore-dups',
                                           log_suffix=log_suffix)
                    # on the fly possibility:
                    for result in c_metadata_indexer.results:
                        local_metadata = result['translated_metadata']
                        translated_metadata.append(local_metadata)

                except Exception:
                    self.log.exception(
                        "Exception while indexing metadata on contents")

        # transform translated_metadata into min set with swh-metadata-detector
        min_metadata = extract_minimal_metadata_dict(translated_metadata)
        return (used_mappings, min_metadata)


class OriginMetadataIndexer(OriginIndexer):
    CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('list', [])
    }

    USE_TOOLS = False

    def __init__(self):
        super().__init__()
        self.origin_head_indexer = OriginHeadIndexer()
        self.revision_metadata_indexer = RevisionMetadataIndexer()

    def index_list(self, origins):
        head_rev_ids = []
        for origin in origins:
            head_result = self.origin_head_indexer.index(origin)
            if not head_result:
                continue
            head_rev_ids.append(head_result['revision_id'])
        head_revs = list(self.storage.revision_get(head_rev_ids))
        assert len(head_revs) == len(head_rev_ids)

        results = []
-        for (orig, rev) in zip(origins, head_revs):
+        for (origin, rev) in zip(origins, head_revs):
            if not rev:
                self.log.warning('Missing head revision of origin %r', origin)
                continue

            rev_metadata = self.revision_metadata_indexer.index(rev)
            orig_metadata = {
                'from_revision': rev_metadata['id'],
                'origin_id': origin['id'],
                'metadata': rev_metadata['translated_metadata'],
                'mappings': rev_metadata['mappings'],
                'indexer_configuration_id':
                    rev_metadata['indexer_configuration_id'],
            }
            results.append((orig_metadata, rev_metadata))
        return results

    def persist_index_computations(self, results, policy_update):
        conflict_update = (policy_update == 'update-dups')

        # Deduplicate revisions
        rev_metadata = []
        orig_metadata = []
        for (orig_item, rev_item) in results:
            if rev_item not in rev_metadata:
                rev_metadata.append(rev_item)
            if orig_item not in orig_metadata:
                orig_metadata.append(orig_item)

        self.idx_storage.revision_metadata_add(
            rev_metadata, conflict_update=conflict_update)
        self.idx_storage.origin_intrinsic_metadata_add(
            orig_metadata, conflict_update=conflict_update)


@click.command()
@click.option('--revs', '-i',
              help='Default sha1_git to lookup', multiple=True)
def main(revs):
    _git_sha1s = list(map(hashutil.hash_to_bytes, revs))
    rev_metadata_indexer = RevisionMetadataIndexer()
    rev_metadata_indexer.run(_git_sha1s, 'update-dups')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 2d5e653..fb72a5a 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,163 +1,188 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import pytest
from unittest.mock import patch

from swh.model.hashutil import hash_to_bytes

from swh.indexer.metadata import OriginMetadataIndexer

from .utils import BASE_TEST_CONFIG, YARN_PARSER_METADATA
from .test_metadata import REVISION_METADATA_CONFIG

ORIGIN_HEAD_CONFIG = {
    **BASE_TEST_CONFIG,
    'tools': {
        'name': 'origin-metadata',
        'version': '0.0.1',
        'configuration': {},
    },
    'tasks': {
        'revision_metadata': 'revision_metadata',
        'origin_intrinsic_metadata': 'origin_intrinsic_metadata',
    }
}


@pytest.fixture
def origin_metadata_indexer():
    prefix = 'swh.indexer.'
suffix = '.parse_config_file' with patch(prefix + 'metadata.OriginMetadataIndexer' + suffix) as omi, \ patch(prefix + 'origin_head.OriginHeadIndexer' + suffix) as ohi, \ patch(prefix + 'metadata.RevisionMetadataIndexer' + suffix) as rmi: omi.return_value = BASE_TEST_CONFIG ohi.return_value = ORIGIN_HEAD_CONFIG rmi.return_value = REVISION_METADATA_CONFIG yield OriginMetadataIndexer() def test_origin_metadata_indexer( idx_storage, storage, obj_storage, origin_metadata_indexer): indexer = OriginMetadataIndexer() indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'translated_metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'origin_id': origin['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list(indexer.idx_storage.revision_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) for result in results: del result['tool'] assert results == [origin_metadata] -def test_origin_metadata_indexer_duplicates( +def test_origin_metadata_indexer_duplicate_origin( idx_storage, storage, obj_storage, origin_metadata_indexer): indexer = OriginMetadataIndexer() indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["git+https://github.com/librariesio/yarn-parser"]) indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list(indexer.idx_storage.revision_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert len(results) == 1 def test_origin_metadata_indexer_missing_head( idx_storage, storage, obj_storage, origin_metadata_indexer): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer() indexer.run(["git+https://example.com"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( idx_storage, storage, obj_storage, origin_metadata_indexer): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer() indexer.run(["git+https://example.com", "git+https://github.com/librariesio/yarn-parser"]) origin1 = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) origin2 = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'translated_metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'origin_id': origin2['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list(indexer.idx_storage.revision_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin1['id'], origin2['id']])) for result in results: del result['tool'] assert results == 
[origin_metadata] + + +def test_origin_metadata_indexer_duplicate_revision( + idx_storage, storage, obj_storage, origin_metadata_indexer): + indexer = OriginMetadataIndexer() + indexer.storage = storage + indexer.idx_storage = idx_storage + indexer.run(["git+https://github.com/librariesio/yarn-parser", + "git+https://github.com/librariesio/yarn-parser.git"]) + + origin1 = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + origin2 = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser.git'}) + assert origin1['id'] != origin2['id'] + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') + + results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + assert len(results) == 1 + + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin1['id'], origin2['id']])) + assert len(results) == 2 diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index c19bb7e..2cfc437 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,666 +1,678 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import datetime import hashlib import random from hypothesis import strategies from swh.model import hashutil from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.indexer.storage import INDEXER_CFG_KEY BASE_TEST_CONFIG = { 'storage': { 'cls': 'memory', 'args': { }, }, 'objstorage': { 'cls': 'memory', 'args': { }, }, INDEXER_CFG_KEY: { 'cls': 'memory', 'args': { }, }, } ORIGINS = [ { 'id': 52189575, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/SoftwareHeritage/swh-storage'}, { 'id': 4423668, 'lister': None, 'project': None, 'type': 'ftp', 'url': 'rsync://ftp.gnu.org/gnu/3dldf'}, { 'id': 77775770, 'lister': None, 'project': None, 'type': 'deposit', 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'}, { 'id': 85072327, 'lister': None, 'project': None, 'type': 'pypi', 'url': 'https://pypi.org/project/limnoria/'}, { 'id': 49908349, 'lister': None, 'project': None, 'type': 'svn', 'url': 'http://0-512-md.googlecode.com/svn/'}, { 'id': 54974445, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}, + { + 'id': 54974446, + 'lister': None, + 'project': None, + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser.git'}, ] SNAPSHOTS = { 52189575: { 'branches': { b'refs/heads/add-revision-origin-cache': { 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' b's\xe7/\xe9l\x1e', 'target_type': 'revision'}, b'HEAD': { 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' b'\xac\xefrm', 'target_type': 'revision'}, b'refs/tags/v0.0.103': { 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b'\x0f\xdd', 'target_type': 'release'}, }}, 4423668: { 'branches': { b'3DLDF-1.1.4.tar.gz': { 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' b'"G\x99\x11', 'target_type': 'revision'}, b'3DLDF-2.0.2.tar.gz': { 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', 'target_type': 'revision'}, b'3DLDF-2.0.3-examples.tar.gz': { 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' b'\xfe\xadZ\x80\x80\xc1\x83\xff', 'target_type': 'revision'}, b'3DLDF-2.0.3.tar.gz': { 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' 
b'\xcc\x1a\xb4`\x8c\x8by', 'target_type': 'revision'}, b'3DLDF-2.0.tar.gz': { 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' b'\xd3\xd1m', b'target_type': 'revision'} }}, 77775770: { 'branches': { b'master': { 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', 'target_type': 'revision'} }, 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r "}, 85072327: { 'branches': { b'HEAD': { 'target': b'releases/2018.09.09', 'target_type': 'alias'}, b'releases/2018.09.01': { 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' b'\xbb\xdfF\xfdw\xcf', 'target_type': 'revision'}, b'releases/2018.09.09': { 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', 'target_type': 'revision'}}, 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' b'\x12\x9e\xd6\xb3'}, 49908349: { 'branches': { b'master': { 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', 'target_type': 'revision'}}, 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' b'\x05\xea\xb8\x1f\xc4H\xf4s'}, 54974445: { 'branches': { b'HEAD': { 'target': hash_to_bytes( '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), - 'target_type': 'revision'}}} + 'target_type': 'revision'}}}, + 54974446: { + 'branches': { + b'HEAD': { + 'target': hash_to_bytes( + '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), + 'target_type': 'revision'}}}, } REVISIONS = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'author': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'committer': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'synthetic': False, 'date': { 'negative_utc': False, 'timestamp': { 'seconds': 1487596456, 'microseconds': 0 }, 'offset': 0 }, 'directory': b'10' }] DIRECTORY_ID = b'10' DIRECTORY = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'cde' }, { 'target': b'11', 'type': 'dir', 'length': None, 'name': b'.github', 'sha1': None, 'perms': 16384, 'sha1_git': None, 'status': None, 'sha256': None } ] SHA1_TO_LICENSES = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'], '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'], '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'], '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'], 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [], } SHA1_TO_CTAGS = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{ 'name': 'foo', 'kind': 'str', 'line': 10, 'lang': 'bar', }], 'd4c647f0fc257591cc9ba1722484229780d1c607': [{ 'name': 'let', 'kind': 'int', 'line': 100, 'lang': 'haskell', }], '688a5ef812c53907562fe379d4b3851e69c7cb15': [{ 'name': 'symbol', 'kind': 'float', 'line': 99, 'lang': 'python', }], } OBJ_STORAGE_DATA = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, 
tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', '636465': b""" { "name": "yarn-parser", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "main": "index.js", "scripts": { "start": "node index.js", "test": "mocha" }, "engines": { "node": "9.8.0" }, "repository": { "type": "git", "url": "git+https://github.com/librariesio/yarn-parser.git" }, "keywords": [ "yarn", "parse", "lock", "dependencies" ], "author": "Andrew Nesbitt", "license": "AGPL-3.0", "bugs": { "url": "https://github.com/librariesio/yarn-parser/issues" }, "homepage": "https://github.com/librariesio/yarn-parser#readme", "dependencies": { "@yarnpkg/lockfile": "^1.0.0", "body-parser": "^1.15.2", "express": "^4.14.0" }, "devDependencies": { "chai": "^4.1.2", "mocha": "^5.2.0", "request": "^2.87.0", "test": "^0.6.0" } } """ } YARN_PARSER_METADATA = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'codeRepository': 'git+git+https://github.com/librariesio/yarn-parser.git', 'author': [{ 'type': 'Person', 'name': 'Andrew Nesbitt' }], 'license': 'https://spdx.org/licenses/AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], } json_dict_keys = strategies.one_of( strategies.characters(), *map(strategies.just, ['type', 'url', 'name', 'email', '@id', '@context', 'repository', 'license', ]), ) """Hypothesis strategy that generates strings, with an emphasis on those that are often used as dictionary keys in metadata files.""" generic_json_document = strategies.recursive( strategies.none() | strategies.booleans() | strategies.floats() | strategies.characters(), lambda children: ( strategies.lists(children, 1) | strategies.dictionaries(json_dict_keys, children, min_size=1) ) ) """Hypothesis strategy that generates possible values for values of JSON metadata files.""" def json_document_strategy(keys=None): """Generates an hypothesis strategy that generates metadata files for a format that uses the given keys.""" if keys is None: keys = strategies.characters() else: keys 
= strategies.one_of(map(strategies.just, keys)) return strategies.dictionaries(keys, generic_json_document, min_size=1) def filter_dict(d, keys): 'return a copy of the dict with keys deleted' if not isinstance(keys, (list, tuple)): keys = (keys, ) return dict((k, v) for (k, v) in d.items() if k not in keys) def fill_obj_storage(obj_storage): """Add some content in an object storage.""" for (obj_id, content) in OBJ_STORAGE_DATA.items(): obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) def fill_storage(storage): for origin in ORIGINS: origin = origin.copy() del origin['id'] storage.origin_add_one(origin) for (orig_pseudo_id, snap) in SNAPSHOTS.items(): for orig in ORIGINS: if orig_pseudo_id == orig['id']: origin_id = storage.origin_get( {'type': orig['type'], 'url': orig['url']})['id'] break else: assert False visit = storage.origin_visit_add(origin_id, datetime.datetime.now()) snap_id = snap.get('id') or \ bytes([random.randint(0, 255) for _ in range(32)]) storage.snapshot_add(origin_id, visit['visit'], { 'id': snap_id, 'branches': snap['branches'] }) storage.revision_add(REVISIONS) storage.directory_add([{ 'id': DIRECTORY_ID, 'entries': DIRECTORY, }]) for (obj_id, content) in OBJ_STORAGE_DATA.items(): # TODO: use MultiHash if hasattr(hashlib, 'blake2s'): blake2s256 = hashlib.blake2s(content, digest_size=32).digest() else: # fallback for Python <3.6 blake2s256 = bytes([random.randint(0, 255) for _ in range(32)]) storage.content_add([{ 'data': content, 'length': len(content), 'status': 'visible', 'sha1': hash_to_bytes(obj_id), 'sha1_git': hash_to_bytes(obj_id), 'sha256': hashlib.sha256(content).digest(), 'blake2s256': blake2s256 }]) class CommonContentIndexerTest(metaclass=abc.ABCMeta): legacy_get_format = False """True if and only if the tested indexer uses the legacy format. 
see: https://forge.softwareheritage.org/T1433 """ def get_indexer_results(self, ids): """Override this for indexers that don't have a mock storage.""" return self.indexer.idx_storage.state def assert_legacy_results_ok(self, sha1s, expected_results=None): # XXX old format, remove this when all endpoints are # updated to the new one # see: https://forge.softwareheritage.org/T1433 sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual(len(expected_results), len(actual_results), (expected_results, actual_results)) for indexed_data in actual_results: _id = indexed_data['id'] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data['id'] = _id self.assertEqual(indexed_data, expected_data) def assert_results_ok(self, sha1s, expected_results=None): if self.legacy_get_format: self.assert_legacy_results_ok(sha1s, expected_results) return sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual(len(expected_results), len(actual_results), (expected_results, actual_results)) for indexed_data in actual_results: (_id, indexed_data) = list(indexed_data.items())[0] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data = [expected_data] self.assertEqual(indexed_data, expected_data) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update='update-dups') self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update='ignore-dups') self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [self.id1, '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. 
""" def setUp(self): self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results actual_results = list(actual_results) for indexed_data in actual_results: _id = indexed_data['id'] assert isinstance(_id, bytes) indexed_data = indexed_data.copy() indexed_data['id'] = hash_to_hex(indexed_data['id']) self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)]) self.assertTrue(start <= _id <= end) _tool_id = indexed_data['indexer_configuration_id'] self.assertEqual(_tool_id, self.indexer.tool['id']) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = list(self.indexer._index_contents( start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( start, end, indexed=set(map(hash_to_bytes, data_indexed))) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok( start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing should result in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end) # then self.assertTrue(actual_results) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( # checks the bytes input this time start, end, skip_existing=False) # no already indexed data so same result as prior test # then self.assertTrue(actual_results) def test_generate_content_get_no_result(self): """No result indexed returns False""" _start, _end = ['0000000000000000000000000000000000000000', '0000000000000000000000000000000000000001'] start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( start, end, incremental=False) # then self.assertFalse(actual_results)