diff --git a/swh/indexer/data/package-json/CITATION b/swh/indexer/data/package-json/CITATION
new file mode 100644
index 0000000..52a13c0
--- /dev/null
+++ b/swh/indexer/data/package-json/CITATION
@@ -0,0 +1 @@
+swh:1:dir:49dd6f75450a37243dfcc4b418ca5bf5e0010748;origin=https://github.com/Bartvds/package.json-schema
diff --git a/swh/indexer/data/package-json/LICENSE b/swh/indexer/data/package-json/LICENSE
new file mode 100644
index 0000000..3651abe
--- /dev/null
+++ b/swh/indexer/data/package-json/LICENSE
@@ -0,0 +1,22 @@
+Copyright (c) 2014 Bart van der Schoor
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
diff --git a/swh/indexer/data/package-json/schema.json b/swh/indexer/data/package-json/schema.json
new file mode 100644
index 0000000..e5f799f
--- /dev/null
+++ b/swh/indexer/data/package-json/schema.json
@@ -0,0 +1,377 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema",
+ "id": "lib://package.json",
+ "title": "package.json-schema",
+ "description": "JSON Schema for node/npm package.json",
+ "$ref": "lib://package.json#/definitions/standard",
+ "definitions": {
+ "minimal": {
+ "allOf": [
+ {
+ "$ref": "lib://package.json#/definitions/structure"
+ },
+ {
+ "required": [
+ "name",
+ "version"
+ ]
+ }
+ ]
+ },
+ "standard": {
+ "allOf": [
+ {
+ "$ref": "lib://package.json#/definitions/structure"
+ },
+ {
+ "required": [
+ "name",
+ "version",
+ "description",
+ "keywords",
+ "author",
+ "homepage",
+ "repository",
+ "bugs",
+ "licenses",
+ "engines",
+ "main",
+ "scripts",
+ "dependencies",
+ "devDependencies"
+ ],
+ "properties": {
+ "scripts": {
+ "type": "object",
+ "properties": {
+ "test": {
+ "type" : "string",
+ "pattern": "[a-zA-Z]"
+ }
+ }
+ },
+ "author": {
+ "$ref": "lib://package.json#/definitions/person-object"
+ },
+ "contributors": {
+ "type": "array",
+ "items": {
+ "$ref": "lib://package.json#/definitions/person-object"
+ }
+ },
+ "maintainers": {
+ "type": "array",
+ "items": {
+ "$ref": "lib://package.json#/definitions/person-object"
+ }
+ }
+ }
+ }
+ ]
+ },
+ "structure": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "$ref": "lib://package.json#/definitions/name"
+ },
+ "version": {
+ "$ref": "lib://package.json#/definitions/semver"
+ },
+ "description": {
+ "type": "string",
+ "minLength": 1
+ },
+ "keywords": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/name"
+ }
+ },
+ "author": {
+ "$ref": "lib://package.json#/definitions/person"
+ },
+ "contributors": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/person"
+ }
+ },
+ "maintainers": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/person"
+ }
+ },
+ "homepage": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ },
+ "repository": {
+ "$ref": "lib://package.json#/definitions/repository"
+ },
+ "man": {
+ "oneOf": [
+ {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/path"
+ }
+ }
+ ]
+ },
+ "bugs": {
+ "oneOf": [
+ {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ },
+ {
+ "type": "object",
+ "required": [
+ "url"
+ ],
+ "properties": {
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ },
+ "email": {
+ "$ref": "lib://package.json#/definitions/email"
+ }
+ }
+ }
+ ]
+ },
+ "license": {
+ "$ref": "lib://package.json#/definitions/licence"
+ },
+ "licenses": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/licence"
+ }
+ },
+ "private": {
+ "type": "boolean"
+ },
+ "preferGlobal": {
+ "type": "boolean"
+ },
+ "engines": {
+ "$ref": "lib://package.json#/definitions/string-map"
+ },
+ "engineStrict": {
+ "type": "boolean"
+ },
+ "main": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "bin": {
+ "oneOf": [
+ {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ {
+
+ "$ref": "lib://package.json#/definitions/path-map"
+ }
+ ]
+ },
+ "files": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/path"
+ }
+ },
+ "os": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/identifier"
+ }
+ },
+ "cpu": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/identifier"
+ }
+ },
+ "config": {
+ "type": "object"
+ },
+ "publishConfig": {
+ "type": "object"
+ },
+ "directories": {
+ "type": "object",
+ "properties": {
+ "lib": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "bin": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "man": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "doc": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "example": {
+ "$ref": "lib://package.json#/definitions/path"
+ }
+ }
+ },
+ "scripts": {
+ "$ref": "lib://package.json#/definitions/string-map"
+ },
+ "dependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "devDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "bundledDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "bundleDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "optionalDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "peerDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ }
+ }
+ },
+ "uri-http": {
+ "type": "string",
+ "pattern": "^https?:\/\/"
+ },
+ "email": {
+ "type": "string",
+ "pattern": "^([0-9a-zA-Z]([-\\.\\w]*[0-9a-zA-Z])*@([0-9a-zA-Z][-\\w]*[0-9a-zA-Z]\\.)+[a-zA-Z]{2,9})$"
+ },
+ "path": {
+ "type": "string",
+ "minLength": 1
+ },
+ "name": {
+ "type": "string",
+ "pattern": "^[A-Za-z](?:[_\\.-]?[A-Za-z0-9]+)*$"
+ },
+ "identifier": {
+ "type": "string",
+ "pattern": "^[A-Za-z](?:[_-]?[A-Za-z0-9]+)*$"
+ },
+ "semver": {
+ "type": "string",
+ "pattern": "^\\d+\\.\\d+\\.\\d+(?:-[a-z]+(?:[_\\.-]*[a-z0-9]+)*)*$"
+ },
+ "type-url": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "url"
+ ],
+ "properties": {
+ "type": {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ }
+ }
+
+ },
+ "repository": {
+ "$ref": "lib://package.json#/definitions/type-url"
+ },
+ "licence": {
+ "oneOf": [
+ {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ {
+ "$ref": "lib://package.json#/definitions/licence-object"
+ }
+ ]
+ },
+ "licence-object": {
+ "type": "object",
+ "additionalProperties": false,
+ "properties": {
+ "type": {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ }
+ }
+ },
+ "person": {
+ "oneOf": [
+ {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ {
+ "$ref": "lib://package.json#/definitions/person-object"
+ }
+ ]
+ },
+ "person-object": {
+ "type": "object",
+ "required": [
+ "name"
+ ],
+ "properties": {
+ "name": {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ "email": {
+ "$ref": "lib://package.json#/definitions/email"
+ },
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ }
+ }
+ },
+ "string-map": {
+ "type": "object",
+ "additionalProperties": false,
+ "patternProperties": {
+ ".+": {
+ "type": "string"
+ }
+ }
+ },
+ "path-map": {
+ "type": "object",
+ "additionalProperties": false,
+ "patternProperties": {
+ ".+": {
+ "$ref": "lib://package.json#/definitions/path",
+ "pattern": "[a-zA-Z]"
+ }
+ }
+ },
+ "dependency-map": {
+ "$ref": "lib://package.json#/definitions/string-map"
+ }
+ }
+}
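The schema above targets JSON Schema draft-04 and exposes its named definitions under the "lib://package.json" id. A minimal usage sketch (not part of the patch) with the Python jsonschema library; the RefResolver API is assumed here, which exists in older jsonschema releases but is deprecated in recent ones:

    import json
    import jsonschema

    # Load the schema bundled by this diff.
    with open("swh/indexer/data/package-json/schema.json") as f:
        schema = json.load(f)

    # Resolve "lib://package.json#/definitions/..." refs against the schema's
    # own "id", then validate a package.json document against the "minimal"
    # definition (only "name" and "version" are required there).
    resolver = jsonschema.RefResolver.from_schema(schema)
    validator = jsonschema.Draft4Validator(
        {"$ref": "lib://package.json#/definitions/minimal"}, resolver=resolver)
    validator.validate({"name": "example-package", "version": "1.0.0"})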
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 836d77d..8b2df2d 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,335 +1,335 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import logging
from copy import deepcopy
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.storage import INDEXER_CFG_KEY
from swh.model import hashutil
class ContentMetadataIndexer(ContentIndexer):
"""Content-level indexer
This indexer is in charge of:
- filtering out content already indexed in content_metadata
- reading content from objstorage with the content's id sha1
- computing translated_metadata by given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool
- store result in content_metadata table
"""
# Note: This is used when the content metadata indexer is run alone
# (which is not the case when it is driven by the RevisionMetadataIndexer)
CONFIG_BASE_FILENAME = 'indexer/content_metadata'
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_metadata_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
def index(self, id, data, log_suffix='unknown revision'):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
dict: dictionary representing a content_metadata. If the
translation wasn't successful the translated_metadata keys will
be returned as None
"""
result = {
'id': id,
'indexer_configuration_id': self.tool['id'],
'translated_metadata': None
}
try:
mapping_name = self.tool['tool_configuration']['context']
log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id)
result['translated_metadata'] = \
MAPPINGS[mapping_name](log_suffix).translate(data)
except Exception:
self.log.exception(
"Problem during metadata translation "
"for content %s" % hashutil.hash_to_hex(id))
if result['translated_metadata'] is None:
return None
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_metadata, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
class RevisionMetadataIndexer(RevisionIndexer):
"""Revision-level indexer
This indexer is in charge of:
- filtering revisions already indexed in revision_metadata table with
defined computation tool
- retrieve all entry_files in root directory
- use metadata_detector for file_names containing metadata
- compute metadata translation if necessary and possible (depends on tool)
- send sha1s to content indexing if possible
- store the results for revision
"""
CONFIG_BASE_FILENAME = 'indexer/revision_metadata'
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'swh-metadata-detector',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': list(MAPPINGS),
},
}),
}
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.revision_metadata_missing((
{
'id': sha1_git,
'indexer_configuration_id': self.tool['id'],
} for sha1_git in sha1_gits
))
def index(self, rev):
"""Index rev by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- if multiple file detected -> translation needed at revision level
Args:
rev (dict): revision artifact from storage
Returns:
dict: dictionary representing a revision_metadata, with keys:
- id (str): rev's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- translated_metadata: dict of retrieved metadata
"""
result = {
'id': rev['id'],
'indexer_configuration_id': self.tool['id'],
'mappings': None,
'translated_metadata': None
}
try:
root_dir = rev['directory']
dir_ls = self.storage.directory_ls(root_dir, recursive=False)
files = [entry for entry in dir_ls if entry['type'] == 'file']
detected_files = detect_metadata(files)
(mappings, metadata) = self.translate_revision_metadata(
detected_files,
log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']))
result['mappings'] = mappings
result['translated_metadata'] = metadata
except Exception as e:
self.log.exception(
'Problem when indexing rev: %r', e)
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of revision_metadata, dict with the
following keys:
- id (bytes): revision's identifier (sha1_git)
- mappings ([str]): list of mappings used to translate the metadata
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in revision_metadata
self.idx_storage.revision_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
def translate_revision_metadata(self, detected_files, log_suffix):
"""
Determine plan of action to translate metadata when containing
one or multiple detected files:
Args:
detected_files (dict): dictionary mapping context names (e.g.,
"npm", "authors") to list of sha1
Returns:
(List[str], dict): list of mappings used and dict with
translated metadata according to the CodeMeta vocabulary
"""
used_mappings = [MAPPINGS[context].name for context in detected_files]
translated_metadata = []
tool = {
'name': 'swh-metadata-translator',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': None
},
}
# TODO: iterate on each context, on each file
# -> get raw_contents
# -> translate each content
config = {
k: self.config[k]
for k in [INDEXER_CFG_KEY, 'objstorage', 'storage']
}
config['tools'] = [tool]
for context in detected_files.keys():
cfg = deepcopy(config)
cfg['tools'][0]['configuration']['context'] = context
c_metadata_indexer = ContentMetadataIndexer(config=cfg)
# sha1s that are in content_metadata table
sha1s_in_storage = []
metadata_generator = self.idx_storage.content_metadata_get(
detected_files[context])
for c in metadata_generator:
# extracting translated_metadata
sha1 = c['id']
sha1s_in_storage.append(sha1)
local_metadata = c['translated_metadata']
# local metadata is aggregated
if local_metadata:
translated_metadata.append(local_metadata)
sha1s_filtered = [item for item in detected_files[context]
if item not in sha1s_in_storage]
if sha1s_filtered:
# content indexing
try:
c_metadata_indexer.run(sha1s_filtered,
policy_update='ignore-dups',
log_suffix=log_suffix)
# on the fly possibility:
for result in c_metadata_indexer.results:
local_metadata = result['translated_metadata']
translated_metadata.append(local_metadata)
except Exception:
self.log.exception(
"Exception while indexing metadata on contents")
# transform translated_metadata into min set with swh-metadata-detector
min_metadata = extract_minimal_metadata_dict(translated_metadata)
return (used_mappings, min_metadata)
class OriginMetadataIndexer(OriginIndexer):
CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata'
ADDITIONAL_CONFIG = {
'tools': ('list', [])
}
USE_TOOLS = False
def __init__(self):
super().__init__()
self.origin_head_indexer = OriginHeadIndexer()
self.revision_metadata_indexer = RevisionMetadataIndexer()
def index_list(self, origins):
head_rev_ids = []
for origin in origins:
head_result = self.origin_head_indexer.index(origin)
if not head_result:
continue
head_rev_ids.append(head_result['revision_id'])
head_revs = list(self.storage.revision_get(head_rev_ids))
assert len(head_revs) == len(head_rev_ids)
results = []
- for (orig, rev) in zip(origins, head_revs):
+ for (origin, rev) in zip(origins, head_revs):
if not rev:
self.warning('Missing head revision %s of origin %r',
(hashutil.hash_to_bytes(rev['id']), origin))
continue
rev_metadata = self.revision_metadata_indexer.index(rev)
orig_metadata = {
'from_revision': rev_metadata['id'],
'origin_id': origin['id'],
'metadata': rev_metadata['translated_metadata'],
'mappings': rev_metadata['mappings'],
'indexer_configuration_id':
rev_metadata['indexer_configuration_id'],
}
results.append((orig_metadata, rev_metadata))
return results
def persist_index_computations(self, results, policy_update):
conflict_update = (policy_update == 'update-dups')
# Deduplicate revisions
rev_metadata = []
orig_metadata = []
for (orig_item, rev_item) in results:
if rev_item not in rev_metadata:
rev_metadata.append(rev_item)
if rev_item not in orig_metadata:
orig_metadata.append(orig_item)
self.idx_storage.revision_metadata_add(
rev_metadata, conflict_update=conflict_update)
self.idx_storage.origin_intrinsic_metadata_add(
orig_metadata, conflict_update=conflict_update)
@click.command()
@click.option('--revs', '-i',
help='Default sha1_git to lookup', multiple=True)
def main(revs):
_git_sha1s = list(map(hashutil.hash_to_bytes, revs))
rev_metadata_indexer = RevisionMetadataIndexer()
rev_metadata_indexer.run(_git_sha1s, 'update-dups')
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
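As ContentMetadataIndexer.index above shows, translation is delegated to the mapping selected by the tool configuration's 'context'. A hedged standalone sketch of that call pattern, assuming swh.indexer is installed; the positional log_suffix argument mirrors the MAPPINGS[mapping_name](log_suffix) call visible in index():

    from swh.indexer.metadata_dictionary import MAPPINGS

    # Raw bytes of a (truncated) package.json, as the content indexer would
    # read them from the objstorage.
    raw = b'{"name": "yarn-parser", "version": "1.0.0", "license": "AGPL-3.0"}'

    # Same call pattern as ContentMetadataIndexer.index: pick the 'npm' mapping
    # and translate the raw content into a CodeMeta-style dict.
    translated = MAPPINGS['npm']('example content').translate(raw)
    print(translated.get('name'), translated.get('license'))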
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 2d5e653..fb72a5a 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,163 +1,188 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from unittest.mock import patch
from swh.model.hashutil import hash_to_bytes
from swh.indexer.metadata import OriginMetadataIndexer
from .utils import BASE_TEST_CONFIG, YARN_PARSER_METADATA
from .test_metadata import REVISION_METADATA_CONFIG
ORIGIN_HEAD_CONFIG = {
**BASE_TEST_CONFIG,
'tools': {
'name': 'origin-metadata',
'version': '0.0.1',
'configuration': {},
},
'tasks': {
'revision_metadata': 'revision_metadata',
'origin_intrinsic_metadata': 'origin_intrinsic_metadata',
}
}
@pytest.fixture
def origin_metadata_indexer():
prefix = 'swh.indexer.'
suffix = '.parse_config_file'
with patch(prefix + 'metadata.OriginMetadataIndexer' + suffix) as omi, \
patch(prefix + 'origin_head.OriginHeadIndexer' + suffix) as ohi, \
patch(prefix + 'metadata.RevisionMetadataIndexer' + suffix) as rmi:
omi.return_value = BASE_TEST_CONFIG
ohi.return_value = ORIGIN_HEAD_CONFIG
rmi.return_value = REVISION_METADATA_CONFIG
yield OriginMetadataIndexer()
def test_origin_metadata_indexer(
idx_storage, storage, obj_storage, origin_metadata_indexer):
indexer = OriginMetadataIndexer()
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
origin = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
'id': rev_id,
'translated_metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
origin_metadata = {
'origin_id': origin['id'],
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
for result in results:
del result['tool']
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
for result in results:
del result['tool']
assert results == [origin_metadata]
-def test_origin_metadata_indexer_duplicates(
+def test_origin_metadata_indexer_duplicate_origin(
idx_storage, storage, obj_storage, origin_metadata_indexer):
indexer = OriginMetadataIndexer()
indexer.storage = storage
indexer.idx_storage = idx_storage
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2)
origin = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
assert len(results) == 1
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
assert len(results) == 1
def test_origin_metadata_indexer_missing_head(
idx_storage, storage, obj_storage, origin_metadata_indexer):
storage.origin_add([{
'type': 'git',
'url': 'https://example.com'
}])
indexer = OriginMetadataIndexer()
indexer.run(["git+https://example.com"])
origin = storage.origin_get({
'type': 'git',
'url': 'https://example.com'})
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
assert results == []
def test_origin_metadata_indexer_partial_missing_head(
idx_storage, storage, obj_storage, origin_metadata_indexer):
storage.origin_add([{
'type': 'git',
'url': 'https://example.com'
}])
indexer = OriginMetadataIndexer()
indexer.run(["git+https://example.com",
"git+https://github.com/librariesio/yarn-parser"])
origin1 = storage.origin_get({
'type': 'git',
'url': 'https://example.com'})
origin2 = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
'id': rev_id,
'translated_metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
origin_metadata = {
'origin_id': origin2['id'],
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
for result in results:
del result['tool']
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin1['id'], origin2['id']]))
for result in results:
del result['tool']
assert results == [origin_metadata]
+
+
+def test_origin_metadata_indexer_duplicate_revision(
+ idx_storage, storage, obj_storage, origin_metadata_indexer):
+ indexer = OriginMetadataIndexer()
+ indexer.storage = storage
+ indexer.idx_storage = idx_storage
+ indexer.run(["git+https://github.com/librariesio/yarn-parser",
+ "git+https://github.com/librariesio/yarn-parser.git"])
+
+ origin1 = storage.origin_get({
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin2 = storage.origin_get({
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser.git'})
+ assert origin1['id'] != origin2['id']
+ rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+ results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+ assert len(results) == 1
+
+ results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+ origin1['id'], origin2['id']]))
+ assert len(results) == 2
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
index c19bb7e..2cfc437 100644
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -1,666 +1,678 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import datetime
import hashlib
import random
from hypothesis import strategies
from swh.model import hashutil
from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.indexer.storage import INDEXER_CFG_KEY
BASE_TEST_CONFIG = {
'storage': {
'cls': 'memory',
'args': {
},
},
'objstorage': {
'cls': 'memory',
'args': {
},
},
INDEXER_CFG_KEY: {
'cls': 'memory',
'args': {
},
},
}
ORIGINS = [
{
'id': 52189575,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/SoftwareHeritage/swh-storage'},
{
'id': 4423668,
'lister': None,
'project': None,
'type': 'ftp',
'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
{
'id': 77775770,
'lister': None,
'project': None,
'type': 'deposit',
'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
{
'id': 85072327,
'lister': None,
'project': None,
'type': 'pypi',
'url': 'https://pypi.org/project/limnoria/'},
{
'id': 49908349,
'lister': None,
'project': None,
'type': 'svn',
'url': 'http://0-512-md.googlecode.com/svn/'},
{
'id': 54974445,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'},
+ {
+ 'id': 54974446,
+ 'lister': None,
+ 'project': None,
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser.git'},
]
SNAPSHOTS = {
52189575: {
'branches': {
b'refs/heads/add-revision-origin-cache': {
'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
b's\xe7/\xe9l\x1e',
'target_type': 'revision'},
b'HEAD': {
'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
b'\xac\xefrm',
'target_type': 'revision'},
b'refs/tags/v0.0.103': {
'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
b'\x0f\xdd',
'target_type': 'release'},
}},
4423668: {
'branches': {
b'3DLDF-1.1.4.tar.gz': {
'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
b'"G\x99\x11',
'target_type': 'revision'},
b'3DLDF-2.0.2.tar.gz': {
'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
'target_type': 'revision'},
b'3DLDF-2.0.3-examples.tar.gz': {
'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
b'\xfe\xadZ\x80\x80\xc1\x83\xff',
'target_type': 'revision'},
b'3DLDF-2.0.3.tar.gz': {
'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
b'\xcc\x1a\xb4`\x8c\x8by',
'target_type': 'revision'},
b'3DLDF-2.0.tar.gz': {
'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
b'\xd3\xd1m',
'target_type': 'revision'}
}},
77775770: {
'branches': {
b'master': {
'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
b'\xa6\xe9\x99\xb1\x9e]q\xeb',
'target_type': 'revision'}
},
'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
b"\x1d\r "},
85072327: {
'branches': {
b'HEAD': {
'target': b'releases/2018.09.09',
'target_type': 'alias'},
b'releases/2018.09.01': {
'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
b'\xbb\xdfF\xfdw\xcf',
'target_type': 'revision'},
b'releases/2018.09.09': {
'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
b'A\x10\x9d\xc5\xfa2\xf8t',
'target_type': 'revision'}},
'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
b'\x12\x9e\xd6\xb3'},
49908349: {
'branches': {
b'master': {
'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
b'\xc9\xad#.\x1bw=\x18',
'target_type': 'revision'}},
'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
b'\x05\xea\xb8\x1f\xc4H\xf4s'},
54974445: {
'branches': {
b'HEAD': {
'target': hash_to_bytes(
'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
- 'target_type': 'revision'}}}
+ 'target_type': 'revision'}}},
+ 54974446: {
+ 'branches': {
+ b'HEAD': {
+ 'target': hash_to_bytes(
+ '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
+ 'target_type': 'revision'}}},
}
REVISIONS = [{
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'author': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
'email': b'andrewnez@gmail.com'
},
'committer': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
'email': b'andrewnez@gmail.com'
},
'synthetic': False,
'date': {
'negative_utc': False,
'timestamp': {
'seconds': 1487596456,
'microseconds': 0
},
'offset': 0
},
'directory': b'10'
}]
DIRECTORY_ID = b'10'
DIRECTORY = [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'sha1': b'cde'
},
{
'target': b'11',
'type': 'dir',
'length': None,
'name': b'.github',
'sha1': None,
'perms': 16384,
'sha1_git': None,
'status': None,
'sha256': None
}
]
SHA1_TO_LICENSES = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
'02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
'103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
'688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
}
SHA1_TO_CTAGS = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{
'name': 'foo',
'kind': 'str',
'line': 10,
'lang': 'bar',
}],
'd4c647f0fc257591cc9ba1722484229780d1c607': [{
'name': 'let',
'kind': 'int',
'line': 100,
'lang': 'haskell',
}],
'688a5ef812c53907562fe379d4b3851e69c7cb15': [{
'name': 'symbol',
'kind': 'float',
'line': 99,
'lang': 'python',
}],
}
OBJ_STORAGE_DATA = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text',
'688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text',
'8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text',
'02fb2c89e14f7fab46701478c83779c7beb7b069': b"""
import unittest
import logging
from swh.indexer.mimetype import MimetypeIndexer
from swh.indexer.tests.test_utils import MockObjStorage
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
""",
'103bc087db1d26afc3a0283f38663d081e9b01e6': b"""
#ifndef __AVL__
#define __AVL__
typedef struct _avl_tree avl_tree;
typedef struct _data_t {
int content;
} data_t;
""",
'93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
(should 'pygments (recognize 'lisp 'easily))
""",
'26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
""",
'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
{
"version": "5.0.3",
"name": "npm",
"description": "a package manager for JavaScript",
"keywords": [
"install",
"modules",
"package manager",
"package.json"
],
"preferGlobal": true,
"config": {
"publishtest": false
},
"homepage": "https://docs.npmjs.com/",
"author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
"repository": {
"type": "git",
"url": "https://github.com/npm/npm"
},
"bugs": {
"url": "https://github.com/npm/npm/issues"
},
"dependencies": {
"JSONStream": "~1.3.1",
"abbrev": "~1.1.0",
"ansi-regex": "~2.1.1",
"ansicolors": "~0.3.2",
"ansistyles": "~0.1.3"
},
"devDependencies": {
"tacks": "~1.2.6",
"tap": "~10.3.2"
},
"license": "Artistic-2.0"
}
""",
'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
""",
'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
'636465': b"""
{
"name": "yarn-parser",
"version": "1.0.0",
"description": "Tiny web service for parsing yarn.lock files",
"main": "index.js",
"scripts": {
"start": "node index.js",
"test": "mocha"
},
"engines": {
"node": "9.8.0"
},
"repository": {
"type": "git",
"url": "git+https://github.com/librariesio/yarn-parser.git"
},
"keywords": [
"yarn",
"parse",
"lock",
"dependencies"
],
"author": "Andrew Nesbitt",
"license": "AGPL-3.0",
"bugs": {
"url": "https://github.com/librariesio/yarn-parser/issues"
},
"homepage": "https://github.com/librariesio/yarn-parser#readme",
"dependencies": {
"@yarnpkg/lockfile": "^1.0.0",
"body-parser": "^1.15.2",
"express": "^4.14.0"
},
"devDependencies": {
"chai": "^4.1.2",
"mocha": "^5.2.0",
"request": "^2.87.0",
"test": "^0.6.0"
}
}
"""
}
YARN_PARSER_METADATA = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'codeRepository':
'git+git+https://github.com/librariesio/yarn-parser.git',
'author': [{
'type': 'Person',
'name': 'Andrew Nesbitt'
}],
'license': 'https://spdx.org/licenses/AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
}
json_dict_keys = strategies.one_of(
strategies.characters(),
*map(strategies.just, ['type', 'url', 'name', 'email', '@id',
'@context', 'repository', 'license',
]),
)
"""Hypothesis strategy that generates strings, with an emphasis on those
that are often used as dictionary keys in metadata files."""
generic_json_document = strategies.recursive(
strategies.none() | strategies.booleans() | strategies.floats() |
strategies.characters(),
lambda children: (
strategies.lists(children, 1) |
strategies.dictionaries(json_dict_keys, children, min_size=1)
)
)
"""Hypothesis strategy that generates possible values for values of JSON
metadata files."""
def json_document_strategy(keys=None):
"""Generates an hypothesis strategy that generates metadata files
for a format that uses the given keys."""
if keys is None:
keys = strategies.characters()
else:
keys = strategies.one_of(map(strategies.just, keys))
return strategies.dictionaries(keys, generic_json_document, min_size=1)
def filter_dict(d, keys):
'return a copy of the dict with keys deleted'
if not isinstance(keys, (list, tuple)):
keys = (keys, )
return dict((k, v) for (k, v) in d.items() if k not in keys)
def fill_obj_storage(obj_storage):
"""Add some content in an object storage."""
for (obj_id, content) in OBJ_STORAGE_DATA.items():
obj_storage.add(content, obj_id=hash_to_bytes(obj_id))
def fill_storage(storage):
for origin in ORIGINS:
origin = origin.copy()
del origin['id']
storage.origin_add_one(origin)
for (orig_pseudo_id, snap) in SNAPSHOTS.items():
for orig in ORIGINS:
if orig_pseudo_id == orig['id']:
origin_id = storage.origin_get(
{'type': orig['type'], 'url': orig['url']})['id']
break
else:
assert False
visit = storage.origin_visit_add(origin_id, datetime.datetime.now())
snap_id = snap.get('id') or \
bytes([random.randint(0, 255) for _ in range(32)])
storage.snapshot_add(origin_id, visit['visit'], {
'id': snap_id,
'branches': snap['branches']
})
storage.revision_add(REVISIONS)
storage.directory_add([{
'id': DIRECTORY_ID,
'entries': DIRECTORY,
}])
for (obj_id, content) in OBJ_STORAGE_DATA.items():
# TODO: use MultiHash
if hasattr(hashlib, 'blake2s'):
blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
else:
# fallback for Python <3.6
blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
storage.content_add([{
'data': content,
'length': len(content),
'status': 'visible',
'sha1': hash_to_bytes(obj_id),
'sha1_git': hash_to_bytes(obj_id),
'sha256': hashlib.sha256(content).digest(),
'blake2s256': blake2s256
}])
class CommonContentIndexerTest(metaclass=abc.ABCMeta):
legacy_get_format = False
"""True if and only if the tested indexer uses the legacy format.
see: https://forge.softwareheritage.org/T1433
"""
def get_indexer_results(self, ids):
"""Override this for indexers that don't have a mock storage."""
return self.indexer.idx_storage.state
def assert_legacy_results_ok(self, sha1s, expected_results=None):
# XXX old format, remove this when all endpoints are
# updated to the new one
# see: https://forge.softwareheritage.org/T1433
sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
for sha1 in sha1s]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(len(expected_results), len(actual_results),
(expected_results, actual_results))
for indexed_data in actual_results:
_id = indexed_data['id']
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data['id'] = _id
self.assertEqual(indexed_data, expected_data)
def assert_results_ok(self, sha1s, expected_results=None):
if self.legacy_get_format:
self.assert_legacy_results_ok(sha1s, expected_results)
return
sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
for sha1 in sha1s]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(len(expected_results), len(actual_results),
(expected_results, actual_results))
for indexed_data in actual_results:
(_id, indexed_data) = list(indexed_data.items())[0]
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data = [expected_data]
self.assertEqual(indexed_data, expected_data)
def test_index(self):
"""Known sha1 have their data indexed
"""
sha1s = [self.id0, self.id1, self.id2]
# when
self.indexer.run(sha1s, policy_update='update-dups')
self.assert_results_ok(sha1s)
# 2nd pass
self.indexer.run(sha1s, policy_update='ignore-dups')
self.assert_results_ok(sha1s)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
sha1s = [self.id1,
'799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
'800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
# when
self.indexer.run(sha1s, policy_update='update-dups')
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
self.assert_results_ok(sha1s, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
def setUp(self):
self.contents = sorted(OBJ_STORAGE_DATA)
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data['id']
assert isinstance(_id, bytes)
indexed_data = indexed_data.copy()
indexed_data['id'] = hash_to_hex(indexed_data['id'])
self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
self.assertTrue(start <= _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
def test__index_contents(self):
"""Indexing contents without existing data results in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
self.assert_results_ok(start, end, actual_results)
def test__index_contents_with_indexed_data(self):
"""Indexing contents with existing data results in less indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
start, end, indexed=set(map(hash_to_bytes, data_indexed)))
# craft the expected results
expected_results = self.expected_results.copy()
for already_indexed_key in data_indexed:
expected_results.pop(already_indexed_key)
self.assert_results_ok(
start, end, actual_results, expected_results)
def test_generate_content_get(self):
"""Optimal indexing should result in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
# then
self.assertTrue(actual_results)
def test_generate_content_get_input_as_bytes(self):
"""Optimal indexing should result in indexed data
Input are in bytes here.
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run( # checks the bytes input this time
start, end, skip_existing=False)
# no already indexed data so same result as prior test
# then
self.assertTrue(actual_results)
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
_start, _end = ['0000000000000000000000000000000000000000',
'0000000000000000000000000000000000000001']
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(
start, end, incremental=False)
# then
self.assertFalse(actual_results)
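The helpers above (fill_storage, fill_obj_storage) load the fixture data into whatever backends the tests configure. A hedged wiring sketch using the in-memory backends that BASE_TEST_CONFIG declares; the exact get_storage/get_objstorage signatures are assumed from the swh APIs of that period:

    from swh.storage import get_storage
    from swh.objstorage import get_objstorage
    from swh.indexer.tests.utils import fill_storage, fill_obj_storage

    # In-memory backends, matching the 'cls': 'memory' entries of BASE_TEST_CONFIG.
    storage = get_storage('memory', {})
    objstorage = get_objstorage('memory', {})

    fill_storage(storage)         # origins, snapshots, revisions, directories
    fill_obj_storage(objstorage)  # raw file contents (OBJ_STORAGE_DATA)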
