
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 0a5f0c7..1ee09bb 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,340 +1,340 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import logging
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.storage import INDEXER_CFG_KEY
from swh.model import hashutil
class ContentMetadataIndexer(ContentIndexer):
"""Content-level indexer
This indexer is in charge of:
- filtering out content already indexed in content_metadata
- reading content from objstorage with the content's id sha1
- computing translated_metadata for the given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool
- storing the result in the content_metadata table
"""
CONFIG_BASE_FILENAME = 'indexer/content_metadata'
def __init__(self, tool, config):
# somewhat twisted way to reuse the exact same config as the
# RevisionMetadataIndexer object, which internally uses ContentMetadataIndexer
self.config = config
self.config['tools'] = tool
super().__init__()
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_metadata_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
dict: dictionary representing a content_metadata. If the
translation wasn't successful, the translated_metadata key
will be None
"""
result = {
'id': id,
'indexer_configuration_id': self.tool['id'],
'translated_metadata': None
}
try:
mapping_name = self.tool['tool_configuration']['context']
result['translated_metadata'] = MAPPINGS[mapping_name] \
.translate(data)
# a twisted way to keep result with indexer object for get_results
self.results.append(result)
except Exception:
self.log.exception(
"Problem during tool retrieval of metadata translation")
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_metadata, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
def get_results(self):
"""can be called only if run method was called before
Returns:
list: list of content_metadata entries calculated by
current indexer
"""
return self.results
class RevisionMetadataIndexer(RevisionIndexer):
"""Revision-level indexer
This indexer is in charge of:
- filtering revisions already indexed in the revision_metadata table with
the defined computation tool
- retrieving all entry_files in the root directory
- using metadata_detector on file names that may contain metadata
- computing the metadata translation if necessary and possible (depends on the tool)
- sending sha1s to content indexing if possible
- storing the results for the revision
"""
CONFIG_BASE_FILENAME = 'indexer/revision_metadata'
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'swh-metadata-detector',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': ['NpmMapping', 'CodemetaMapping']
},
}),
}
ContentMetadataIndexer = ContentMetadataIndexer
def prepare(self):
super().prepare()
self.tool = self.tools[0]
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.revision_metadata_missing((
{
'id': sha1_git,
'indexer_configuration_id': self.tool['id'],
} for sha1_git in sha1_gits
))
def index(self, rev):
"""Index rev by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- if multiple file detected -> translation needed at revision level
Args:
rev (bytes): revision artifact from storage
Returns:
dict: dictionary representing a revision_metadata, with keys:
- id (str): rev's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- translated_metadata: dict of retrieved metadata
"""
result = {
- 'id': rev['id'].decode(),
+ 'id': rev['id'],
'indexer_configuration_id': self.tool['id'],
'translated_metadata': None
}
try:
root_dir = rev['directory']
dir_ls = self.storage.directory_ls(root_dir, recursive=False)
files = [entry for entry in dir_ls if entry['type'] == 'file']
detected_files = detect_metadata(files)
result['translated_metadata'] = self.translate_revision_metadata(
detected_files)
except Exception as e:
self.log.exception(
'Problem when indexing rev: %r', e)
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of revision_metadata, dict with the
following keys:
- id (bytes): revision's identifier (sha1_git)
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in revision_metadata
self.idx_storage.revision_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
def translate_revision_metadata(self, detected_files):
"""
Determine the plan of action to translate metadata, given one or
multiple detected metadata files:
Args:
detected_files (dict): dictionary mapping context names (e.g.,
"npm", "authors") to lists of sha1s
Returns:
dict: dict with translated metadata according to the CodeMeta
vocabulary
"""
translated_metadata = []
tool = {
'name': 'swh-metadata-translator',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': None
},
}
# TODO: iterate on each context, on each file
# -> get raw_contents
# -> translate each content
config = {
INDEXER_CFG_KEY: self.idx_storage,
'objstorage': self.objstorage
}
for context in detected_files.keys():
tool['configuration']['context'] = context
c_metadata_indexer = self.ContentMetadataIndexer(tool, config)
# sha1s that are in content_metadata table
sha1s_in_storage = []
metadata_generator = self.idx_storage.content_metadata_get(
detected_files[context])
for c in metadata_generator:
# extracting translated_metadata
sha1 = c['id']
sha1s_in_storage.append(sha1)
local_metadata = c['translated_metadata']
# local metadata is aggregated
if local_metadata:
translated_metadata.append(local_metadata)
sha1s_filtered = [item for item in detected_files[context]
if item not in sha1s_in_storage]
if sha1s_filtered:
# schedule indexation of content
try:
c_metadata_indexer.run(sha1s_filtered,
policy_update='ignore-dups')
# on the fly possibility:
results = c_metadata_indexer.get_results()
for result in results:
local_metadata = result['translated_metadata']
translated_metadata.append(local_metadata)
except Exception as e:
self.log.warning("""Exception while indexing content""", e)
# transform translated_metadata into min set with swh-metadata-detector
min_metadata = extract_minimal_metadata_dict(translated_metadata)
return min_metadata
class OriginMetadataIndexer(OriginIndexer):
CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata'
ADDITIONAL_CONFIG = {
'tools': ('list', [])
}
def check(self, **kwargs):
kwargs['check_tools'] = False
super().check(**kwargs)
def filter(self, ids):
return ids
def run(self, origin_head, policy_update):
"""Expected to be called with the result of RevisionMetadataIndexer
as first argument; ie. not a list of ids as other indexers would.
Args:
- * `origin_head` (dict): {str(origin_id): rev_id.encode()}
+ * `origin_head` (dict): {str(origin_id): rev_id}
mapping textual origin ids to hex revision ids, as produced
by OriginHeadIndexer.
* `policy_update`: `'ignore-dups'` or `'update-dups'`
"""
- origin_head_map = {int(origin_id): rev_id
+ origin_head_map = {int(origin_id): hashutil.hash_to_bytes(rev_id)
for (origin_id, rev_id) in origin_head.items()}
# Fix up the argument order. revisions_metadata has to be the
# first argument because of celery.chain; the next line calls
# run() with the usual order, ie. origin ids first.
return super().run(ids=list(origin_head_map),
policy_update=policy_update,
parse_ids=False,
origin_head_map=origin_head_map)
def index(self, origin, *, origin_head_map):
# Get the last revision of the origin.
revision_id = origin_head_map[origin['id']]
revision_metadata = self.idx_storage \
.revision_metadata_get([revision_id])
for item in revision_metadata:
assert item['id'] == revision_id
# Get the metadata of that revision, and return it
return {
'origin_id': origin['id'],
'metadata': item['translated_metadata'],
'from_revision': revision_id,
'indexer_configuration_id':
item['indexer_configuration_id'],
}
def persist_index_computations(self, results, policy_update):
self.idx_storage.origin_intrinsic_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
@click.command()
@click.option('--revs', '-i',
help='Default sha1_git to lookup', multiple=True)
def main(revs):
_git_sha1s = list(map(hashutil.hash_to_bytes, revs))
rev_metadata_indexer = RevisionMetadataIndexer()
rev_metadata_indexer.run(_git_sha1s, 'update-dups')
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
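Note on the change above: revision identifiers are now kept as raw bytes inside the indexers and only rendered as hexadecimal strings at task boundaries. A minimal sketch of that round-trip with swh.model.hashutil (the identifier is the one used in the tests further down):

from swh.model.hashutil import hash_to_bytes, hash_to_hex

hex_id = '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'  # 40 hex characters
raw_id = hash_to_bytes(hex_id)                       # 20-byte sha1_git

assert len(raw_id) == 20
assert hash_to_hex(raw_id) == hex_id
# The indexer storage expects the bytes form; JSON task payloads carry the hex form.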
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
index 424fb57..35ea767 100644
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -1,219 +1,221 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
import click
import logging
from swh.scheduler import get_scheduler
from swh.scheduler.utils import create_task_dict
from swh.indexer.indexer import OriginIndexer
+from swh.model.hashutil import hash_to_hex
+
class OriginHeadIndexer(OriginIndexer):
"""Origin-level indexer.
This indexer is in charge of looking up the revision that acts as the
"head" of an origin.
In git, this is usually the commit pointed to by the 'master' branch."""
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'origin-metadata',
'version': '0.0.1',
'configuration': {},
}),
'tasks': ('dict', {
'revision_metadata': 'revision_metadata',
'origin_intrinsic_metadata': 'origin_metadata',
})
}
CONFIG_BASE_FILENAME = 'indexer/origin_head'
def filter(self, ids):
yield from ids
def persist_index_computations(self, results, policy_update):
"""Do nothing. The indexer's results are not persistent, they
should only be piped to another indexer."""
pass
def next_step(self, results, task):
"""Once the head is found, call the RevisionMetadataIndexer
on these revisions, then call the OriginMetadataIndexer with
both the origin_id and the revision metadata, so it can copy the
revision metadata to the origin's metadata.
Args:
results (Iterable[dict]): Iterable of return values from `index`.
"""
super().next_step(results, task)
revision_metadata_task = self.config['tasks']['revision_metadata']
origin_intrinsic_metadata_task = self.config['tasks'][
'origin_intrinsic_metadata']
if revision_metadata_task is None and \
origin_intrinsic_metadata_task is None:
return
assert revision_metadata_task is not None
assert origin_intrinsic_metadata_task is not None
# Second task to run after this one: copy the revision's metadata
# to the origin
sub_task = create_task_dict(
origin_intrinsic_metadata_task,
'oneshot',
origin_head={
str(result['origin_id']):
- result['revision_id'].decode()
+ hash_to_hex(result['revision_id'])
for result in results},
policy_update='update-dups',
)
del sub_task['next_run'] # Not json-serializable
# First task to run after this one: index the metadata of the
# revision
task = create_task_dict(
revision_metadata_task,
'oneshot',
- ids=[res['revision_id'].decode() for res in results],
+ ids=[hash_to_hex(res['revision_id']) for res in results],
policy_update='update-dups',
next_step=sub_task,
)
if getattr(self, 'scheduler', None):
scheduler = self.scheduler
else:
scheduler = get_scheduler(**self.config['scheduler'])
scheduler.create_tasks([task])
# Dispatch
def index(self, origin):
origin_id = origin['id']
latest_snapshot = self.storage.snapshot_get_latest(origin_id)
method = getattr(self, '_try_get_%s_head' % origin['type'], None)
if method is None:
method = self._try_get_head_generic
rev_id = method(latest_snapshot)
if rev_id is None:
return None
result = {
'origin_id': origin_id,
'revision_id': rev_id,
}
return result
# VCSs
def _try_get_vcs_head(self, snapshot):
try:
if isinstance(snapshot, dict):
branches = snapshot['branches']
if branches[b'HEAD']['target_type'] == 'revision':
return branches[b'HEAD']['target']
except KeyError:
return None
_try_get_hg_head = _try_get_git_head = _try_get_vcs_head
# Tarballs
_archive_filename_re = re.compile(
rb'^'
rb'(?P<pkgname>.*)[-_]'
rb'(?P<version>[0-9]+(\.[0-9])*)'
rb'(?P<preversion>[-+][a-zA-Z0-9.~]+?)?'
rb'(?P<extension>(\.[a-zA-Z0-9]+)+)'
rb'$')
@classmethod
def _parse_version(cls, filename):
"""Extracts the release version from an archive filename,
to get an ordering whose maximum is likely to be the last
version of the software
>>> OriginHeadIndexer._parse_version(b'foo')
(-inf,)
>>> OriginHeadIndexer._parse_version(b'foo.tar.gz')
(-inf,)
>>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz')
(0, 0, 1, 0)
>>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
(0, 0, 1, -1, 'beta2')
>>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
(0, 0, 1, 1, 'foobar')
"""
res = cls._archive_filename_re.match(filename)
if res is None:
return (float('-infinity'),)
version = [int(n) for n in res.group('version').decode().split('.')]
if res.group('preversion') is None:
version.append(0)
else:
preversion = res.group('preversion').decode()
if preversion.startswith('-'):
version.append(-1)
version.append(preversion[1:])
elif preversion.startswith('+'):
version.append(1)
version.append(preversion[1:])
else:
assert False, res.group('preversion')
return tuple(version)
def _try_get_ftp_head(self, snapshot):
archive_names = list(snapshot['branches'])
max_archive_name = max(archive_names, key=self._parse_version)
r = self._try_resolve_target(snapshot['branches'], max_archive_name)
return r
# Generic
def _try_get_head_generic(self, snapshot):
# Works on 'deposit', 'svn', and 'pypi'.
try:
if isinstance(snapshot, dict):
branches = snapshot['branches']
except KeyError:
return None
else:
return (
self._try_resolve_target(branches, b'HEAD') or
self._try_resolve_target(branches, b'master')
)
def _try_resolve_target(self, branches, target_name):
try:
target = branches[target_name]
while target['target_type'] == 'alias':
target = branches[target['target']]
if target['target_type'] == 'revision':
return target['target']
elif target['target_type'] == 'content':
return None # TODO
elif target['target_type'] == 'directory':
return None # TODO
elif target['target_type'] == 'release':
return None # TODO
else:
assert False
except KeyError:
return None
@click.command()
@click.option('--origins', '-i',
help='Origins to lookup, in the "type+url" format',
multiple=True)
def main(origins):
rev_metadata_indexer = OriginHeadIndexer()
rev_metadata_indexer.run(origins)
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
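As a side note on _try_get_ftp_head above: it relies on _parse_version to order tarball branch names and takes the maximum. A small sketch of that ordering, with hypothetical filenames:

from swh.indexer.origin_head import OriginHeadIndexer

names = [b'pkg-1.0.tar.gz', b'pkg-2.0-beta1.tar.gz', b'pkg-2.0.tar.gz']
# _parse_version maps these to (1, 0, 0), (2, 0, -1, 'beta1') and (2, 0, 0);
# the pre-release sorts below the final release, so max() picks the 2.0 tarball.
assert max(names, key=OriginHeadIndexer._parse_version) == b'pkg-2.0.tar.gz'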
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
index c60692c..0fea30c 100644
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -1,396 +1,396 @@
# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.model import hashutil
from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes
from swh.storage.db import line_to_bytes, execute_values_to_bytes
class Db(BaseDb):
"""Proxy to the SWH Indexer DB, with wrappers around stored procedures
"""
content_mimetype_hash_keys = ['id', 'indexer_configuration_id']
def _missing_from_list(self, table, data, hash_keys, cur=None):
"""Read from table the data with hash_keys that are missing.
Args:
table (str): Table name (e.g content_mimetype, content_language,
etc...)
data (dict): Dict of data to read from
hash_keys ([str]): List of keys to read in the data dict.
Yields:
The data which is missing from the db.
"""
cur = self._cursor(cur)
keys = ', '.join(hash_keys)
equality = ' AND '.join(
('t.%s = c.%s' % (key, key)) for key in hash_keys
)
yield from execute_values_to_bytes(
cur, """
select %s from (values %%s) as t(%s)
where not exists (
select 1 from %s c
where %s
)
""" % (keys, keys, table, equality),
(tuple(m[k] for k in hash_keys) for m in data)
)
def content_mimetype_missing_from_list(self, mimetypes, cur=None):
"""List missing mimetypes.
"""
yield from self._missing_from_list(
'content_mimetype', mimetypes, self.content_mimetype_hash_keys,
cur=cur)
content_mimetype_cols = [
'id', 'mimetype', 'encoding',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_mimetype')
def mktemp_content_mimetype(self, cur=None): pass
def content_mimetype_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)",
(conflict_update, ))
def _convert_key(self, key, main_table='c'):
"""Convert keys according to specific use in the module.
Args:
key (str): Key expression to change according to the alias
used in the query
main_table (str): Alias to use for the main table. Default
to c for content_{something}.
Expected:
Tables content_{something} being aliased as 'c' (something
in {language, mimetype, ...}), table indexer_configuration
being aliased as 'i'.
"""
if key == 'id':
return '%s.id' % main_table
elif key == 'tool_id':
return 'i.id as tool_id'
elif key == 'licenses':
return '''
array(select name
from fossology_license
where id = ANY(
array_agg(%s.license_id))) as licenses''' % main_table
return key
def _get_from_list(self, table, ids, cols, cur=None, id_col='id'):
"""Fetches entries from the `table` such that their `id` field
(or whatever is given to `id_col`) is in `ids`.
Returns the columns `cols`.
The `cur`sor is used to connect to the database.
"""
cur = self._cursor(cur)
keys = map(self._convert_key, cols)
query = """
select {keys}
from (values %s) as t(id)
inner join {table} c
on c.{id_col}=t.id
inner join indexer_configuration i
on c.indexer_configuration_id=i.id;
""".format(
keys=', '.join(keys),
id_col=id_col,
table=table)
yield from execute_values_to_bytes(
cur, query,
((_id,) for _id in ids)
)
content_indexer_names = {
'mimetype': 'content_mimetype',
'fossology_license': 'content_fossology_license',
}
def content_get_range(self, content_type, start, end,
indexer_configuration_id, limit=1000,
with_textual_data=False, cur=None):
"""Retrieve contents with content_type, within range [start, end]
bound by limit and associated to the given indexer
configuration id.
When asking to work on textual content, this additionally filters on
the content_mimetype table, keeping only text/* mimetypes.
"""
cur = self._cursor(cur)
table = self.content_indexer_names[content_type]
if with_textual_data:
extra = """inner join content_mimetype cm
on (t.id=cm.id and cm.mimetype like 'text/%%')"""
else:
extra = ""
query = """select t.id
from %s t
inner join indexer_configuration ic
on t.indexer_configuration_id=ic.id
%s
where ic.id=%%s and
%%s <= t.id and t.id <= %%s
order by t.indexer_configuration_id, t.id
limit %%s""" % (table, extra)
cur.execute(query, (indexer_configuration_id, start, end, limit))
yield from cursor_to_bytes(cur)
def content_mimetype_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
'content_mimetype', ids, self.content_mimetype_cols, cur=cur)
content_language_hash_keys = ['id', 'indexer_configuration_id']
def content_language_missing_from_list(self, languages, cur=None):
"""List missing languages.
"""
yield from self._missing_from_list(
'content_language', languages, self.content_language_hash_keys,
cur=cur)
content_language_cols = [
'id', 'lang',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_language')
def mktemp_content_language(self, cur=None): pass
def content_language_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_language_add(%s)",
(conflict_update, ))
def content_language_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
'content_language', ids, self.content_language_cols, cur=cur)
content_ctags_hash_keys = ['id', 'indexer_configuration_id']
def content_ctags_missing_from_list(self, ctags, cur=None):
"""List missing ctags.
"""
yield from self._missing_from_list(
'content_ctags', ctags, self.content_ctags_hash_keys,
cur=cur)
content_ctags_cols = [
'id', 'name', 'kind', 'line', 'lang',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_ctags')
def mktemp_content_ctags(self, cur=None): pass
def content_ctags_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)",
(conflict_update, ))
def content_ctags_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
keys = map(self._convert_key, self.content_ctags_cols)
yield from execute_values_to_bytes(
cur, """
select %s
from (values %%s) as t(id)
inner join content_ctags c
on c.id=t.id
inner join indexer_configuration i
on c.indexer_configuration_id=i.id
order by line
""" % ', '.join(keys),
((_id,) for _id in ids)
)
def content_ctags_search(self, expression, last_sha1, limit, cur=None):
cur = self._cursor(cur)
if not last_sha1:
query = """SELECT %s
FROM swh_content_ctags_search(%%s, %%s)""" % (
','.join(self.content_ctags_cols))
cur.execute(query, (expression, limit))
else:
if last_sha1 and isinstance(last_sha1, bytes):
last_sha1 = '\\x%s' % hashutil.hash_to_hex(last_sha1)
elif last_sha1:
last_sha1 = '\\x%s' % last_sha1
query = """SELECT %s
FROM swh_content_ctags_search(%%s, %%s, %%s)""" % (
','.join(self.content_ctags_cols))
cur.execute(query, (expression, limit, last_sha1))
yield from cursor_to_bytes(cur)
content_fossology_license_cols = [
'id', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration',
'licenses']
@stored_procedure('swh_mktemp_content_fossology_license')
def mktemp_content_fossology_license(self, cur=None): pass
def content_fossology_license_add_from_temp(self, conflict_update,
cur=None):
"""Add new licenses per content.
"""
self._cursor(cur).execute(
"SELECT swh_content_fossology_license_add(%s)",
(conflict_update, ))
def content_fossology_license_get_from_list(self, ids, cur=None):
"""Retrieve licenses per id.
"""
cur = self._cursor(cur)
keys = map(self._convert_key, self.content_fossology_license_cols)
yield from execute_values_to_bytes(
cur, """
select %s
from (values %%s) as t(id)
inner join content_fossology_license c on t.id=c.id
inner join indexer_configuration i
on i.id=c.indexer_configuration_id
group by c.id, i.id, i.tool_name, i.tool_version,
i.tool_configuration;
""" % ', '.join(keys),
((_id,) for _id in ids)
)
content_metadata_hash_keys = ['id', 'indexer_configuration_id']
def content_metadata_missing_from_list(self, metadata, cur=None):
"""List missing metadata.
"""
yield from self._missing_from_list(
'content_metadata', metadata, self.content_metadata_hash_keys,
cur=cur)
content_metadata_cols = [
'id', 'translated_metadata',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_metadata')
def mktemp_content_metadata(self, cur=None): pass
def content_metadata_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)",
(conflict_update, ))
def content_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
'content_metadata', ids, self.content_metadata_cols, cur=cur)
revision_metadata_hash_keys = ['id', 'indexer_configuration_id']
def revision_metadata_missing_from_list(self, metadata, cur=None):
"""List missing metadata.
"""
yield from self._missing_from_list(
'revision_metadata', metadata, self.revision_metadata_hash_keys,
cur=cur)
revision_metadata_cols = [
'id', 'translated_metadata',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_revision_metadata')
def mktemp_revision_metadata(self, cur=None): pass
def revision_metadata_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)",
(conflict_update, ))
def revision_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
'revision_metadata', ids, self.revision_metadata_cols, cur=cur)
origin_intrinsic_metadata_cols = [
'origin_id', 'metadata', 'from_revision',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
origin_intrinsic_metadata_regconfig = 'pg_catalog.simple'
"""The dictionary used to normalize 'metadata' and queries.
'pg_catalog.simple' provides no stopwords, so it should be suitable
for proper names and non-English content.
When updating this value, make sure to add a new index on
origin_intrinsic_metadata.metadata."""
@stored_procedure('swh_mktemp_origin_intrinsic_metadata')
def mktemp_origin_intrinsic_metadata(self, cur=None): pass
def origin_intrinsic_metadata_add_from_temp(
self, conflict_update, cur=None):
cur = self._cursor(cur)
cur.execute(
"SELECT swh_origin_intrinsic_metadata_add(%s)",
(conflict_update, ))
def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None):
yield from self._get_from_list(
'origin_intrinsic_metadata', orig_ids,
self.origin_intrinsic_metadata_cols, cur=cur,
id_col='origin_id')
def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit,
cur=None):
regconfig = self.origin_intrinsic_metadata_regconfig
tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig
for _ in terms)
tsquery_args = [(term,) for term in terms]
keys = map(self._convert_key, self.origin_intrinsic_metadata_cols)
query = ("SELECT {keys} FROM origin_intrinsic_metadata AS oim "
"INNER JOIN indexer_configuration AS i "
"ON oim.indexer_configuration_id=i.id "
"JOIN LATERAL (SELECT {tsquery_template}) AS s(tsq) ON true "
"WHERE to_tsvector('{regconfig}', metadata) @@ tsq "
"ORDER BY ts_rank(oim.metadata_tsvector, tsq, 1) DESC "
"LIMIT %s;"
).format(keys=', '.join(keys),
regconfig=regconfig,
tsquery_template=tsquery_template)
cur.execute(query, tsquery_args + [limit])
- yield from cur
+ yield from cursor_to_bytes(cur)
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']
@stored_procedure('swh_mktemp_indexer_configuration')
def mktemp_indexer_configuration(self, cur=None):
pass
def indexer_configuration_add_from_temp(self, cur=None):
cur = self._cursor(cur)
cur.execute("SELECT %s from swh_indexer_configuration_add()" % (
','.join(self.indexer_configuration_cols), ))
yield from cursor_to_bytes(cur)
def indexer_configuration_get(self, tool_name,
tool_version, tool_configuration, cur=None):
cur = self._cursor(cur)
cur.execute('''select %s
from indexer_configuration
where tool_name=%%s and
tool_version=%%s and
tool_configuration=%%s''' % (
','.join(self.indexer_configuration_cols)),
(tool_name, tool_version, tool_configuration))
data = cur.fetchone()
if not data:
return None
return line_to_bytes(data)
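For reference, _missing_from_list builds the same anti-join for every table; assuming the content_metadata case, the string formatting alone produces the query below (a sketch only, the actual values are bound separately by execute_values_to_bytes):

table = 'content_metadata'
hash_keys = ['id', 'indexer_configuration_id']
keys = ', '.join(hash_keys)
equality = ' AND '.join('t.%s = c.%s' % (key, key) for key in hash_keys)
query = """
    select %s from (values %%s) as t(%s)
    where not exists (
        select 1 from %s c
        where %s
    )
""" % (keys, keys, table, equality)
print(query)
# After substitution the statement reads:
#     select id, indexer_configuration_id from (values %s)
#         as t(id, indexer_configuration_id)
#     where not exists (
#         select 1 from content_metadata c
#         where t.id = c.id AND t.indexer_configuration_id = c.indexer_configuration_id
#     )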
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index f36cd1d..174c73c 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,478 +1,480 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
import logging
from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.metadata import ContentMetadataIndexer
from swh.indexer.metadata import RevisionMetadataIndexer
from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
from swh.indexer.tests.test_utils import MockIndexerStorage
+from swh.model.hashutil import hash_to_bytes
+
class ContentMetadataTestIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def prepare(self):
self.idx_storage = MockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
self.results = []
class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
ContentMetadataIndexer = ContentMetadataTestIndexer
def prepare(self):
self.config = {
'storage': {
'cls': 'remote',
'args': {
'url': 'http://localhost:9999',
}
},
'tools': {
'name': 'swh-metadata-detector',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': 'NpmMapping'
}
}
}
self.storage = MockStorage()
self.idx_storage = MockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
self.results = []
class Metadata(unittest.TestCase):
"""
Tests metadata_mock_tool tool for Metadata detection
"""
def setUp(self):
"""
shows the entire diff in the results
"""
self.maxDiff = None
self.content_tool = {
'name': 'swh-metadata-translator',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': 'NpmMapping'
}
}
MockIndexerStorage.added_data = []
def test_crosstable(self):
self.assertEqual(CROSSWALK_TABLE['NodeJS'], {
'repository': 'http://schema.org/codeRepository',
'os': 'http://schema.org/operatingSystem',
'cpu': 'http://schema.org/processorRequirements',
'engines':
'http://schema.org/processorRequirements',
'author': 'http://schema.org/author',
'author.email': 'http://schema.org/email',
'author.name': 'http://schema.org/name',
'contributor': 'http://schema.org/contributor',
'keywords': 'http://schema.org/keywords',
'license': 'http://schema.org/license',
'version': 'http://schema.org/version',
'description': 'http://schema.org/description',
'name': 'http://schema.org/name',
'bugs': 'https://codemeta.github.io/terms/issueTracker',
'homepage': 'http://schema.org/url'
})
def test_compute_metadata_none(self):
"""
Translating empty content should return None.
"""
# given
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
# when
result = MAPPINGS["NpmMapping"].translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_compute_metadata_npm(self):
"""
testing only computation of metadata with hard_mapping_npm
"""
# given
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
declared_metadata = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'test_metadata',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test',
'schema:author': {
'type': 'Person',
'name': 'Morane G',
'email': 'moranegg@example.com',
},
}
# when
result = MAPPINGS["NpmMapping"].translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_extract_minimal_metadata_dict(self):
"""
Test the creation of a coherent minimal metadata set
"""
# given
metadata_list = [{
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'test_1',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test',
}, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'test_0_1',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test'
}, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'test_metadata',
'version': '0.0.2',
'schema:author': 'moranegg',
}]
# when
results = extract_minimal_metadata_dict(metadata_list)
# then
expected_results = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
"version": '0.0.2',
"description": 'Simple package.json test for indexer',
"name": ['test_1', 'test_0_1', 'test_metadata'],
"schema:author": 'moranegg',
"schema:codeRepository":
'git+https://github.com/moranegg/metadata_test',
}
self.assertEqual(expected_results, results)
def test_index_content_metadata_npm(self):
"""
testing NPM with package.json
- one sha1 uses a file that can't be translated to metadata and
should return None in the translated metadata
"""
# given
sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
'd4c647f0fc257591cc9ba1722484229780d1c607',
'02fb2c89e14f7fab46701478c83779c7beb7b069']
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
metadata_indexer = ContentMetadataTestIndexer(
tool=self.content_tool, config={})
# when
metadata_indexer.run(sha1s, policy_update='ignore-dups')
results = metadata_indexer.idx_storage.added_data
expected_results = [('content_metadata', False, [{
'indexer_configuration_id': 30,
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test',
'description': 'Simple package.json test for indexer',
'name': 'test_metadata',
'version': '0.0.1'
},
'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
}, {
'indexer_configuration_id': 30,
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'codemeta:issueTracker':
'https://github.com/npm/npm/issues',
'schema:author': {
'type': 'Person',
'name': 'Isaac Z. Schlueter',
'email': 'i@izs.me',
'schema:url': 'http://blog.izs.me',
},
'schema:codeRepository':
'git+https://github.com/npm/npm',
'description': 'a package manager for JavaScript',
'schema:license': 'Artistic-2.0',
'version': '5.0.3',
'name': 'npm',
'keywords': [
'install',
'modules',
'package manager',
'package.json'
],
'schema:url': 'https://docs.npmjs.com/'
},
'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
}, {
'indexer_configuration_id': 30,
'translated_metadata': None,
'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
}])]
# The assertion below returns False sometimes because of nested lists
self.assertEqual(expected_results, results)
def test_detect_metadata_package_json(self):
# given
df = [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'cde'
}]
# when
results = detect_metadata(df)
expected_results = {
'NpmMapping': [
b'cde'
]
}
# then
self.assertEqual(expected_results, results)
def test_compute_metadata_valid_codemeta(self):
raw_content = (
b"""{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
{
"@type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"@id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"@id": "https://doi.org/10.13039/100000001",
"@type": "Organization",
"name": "National Science Foundation"
},
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version":"2.0",
"dateCreated":"2017-06-05",
"datePublished":"2017-06-05",
"programmingLanguage": "JSON-LD"
}""") # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description":
"CodeMeta is a concept vocabulary that can "
"be used to standardize the exchange of software metadata "
"across repositories and organizations.",
"name":
"CodeMeta: Minimal metadata schemas for science "
"software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X"
},
{
"type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl":
"https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"id": "https://doi.org/10.13039/100000001",
"type": "Organization",
"name": "National Science Foundation"
},
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
"in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version": "2.0",
"dateCreated": "2017-06-05",
"datePublished": "2017-06-05",
"programmingLanguage": "JSON-LD"
}
result = MAPPINGS["CodemetaMapping"].translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_maven(self):
raw_content = b"""
<project>
<name>Maven Default Project</name>
<modelVersion>4.0.0</modelVersion>
<groupId>com.mycompany.app</groupId>
<artifactId>my-app</artifactId>
<version>1.2.3</version>
<repositories>
<repository>
<id>central</id>
<name>Maven Repository Switchboard</name>
<layout>default</layout>
<url>http://repo1.maven.org/maven2</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
</project>"""
result = MAPPINGS["MavenMapping"].translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'Maven Default Project',
'schema:identifier': 'com.mycompany.app',
'version': '1.2.3',
'schema:codeRepository':
'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
})
def test_revision_metadata_indexer(self):
metadata_indexer = RevisionMetadataTestIndexer()
sha1_gits = [
- b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
]
metadata_indexer.run(sha1_gits, 'update-dups')
results = metadata_indexer.idx_storage.added_data
expected_results = [('revision_metadata', True, [{
- 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'schema:author': 'Andrew Nesbitt',
'license': 'AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
},
'indexer_configuration_id': 7
}])]
# then
self.assertEqual(expected_results, results)
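The mappings exercised by these tests are usable on their own; a minimal sketch mirroring test_compute_metadata_npm, with a trimmed, hypothetical package.json (assuming the mapping handles it like the fuller examples above):

from swh.indexer.metadata_dictionary import MAPPINGS

raw_content = b'{"name": "test_metadata", "version": "0.0.2"}'
result = MAPPINGS['NpmMapping'].translate(raw_content)
# translate() returns a CodeMeta dict, or None when nothing could be parsed
assert result['name'] == 'test_metadata'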
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 510ae1a..2b651cc 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,122 +1,125 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import time
import logging
import unittest
from celery import task
from swh.indexer.metadata import OriginMetadataIndexer
from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
from swh.indexer.tests.test_utils import MockIndexerStorage
from swh.indexer.tests.test_origin_head import OriginHeadTestIndexer
from swh.indexer.tests.test_metadata import RevisionMetadataTestIndexer
from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture
+from swh.model.hashutil import hash_to_bytes
+
class OriginMetadataTestIndexer(OriginMetadataIndexer):
def prepare(self):
self.config = {
'storage': {
'cls': 'remote',
'args': {
'url': 'http://localhost:9999',
}
},
'tools': [],
}
self.storage = MockStorage()
self.idx_storage = MockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
self.results = []
@task
def revision_metadata_test_task(*args, **kwargs):
indexer = RevisionMetadataTestIndexer()
indexer.run(*args, **kwargs)
return indexer.results
@task
def origin_intrinsic_metadata_test_task(*args, **kwargs):
indexer = OriginMetadataTestIndexer()
indexer.run(*args, **kwargs)
return indexer.results
class OriginHeadTestIndexer(OriginHeadTestIndexer):
def prepare(self):
super().prepare()
self.config['tasks'] = {
'revision_metadata': 'revision_metadata_test_task',
'origin_intrinsic_metadata': 'origin_intrinsic_metadata_test_task',
}
class TestOriginMetadata(SchedulerTestFixture, unittest.TestCase):
def setUp(self):
super().setUp()
self.maxDiff = None
MockIndexerStorage.added_data = []
self.add_scheduler_task_type(
'revision_metadata_test_task',
'swh.indexer.tests.test_origin_metadata.'
'revision_metadata_test_task')
self.add_scheduler_task_type(
'origin_intrinsic_metadata_test_task',
'swh.indexer.tests.test_origin_metadata.'
'origin_intrinsic_metadata_test_task')
RevisionMetadataTestIndexer.scheduler = self.scheduler
def tearDown(self):
del RevisionMetadataTestIndexer.scheduler
super().tearDown()
def test_pipeline(self):
indexer = OriginHeadTestIndexer()
indexer.scheduler = self.scheduler
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
self.run_ready_tasks() # Run the first task
time.sleep(0.1) # Give it time to complete and schedule the 2nd one
self.run_ready_tasks() # Run the second task
metadata = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'schema:author': 'Andrew Nesbitt',
'license': 'AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
}
rev_metadata = {
- 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'translated_metadata': metadata,
'indexer_configuration_id': 7,
}
origin_metadata = {
'origin_id': 54974445,
- 'from_revision': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'from_revision': hash_to_bytes(
+ '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'metadata': metadata,
'indexer_configuration_id': 7,
}
expected_results = [
('origin_intrinsic_metadata', True, [origin_metadata]),
('revision_metadata', True, [rev_metadata])]
results = list(indexer.idx_storage.added_data)
self.assertCountEqual(expected_results, results)
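The hand-off tested here is the one introduced above: OriginHeadIndexer.next_step ships {str(origin_id): hex_rev_id} in the task payload, and OriginMetadataIndexer.run converts it back to bytes before querying the indexer storage. A minimal sketch of that conversion, reusing the ids from this test:

from swh.model.hashutil import hash_to_bytes

origin_head = {'54974445': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'}
origin_head_map = {int(origin_id): hash_to_bytes(rev_id)
                   for origin_id, rev_id in origin_head.items()}

assert list(origin_head_map) == [54974445]
assert origin_head_map[54974445] == hash_to_bytes(
    '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')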
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
index da1f11d..9ccefc6 100644
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -1,728 +1,732 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
+from swh.model.hashutil import hash_to_bytes
ORIGINS = [
{
'id': 52189575,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/SoftwareHeritage/swh-storage'},
{
'id': 4423668,
'lister': None,
'project': None,
'type': 'ftp',
'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
{
'id': 77775770,
'lister': None,
'project': None,
'type': 'deposit',
'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
{
'id': 85072327,
'lister': None,
'project': None,
'type': 'pypi',
'url': 'https://pypi.org/project/limnoria/'},
{
'id': 49908349,
'lister': None,
'project': None,
'type': 'svn',
'url': 'http://0-512-md.googlecode.com/svn/'},
{
'id': 54974445,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'},
]
SNAPSHOTS = {
52189575: {
'branches': {
b'refs/heads/add-revision-origin-cache': {
'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
b's\xe7/\xe9l\x1e',
'target_type': 'revision'},
b'HEAD': {
'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
b'\xac\xefrm',
'target_type': 'revision'},
b'refs/tags/v0.0.103': {
'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
b'\x0f\xdd',
'target_type': 'release'},
}},
4423668: {
'branches': {
b'3DLDF-1.1.4.tar.gz': {
'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
b'"G\x99\x11',
'target_type': 'revision'},
b'3DLDF-2.0.2.tar.gz': {
'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
'target_type': 'revision'},
b'3DLDF-2.0.3-examples.tar.gz': {
'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
b'\xfe\xadZ\x80\x80\xc1\x83\xff',
'target_type': 'revision'},
b'3DLDF-2.0.3.tar.gz': {
'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
b'\xcc\x1a\xb4`\x8c\x8by',
'target_type': 'revision'},
b'3DLDF-2.0.tar.gz': {
'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
b'\xd3\xd1m',
'target_type': 'revision'}
}},
77775770: {
'branches': {
b'master': {
'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
b'\xa6\xe9\x99\xb1\x9e]q\xeb',
'target_type': 'revision'}
},
'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
b"\x1d\r "},
85072327: {
'branches': {
b'HEAD': {
'target': b'releases/2018.09.09',
'target_type': 'alias'},
b'releases/2018.09.01': {
'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
b'\xbb\xdfF\xfdw\xcf',
'target_type': 'revision'},
b'releases/2018.09.09': {
'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
b'A\x10\x9d\xc5\xfa2\xf8t',
'target_type': 'revision'}},
'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
b'\x12\x9e\xd6\xb3'},
49908349: {
'branches': {
b'master': {
'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
b'\xc9\xad#.\x1bw=\x18',
'target_type': 'revision'}},
'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
b'\x05\xea\xb8\x1f\xc4H\xf4s'},
54974445: {
'branches': {
b'HEAD': {
- 'target': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'target': hash_to_bytes(
+ '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'target_type': 'revision'}}}
}
SHA1_TO_LICENSES = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
'02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
'103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
'688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
}
SHA1_TO_CTAGS = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{
'name': 'foo',
'kind': 'str',
'line': 10,
'lang': 'bar',
}],
'd4c647f0fc257591cc9ba1722484229780d1c607': [{
'name': 'let',
'kind': 'int',
'line': 100,
'lang': 'haskell',
}],
'688a5ef812c53907562fe379d4b3851e69c7cb15': [{
'name': 'symbol',
'kind': 'float',
'line': 99,
'lang': 'python',
}],
}
class MockObjStorage:
"""Mock an swh-objstorage objstorage with predefined contents.
"""
data = {}
def __init__(self):
self.data = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text',
'688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text',
'8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text',
'02fb2c89e14f7fab46701478c83779c7beb7b069': b"""
import unittest
import logging
from swh.indexer.mimetype import ContentMimetypeIndexer
from swh.indexer.tests.test_utils import MockObjStorage
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
""",
'103bc087db1d26afc3a0283f38663d081e9b01e6': b"""
#ifndef __AVL__
#define __AVL__
typedef struct _avl_tree avl_tree;
typedef struct _data_t {
int content;
} data_t;
""",
'93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
(should 'pygments (recognize 'lisp 'easily))
""",
'26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
""",
'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
{
"version": "5.0.3",
"name": "npm",
"description": "a package manager for JavaScript",
"keywords": [
"install",
"modules",
"package manager",
"package.json"
],
"preferGlobal": true,
"config": {
"publishtest": false
},
"homepage": "https://docs.npmjs.com/",
"author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
"repository": {
"type": "git",
"url": "https://github.com/npm/npm"
},
"bugs": {
"url": "https://github.com/npm/npm/issues"
},
"dependencies": {
"JSONStream": "~1.3.1",
"abbrev": "~1.1.0",
"ansi-regex": "~2.1.1",
"ansicolors": "~0.3.2",
"ansistyles": "~0.1.3"
},
"devDependencies": {
"tacks": "~1.2.6",
"tap": "~10.3.2"
},
"license": "Artistic-2.0"
}
""",
'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
""",
'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
}
def __iter__(self):
yield from self.data.keys()
def __contains__(self, sha1):
return self.data.get(sha1) is not None
def get(self, sha1):
raw_content = self.data.get(sha1)
if raw_content is None:
raise ObjNotFoundError(sha1)
return raw_content
class MockIndexerStorage():
"""Mock an swh-indexer storage.
"""
added_data = []
revision_metadata = {}
def indexer_configuration_add(self, tools):
results = []
for tool in tools:
results.append(self._indexer_configuration_add_one(tool))
return results
def _indexer_configuration_add_one(self, tool):
if tool['tool_name'] == 'swh-metadata-translator':
return {
'id': 30,
'tool_name': 'swh-metadata-translator',
'tool_version': '0.0.1',
'tool_configuration': {
'type': 'local',
'context': 'NpmMapping'
},
}
elif tool['tool_name'] == 'swh-metadata-detector':
return {
'id': 7,
'tool_name': 'swh-metadata-detector',
'tool_version': '0.0.1',
'tool_configuration': {
'type': 'local',
'context': 'NpmMapping'
},
}
elif tool['tool_name'] == 'origin-metadata':
return {
'id': 8,
'tool_name': 'origin-metadata',
'tool_version': '0.0.1',
'tool_configuration': {},
}
else:
assert False, 'Unknown tool {tool_name}'.format(**tool)
def content_metadata_missing(self, sha1s):
yield from []
def content_metadata_add(self, metadata, conflict_update=None):
self.added_data.append(
('content_metadata', conflict_update, metadata))
def revision_metadata_add(self, metadata, conflict_update=None):
assert conflict_update
self.added_data.append(
('revision_metadata', conflict_update, metadata))
for item in metadata:
+ assert isinstance(item['id'], bytes)
self.revision_metadata.setdefault(item['id'], []).append(item)
def revision_metadata_get(self, ids):
for id_ in ids:
+ assert isinstance(id_, bytes)
yield from self.revision_metadata.get(id_)
def origin_intrinsic_metadata_add(self, metadata, conflict_update=None):
self.added_data.append(
('origin_intrinsic_metadata', conflict_update, metadata))
def content_metadata_get(self, sha1s):
return [{
'tool': {
'configuration': {
'type': 'local',
'context': 'NpmMapping'
},
'version': '0.0.1',
'id': 6,
'name': 'swh-metadata-translator'
},
'id': b'cde',
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'version': '1.0.0',
'name': 'yarn-parser',
'schema:author': 'Andrew Nesbitt',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'processorRequirements': {'node': '7.5'},
'license': 'AGPL-3.0',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'description':
'Tiny web service for parsing yarn.lock files',
}
}]
class MockStorage():
"""Mock a real swh-storage storage to simplify reading indexers'
outputs.
"""
def origin_get(self, id_):
for origin in ORIGINS:
for (k, v) in id_.items():
if origin[k] != v:
break
else:
# This block is run iff we didn't break, ie. if all supplied
# parts of the id are set to the expected value.
return origin
assert False, id_
def snapshot_get_latest(self, origin_id):
if origin_id in SNAPSHOTS:
return SNAPSHOTS[origin_id]
else:
assert False, origin_id
def revision_get(self, revisions):
return [{
- 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'committer': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
'email': b'andrewnez@gmail.com'
},
'synthetic': False,
'date': {
'negative_utc': False,
'timestamp': {
'seconds': 1487596456,
'microseconds': 0
},
'offset': 0
},
'directory': b'10'
}]
def directory_ls(self, directory, recursive=False, cur=None):
# with directory: b'\x9d',
return [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'10',
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'10',
'sha1': b'cde'
},
{
'dir_id': b'10',
'target': b'11',
'type': 'dir',
'length': None,
'name': b'.github',
'sha1': None,
'perms': 16384,
'sha1_git': None,
'status': None,
'sha256': None
}]
class BasicMockStorage():
"""In memory implementation to fake the content_get_range api.
FIXME: To remove when the actual in-memory lands.
"""
contents = []
def __init__(self, contents):
self.contents = contents
def content_get_range(self, start, end, limit=1000):
# to make input test data consistent with the actual runtime; the
# proper way of doing things would be to rewrite all
# tests (that's another task entirely, so not right now)
if isinstance(start, bytes):
start = hashutil.hash_to_hex(start)
if isinstance(end, bytes):
end = hashutil.hash_to_hex(end)
results = []
_next_id = None
counter = 0
for c in self.contents:
_id = c['sha1']
if start <= _id and _id <= end:
results.append(c)
if counter >= limit:
break
counter += 1
return {
'contents': results,
'next': _next_id
}
class BasicMockIndexerStorage():
"""Mock Indexer storage to simplify reading indexers' outputs.
"""
state = []
def _internal_add(self, data, conflict_update=None):
"""All content indexer have the same structure. So reuse `data` as the
same data. It's either mimetype, language,
fossology_license, etc...
"""
self.state = data
self.conflict_update = conflict_update
def content_mimetype_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def content_fossology_license_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def content_language_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def content_ctags_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def _internal_get_range(self, start, end,
indexer_configuration_id, limit=1000):
"""Same logic as _internal_add, we retrieve indexed data given an
identifier. So the code here does not change even though
the underlying data does.
"""
# to make input test data consistent with the actual runtime; the
# proper way of doing things would be to rewrite all
# tests (that's another task entirely, so not right now)
if isinstance(start, bytes):
start = hashutil.hash_to_hex(start)
if isinstance(end, bytes):
end = hashutil.hash_to_hex(end)
results = []
_next = None
counter = 0
for m in self.state:
_id = m['id']
_tool_id = m['indexer_configuration_id']
if (start <= _id and _id <= end and
_tool_id == indexer_configuration_id):
results.append(_id)
if counter >= limit:
break
counter += 1
return {
'ids': results,
'next': _next
}
def content_mimetype_get_range(
self, start, end, indexer_configuration_id, limit=1000):
return self._internal_get_range(
start, end, indexer_configuration_id, limit=limit)
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id, limit=1000):
return self._internal_get_range(
start, end, indexer_configuration_id, limit=limit)
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
class CommonIndexerNoTool:
"""Mixin to wronly initialize content indexer"""
def prepare(self):
super().prepare()
self.tools = None
class CommonIndexerWithErrorsTest:
"""Test indexer configuration checks.
"""
Indexer = None
RangeIndexer = None
def test_wrong_unknown_configuration_tool(self):
"""Indexer with unknown configuration tool fails check"""
with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
print('indexer: %s' % self.Indexer)
self.Indexer()
def test_wrong_unknown_configuration_tool_range(self):
"""Range Indexer with unknown configuration tool fails check"""
if self.RangeIndexer is not None:
with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
self.RangeIndexer()
class CommonContentIndexerTest:
def assert_results_ok(self, actual_results, expected_results=None):
if expected_results is None:
expected_results = self.expected_results
for indexed_data in actual_results:
_id = indexed_data['id']
self.assertEqual(indexed_data, expected_results[_id])
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
def test_index(self):
"""Known sha1 have their data indexed
"""
sha1s = [self.id0, self.id1, self.id2]
# when
self.indexer.run(sha1s, policy_update='update-dups')
actual_results = self.indexer.idx_storage.state
self.assertTrue(self.indexer.idx_storage.conflict_update)
self.assert_results_ok(actual_results)
# 2nd pass
self.indexer.run(sha1s, policy_update='ignore-dups')
self.assertFalse(self.indexer.idx_storage.conflict_update)
self.assert_results_ok(actual_results)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
sha1s = [self.id1,
'799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
'800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
# when
self.indexer.run(sha1s, policy_update='update-dups')
actual_results = self.indexer.idx_storage.state
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
self.assert_results_ok(actual_results, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
for indexed_data in actual_results:
_id = indexed_data['id']
self.assertEqual(indexed_data, expected_results[_id])
self.assertTrue(start <= _id and _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
def test__index_contents(self):
"""Indexing contents without existing data results in indexed data
"""
start, end = [self.contents[0], self.contents[2]] # output hex ids
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
self.assert_results_ok(start, end, actual_results)
def test__index_contents_with_indexed_data(self):
"""Indexing contents with existing data results in less indexed data
"""
start, end = [self.contents[0], self.contents[2]] # output hex ids
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
start, end, indexed=set(data_indexed))
# craft the expected results
expected_results = self.expected_results.copy()
for already_indexed_key in data_indexed:
expected_results.pop(already_indexed_key)
self.assert_results_ok(
start, end, actual_results, expected_results)
def test_generate_content_get(self):
"""Optimal indexing should result in indexed data
"""
start, end = [self.contents[0], self.contents[2]] # output hex ids
# given
actual_results = self.indexer.run(start, end)
# then
self.assertTrue(actual_results)
def test_generate_content_get_input_as_bytes(self):
"""Optimal indexing should result in indexed data
Input are in bytes here.
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run( # checks the bytes input this time
start, end, skip_existing=False) # no data so same result
# then
self.assertTrue(actual_results)
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
start, end = ['0000000000000000000000000000000000000000',
'0000000000000000000000000000000000000001']
# given
actual_results = self.indexer.run(
start, end, incremental=False)
# then
self.assertFalse(actual_results)
class NoDiskIndexer:
"""Mixin to override the DiskIndexer behavior avoiding side-effects in
tests.
"""
def write_to_temp(self, filename, data): # noop
return filename
def cleanup(self, content_path): # noop
return None
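MockObjStorage mimics just enough of the objstorage API for the content indexers; a minimal usage sketch based on the fixture data above:

from swh.objstorage.exc import ObjNotFoundError
from swh.indexer.tests.test_utils import MockObjStorage

objstorage = MockObjStorage()
# known ids return their raw content
assert objstorage.get('01c9379dfc33803963d07c1ccc748d3fe4c96bb5') == b'this is some text'
try:
    objstorage.get('0000000000000000000000000000000000000000')
except ObjNotFoundError:
    pass  # unknown ids raise, like the real objstorage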
