Page Menu · Home · Software Heritage

No OneTemporary

diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
index d4ea785..0aed233 100644
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -1,845 +1,848 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import bisect
from collections import defaultdict, Counter
import itertools
import json
import operator
import math
import re
from . import MAPPING_NAMES
# NOTE(review): 160 is the SHA1 digest size in *bits* (20 bytes).
# content_ctags_search uses this value as a *byte* count when building its
# all-zero sentinel; that only works because the extra zero bytes compare
# the same — confirm the intended unit.
SHA1_DIGEST_SIZE = 160
def _transform_tool(tool):
return {
'id': tool['id'],
'name': tool['tool_name'],
'version': tool['tool_version'],
'configuration': tool['tool_configuration'],
}
class SubStorage:
    """Implements common missing/get/add logic for each indexer type."""
    def __init__(self, tools):
        self._tools = tools  # shared tool registry: tool_id -> tool dict
        self._sorted_ids = []  # every known id, kept sorted for get_range
        self._data = {}  # map (id_, tool_id) -> metadata_dict
        self._tools_per_id = defaultdict(set)  # map id_ -> Set[tool_id]

    def missing(self, ids):
        """List data missing from storage.

        Args:
            ids (iterable): dictionaries with keys:

              - **id** (bytes): sha1 identifier
              - **indexer_configuration_id** (int): tool used to compute
                the results

        Yields:
            missing sha1s

        """
        for item in ids:
            tool_id = item['indexer_configuration_id']
            id_ = item['id']
            # .get() rather than [] so the defaultdict is not polluted
            # with empty entries for unknown ids
            if tool_id not in self._tools_per_id.get(id_, set()):
                yield id_

    def get(self, ids):
        """Retrieve data per id.

        Args:
            ids (iterable): sha1 checksums

        Yields:
            dict: dictionaries with the following keys:

              - **id** (bytes)
              - **tool** (dict): tool used to compute metadata
              - arbitrary data (as provided to `add`)

        """
        for id_ in ids:
            for tool_id in self._tools_per_id.get(id_, set()):
                key = (id_, tool_id)
                yield {
                    'id': id_,
                    'tool': _transform_tool(self._tools[tool_id]),
                    **self._data[key],
                }

    def get_all(self):
        """Yield every stored item, in id order."""
        yield from self.get(self._sorted_ids)

    def get_range(self, start, end, indexer_configuration_id, limit):
        """Retrieve data within range [start, end] bound by limit.

        Args:
            **start** (bytes): Starting identifier range (expected smaller
                than end)
            **end** (bytes): Ending identifier range (expected larger
                than start)
            **indexer_configuration_id** (int): The tool used to index data
            **limit** (int): Limit result

        Raises:
            ValueError for limit to None

        Returns:
            a dict with keys:

            - **ids** [bytes]: iterable of content ids within the range.
            - **next** (Optional[bytes]): The next range of sha1 starts at
              this sha1 if any

        """
        if limit is None:
            raise ValueError('Development error: limit should not be None')
        # NOTE(review): indexer_configuration_id is not used to filter the
        # ids — every tool's ids are returned. Confirm this matches the
        # postgresql backend.
        from_index = bisect.bisect_left(self._sorted_ids, start)
        to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index)
        # Strict '>' here: with '>=', a range holding exactly `limit` ids
        # either raised IndexError (range ends at the tail of _sorted_ids)
        # or returned a 'next' pointing beyond `end`.
        if to_index - from_index > limit:
            return {
                'ids': self._sorted_ids[from_index:from_index + limit],
                'next': self._sorted_ids[from_index + limit],
            }
        return {
            'ids': self._sorted_ids[from_index:to_index],
            'next': None,
        }

    def add(self, data, conflict_update):
        """Add data not present in storage.

        Args:
            data (iterable): dictionaries with keys:

              - **id**: sha1
              - **indexer_configuration_id**: tool used to compute the
                results
              - arbitrary data

            conflict_update (bool): Flag to determine if we want to
                overwrite (true) or skip duplicates (false)

        Raises:
            ValueError: if the same id appears more than once in `data`

        """
        data = list(data)
        if len({x['id'] for x in data}) < len(data):
            # For "exception-compatibility" with the pgsql backend
            raise ValueError('The same id is present more than once.')
        for item in data:
            item = item.copy()
            tool_id = item.pop('indexer_configuration_id')
            id_ = item.pop('id')
            if not conflict_update and \
                    tool_id in self._tools_per_id.get(id_, set()):
                # Duplicate, should not be updated
                continue
            # (the original rebound the loop-external name `data` here;
            # store the per-item dict under its own name instead)
            self._data[(id_, tool_id)] = item
            self._tools_per_id[id_].add(tool_id)
            if id_ not in self._sorted_ids:
                bisect.insort(self._sorted_ids, id_)

    def add_merge(self, new_data, conflict_update, merged_key):
        """Add data, merging the list stored under `merged_key` with any
        existing list for the same (id, tool) unless conflict_update.

        Args:
            new_data (iterable): dictionaries with **id**,
                **indexer_configuration_id**, and a list under `merged_key`
            conflict_update (bool): if true, replace the stored list
                instead of merging into it
            merged_key (str): name of the list field to merge

        """
        for new_item in new_data:
            id_ = new_item['id']
            tool_id = new_item['indexer_configuration_id']
            if conflict_update:
                all_subitems = []
            else:
                existing = list(self.get([id_]))
                all_subitems = [
                    old_subitem
                    for existing_item in existing
                    if existing_item['tool']['id'] == tool_id
                    for old_subitem in existing_item[merged_key]
                ]
            for new_subitem in new_item[merged_key]:
                if new_subitem not in all_subitems:
                    all_subitems.append(new_subitem)
            # self.add maintains _sorted_ids itself, so no extra insort
            # is needed here (the original repeated it redundantly).
            self.add([
                {
                    'id': id_,
                    'indexer_configuration_id': tool_id,
                    merged_key: all_subitems,
                }
            ], conflict_update=True)

    def delete(self, entries):
        """Delete entries.

        Args:
            entries (iterable): dictionaries with keys **id** and
                **indexer_configuration_id**

        """
        for entry in entries:
            id_ = entry['id']
            tool_id = entry['indexer_configuration_id']
            self._tools_per_id[id_].discard(tool_id)
            self._data.pop((id_, tool_id), None)
class IndexerStorage:
"""In-memory SWH indexer storage."""
def __init__(self):
self._tools = {}
self._mimetypes = SubStorage(self._tools)
self._languages = SubStorage(self._tools)
self._content_ctags = SubStorage(self._tools)
self._licenses = SubStorage(self._tools)
self._content_metadata = SubStorage(self._tools)
self._revision_intrinsic_metadata = SubStorage(self._tools)
self._origin_intrinsic_metadata = SubStorage(self._tools)
def check_config(self, *, check_write):
    """Check that the storage is configured and ready.

    Args:
        check_write (bool): whether write access is required; ignored
            here, the in-memory backend is always writable.

    Returns:
        True (the in-memory backend needs no configuration)
    """
    return True
def content_mimetype_missing(self, mimetypes):
    """Generate mimetypes missing from storage.

    Args:
        mimetypes (iterable): iterable of dict with keys:

          - **id** (bytes): sha1 identifier
          - **indexer_configuration_id** (int): tool used to compute the
            results

    Yields:
        tuple (id, indexer_configuration_id): missing id

    """
    # Delegate to the mimetype sub-storage's generic missing() logic.
    for missing_id in self._mimetypes.missing(mimetypes):
        yield missing_id
def content_mimetype_get_range(
self, start, end, indexer_configuration_id, limit=1000):
"""Retrieve mimetypes within range [start, end] bound by limit.
Args:
**start** (bytes): Starting identifier range (expected smaller
than end)
**end** (bytes): Ending identifier range (expected larger
than start)
**indexer_configuration_id** (int): The tool used to index data
**limit** (int): Limit result (default to 1000)
Raises:
ValueError for limit to None
Returns:
a dict with keys:
- **ids** [bytes]: iterable of content ids within the range.
- **next** (Optional[bytes]): The next range of sha1 starts at
this sha1 if any
"""
return self._mimetypes.get_range(
start, end, indexer_configuration_id, limit)
def content_mimetype_add(self, mimetypes, conflict_update=False):
    """Add mimetypes not present in storage.

    Args:
        mimetypes (iterable): dictionaries with keys:

          - **id** (bytes): sha1 identifier
          - **mimetype** (bytes): raw content's mimetype
          - **encoding** (bytes): raw content's encoding
          - **indexer_configuration_id** (int): tool's id used to
            compute the results

        conflict_update (bool): Flag to determine if we want to
            overwrite (``True``) or skip duplicates (``False``, the
            default)

    Raises:
        TypeError: if any identifier is not bytes

    """
    # Materialize first: otherwise the validation pass below would
    # exhaust a generator argument before the actual insertion.
    mimetypes = list(mimetypes)
    if not all(isinstance(x['id'], bytes) for x in mimetypes):
        raise TypeError('identifiers must be bytes.')
    self._mimetypes.add(mimetypes, conflict_update)
def content_mimetype_get(self, ids, db=None, cur=None):
    """Retrieve full content mimetype per ids.

    Args:
        ids (iterable): sha1 identifier
        db: unused; kept for signature compatibility with the
            postgresql backend
        cur: unused, likewise

    Yields:
        mimetypes (iterable): dictionaries with keys:

          - **id** (bytes): sha1 identifier
          - **mimetype** (bytes): raw content's mimetype
          - **encoding** (bytes): raw content's encoding
          - **tool** (dict): Tool used to compute the mimetype

    """
    yield from self._mimetypes.get(ids)
def content_language_missing(self, languages):
"""List languages missing from storage.
Args:
languages (iterable): dictionaries with keys:
- **id** (bytes): sha1 identifier
- **indexer_configuration_id** (int): tool used to compute
the results
Yields:
an iterable of missing id for the tuple (id,
indexer_configuration_id)
"""
yield from self._languages.missing(languages)
def content_language_get(self, ids):
"""Retrieve full content language per ids.
Args:
ids (iterable): sha1 identifier
Yields:
languages (iterable): dictionaries with keys:
- **id** (bytes): sha1 identifier
- **lang** (bytes): raw content's language
- **tool** (dict): Tool used to compute the language
"""
yield from self._languages.get(ids)
def content_language_add(self, languages, conflict_update=False):
    """Add languages not present in storage.

    Args:
        languages (iterable): dictionaries with keys:

          - **id** (bytes): sha1
          - **lang** (bytes): language detected
          - **indexer_configuration_id** (int): tool used to compute
            the results

        conflict_update (bool): Flag to determine if we want to
            overwrite (true) or skip duplicates (false, the default)

    Raises:
        TypeError: if any identifier is not bytes

    """
    # Materialize first so the validation below does not exhaust a
    # generator argument before the insertion.
    languages = list(languages)
    if not all(isinstance(x['id'], bytes) for x in languages):
        raise TypeError('identifiers must be bytes.')
    self._languages.add(languages, conflict_update)
def content_ctags_missing(self, ctags):
"""List ctags missing from storage.
Args:
ctags (iterable): dicts with keys:
- **id** (bytes): sha1 identifier
- **indexer_configuration_id** (int): tool used to compute
the results
Yields:
an iterable of missing id for the tuple (id,
indexer_configuration_id)
"""
yield from self._content_ctags.missing(ctags)
def content_ctags_get(self, ids):
    """Retrieve ctags per id.

    Flattens each stored row (one per content/tool pair, holding a
    'ctags' list) into one dict per symbol.

    Args:
        ids (iterable): sha1 checksums

    Yields:
        Dictionaries with keys:

          - **id** (bytes): content's identifier
          - **name** (str): symbol's name
          - **kind** (str): symbol's kind
          - **lang** (str): language for that content
          - **tool** (dict): tool used to compute the ctags' info

    """
    for stored in self._content_ctags.get(ids):
        common = {'id': stored['id'], 'tool': stored['tool']}
        for symbol in stored['ctags']:
            yield dict(common, **symbol)
def content_ctags_add(self, ctags, conflict_update=False):
    """Add ctags not present in storage

    Args:
        ctags (iterable): dictionaries with keys:

          - **id** (bytes): sha1
          - **ctags** (list): List of dictionary with keys: name, kind,
            line, lang
          - **indexer_configuration_id**: tool used to compute the
            results

        conflict_update (bool): overwrite (true) or merge with existing
            ctags (false, the default)

    Raises:
        TypeError: if any identifier is not bytes

    """
    # Materialize first so the validation below does not exhaust a
    # generator argument before the insertion.
    ctags = list(ctags)
    if not all(isinstance(x['id'], bytes) for x in ctags):
        raise TypeError('identifiers must be bytes.')
    self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
def content_ctags_search(self, expression,
                         limit=10, last_sha1=None, db=None, cur=None):
    """Search through content's raw ctags symbols.

    Args:
        expression (str): symbol name to search for (exact match)
        limit (int): Number of rows to return (default to 10).
        last_sha1 (bytes): Offset from which retrieving data; only ids
            strictly greater than this are returned.
        db, cur: unused; kept for signature compatibility with the
            postgresql backend

    Yields:
        rows of ctags including id, name, lang, kind, line, etc...

    """
    nb_matches = 0
    # Hoisted loop invariant (the original rebuilt this sentinel on
    # every iteration). bytes(n) is n zero bytes; SHA1_DIGEST_SIZE is
    # 160 (bits), so this is longer than a real 20-byte sha1, but the
    # extra zero bytes do not change the ordering comparison below.
    min_sha1 = last_sha1 or bytes(SHA1_DIGEST_SIZE)
    for ((id_, tool_id), item) in \
            sorted(self._content_ctags._data.items()):
        if id_ <= min_sha1:
            continue
        for ctags_item in item['ctags']:
            if ctags_item['name'] != expression:
                continue
            nb_matches += 1
            yield {
                'id': id_,
                'tool': _transform_tool(self._tools[tool_id]),
                **ctags_item
            }
            if nb_matches >= limit:
                return
def content_fossology_license_get(self, ids):
"""Retrieve licenses per id.
Args:
ids (iterable): sha1 checksums
Yields:
dict: ``{id: facts}`` where ``facts`` is a dict with the
following keys:
- **licenses** ([str]): associated licenses for that content
- **tool** (dict): Tool used to compute the license
"""
# Rewrites the output of SubStorage.get from the old format to
# the new one. SubStorage.get should be updated once all other
# *_get methods use the new format.
# See: https://forge.softwareheritage.org/T1433
res = {}
for d in self._licenses.get(ids):
res.setdefault(d.pop('id'), []).append(d)
for (id_, facts) in res.items():
yield {id_: facts}
def content_fossology_license_add(self, licenses, conflict_update=False):
    """Add licenses not present in storage.

    Args:
        licenses (iterable): dictionaries with keys:

          - **id**: sha1
          - **licenses** ([bytes]): List of licenses associated to sha1
          - **indexer_configuration_id**: tool used to compute the
            results

        conflict_update: Flag to determine if we want to overwrite (true)
            or skip duplicates (false, the default)

    Raises:
        TypeError: if any identifier is not bytes

    """
    # NOTE(review): unlike the pgsql backend's documented contract, this
    # implementation returns nothing (no list of failed entries).
    # Materialize first so the validation below does not exhaust a
    # generator argument before the insertion.
    licenses = list(licenses)
    if not all(isinstance(x['id'], bytes) for x in licenses):
        raise TypeError('identifiers must be bytes.')
    self._licenses.add_merge(licenses, conflict_update, 'licenses')
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id, limit=1000):
"""Retrieve licenses within range [start, end] bound by limit.
Args:
**start** (bytes): Starting identifier range (expected smaller
than end)
**end** (bytes): Ending identifier range (expected larger
than start)
**indexer_configuration_id** (int): The tool used to index data
**limit** (int): Limit result (default to 1000)
Raises:
ValueError for limit to None
Returns:
a dict with keys:
- **ids** [bytes]: iterable of content ids within the range.
- **next** (Optional[bytes]): The next range of sha1 starts at
this sha1 if any
"""
return self._licenses.get_range(
start, end, indexer_configuration_id, limit)
def content_metadata_missing(self, metadata):
"""List metadata missing from storage.
Args:
metadata (iterable): dictionaries with keys:
- **id** (bytes): sha1 identifier
- **indexer_configuration_id** (int): tool used to compute
the results
Yields:
missing sha1s
"""
yield from self._content_metadata.missing(metadata)
def content_metadata_get(self, ids):
"""Retrieve metadata per id.
Args:
ids (iterable): sha1 checksums
Yields:
dictionaries with the following keys:
- **id** (bytes)
- **metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
"""
yield from self._content_metadata.get(ids)
def content_metadata_add(self, metadata, conflict_update=False):
    """Add metadata not present in storage.

    Args:
        metadata (iterable): dictionaries with keys:

          - **id**: sha1
          - **metadata**: arbitrary dict
          - **indexer_configuration_id**: tool used to compute the
            results

        conflict_update: Flag to determine if we want to overwrite (true)
            or skip duplicates (false, the default)

    Raises:
        TypeError: if any identifier is not bytes

    """
    # Materialize first so the validation below does not exhaust a
    # generator argument before the insertion.
    metadata = list(metadata)
    if not all(isinstance(x['id'], bytes) for x in metadata):
        raise TypeError('identifiers must be bytes.')
    self._content_metadata.add(metadata, conflict_update)
def revision_intrinsic_metadata_missing(self, metadata):
"""List metadata missing from storage.
Args:
metadata (iterable): dictionaries with keys:
- **id** (bytes): sha1_git revision identifier
- **indexer_configuration_id** (int): tool used to compute
the results
Yields:
missing ids
"""
yield from self._revision_intrinsic_metadata.missing(metadata)
def revision_intrinsic_metadata_get(self, ids):
"""Retrieve revision metadata per id.
Args:
ids (iterable): sha1 checksums
Yields:
dictionaries with the following keys:
- **id** (bytes)
- **metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
- **mappings** (List[str]): list of mappings used to translate
these metadata
"""
yield from self._revision_intrinsic_metadata.get(ids)
def revision_intrinsic_metadata_add(self, metadata, conflict_update=False):
    """Add metadata not present in storage.

    Args:
        metadata (iterable): dictionaries with keys:

          - **id**: sha1_git of revision
          - **metadata**: arbitrary dict
          - **indexer_configuration_id**: tool used to compute metadata
          - **mappings** (List[str]): list of mappings used to translate
            these metadata

        conflict_update: Flag to determine if we want to overwrite (true)
            or skip duplicates (false, the default)

    Raises:
        TypeError: if any identifier is not bytes

    """
    # Materialize first so the validation below does not exhaust a
    # generator argument before the insertion.
    metadata = list(metadata)
    if not all(isinstance(x['id'], bytes) for x in metadata):
        raise TypeError('identifiers must be bytes.')
    self._revision_intrinsic_metadata.add(metadata, conflict_update)
def revision_intrinsic_metadata_delete(self, entries):
    """Remove revision metadata from the storage.

    Args:
        entries (iterable): dictionaries with the following keys
        (the original docstring documented the wrong key names —
        SubStorage.delete reads these two):

          - **id** (bytes): sha1_git of the revision
          - **indexer_configuration_id** (int): tool used to compute
            the metadata

    """
    self._revision_intrinsic_metadata.delete(entries)
def origin_intrinsic_metadata_get(self, ids):
"""Retrieve origin metadata per id.
Args:
ids (iterable): origin identifiers
Yields:
list: dictionaries with the following keys:
- **id** (str): origin url
- **from_revision** (bytes): which revision this metadata
was extracted from
- **metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
- **mappings** (List[str]): list of mappings used to translate
these metadata
"""
yield from self._origin_intrinsic_metadata.get(ids)
def origin_intrinsic_metadata_add(self, metadata,
conflict_update=False):
"""Add origin metadata not present in storage.
Args:
metadata (iterable): dictionaries with keys:
- **id**: origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata**: arbitrary dict
- **indexer_configuration_id**: tool used to compute metadata
- **mappings** (List[str]): list of mappings used to translate
these metadata
conflict_update: Flag to determine if we want to overwrite (true)
or skip duplicates (false, the default)
"""
self._origin_intrinsic_metadata.add(metadata, conflict_update)
def origin_intrinsic_metadata_delete(self, entries):
"""Remove origin metadata from the storage.
Args:
entries (dict): dictionaries with the following keys:
- **id** (str): origin url
- **indexer_configuration_id** (int): tool used to compute
metadata
"""
self._origin_intrinsic_metadata.delete(entries)
def origin_intrinsic_metadata_search_fulltext(
        self, conjunction, limit=100):
    """Returns the list of origins whose metadata contain all the terms.

    Args:
        conjunction (List[str]): List of terms to be searched for.
        limit (int): The maximum number of results to return

    Yields:
        list: dictionaries with the following keys:

          - **id** (str): origin url
          - **from_revision** (bytes): which revision this metadata
            was extracted from
          - **metadata** (str): associated metadata
          - **tool** (dict): tool used to compute metadata
          - **mappings** (List[str]): list of mappings used to translate
            these metadata

    """
    # A very crude fulltext search implementation, but that's enough
    # to work on English metadata
    tokens_re = re.compile('[a-zA-Z0-9]+')
    search_tokens = list(itertools.chain(
        *map(tokens_re.findall, conjunction)))

    def rank(data):
        # Tokenize the metadata
        text = json.dumps(data['metadata'])
        text_tokens = tokens_re.findall(text)
        text_token_occurences = Counter(text_tokens)

        # Count the number of occurrences of search tokens in the text
        score = 0
        for search_token in search_tokens:
            if text_token_occurences[search_token] == 0:
                # Search token is not in the text.
                return 0
            score += text_token_occurences[search_token]

        # Normalize according to the text's length. Guard against
        # 0- or 1-token texts: math.log would yield ValueError or a
        # ZeroDivisionError (log(1) == 0) in the original code.
        if len(text_tokens) <= 1:
            return score
        return score / math.log(len(text_tokens))

    results = [(rank(data), data)
               for data in self._origin_intrinsic_metadata.get_all()]
    results = [(rank_, data) for (rank_, data) in results if rank_ > 0]
    results.sort(key=operator.itemgetter(0),  # Don't try to order 'data'
                 reverse=True)
    for (rank_, result) in results[:limit]:
        yield result
def origin_intrinsic_metadata_search_by_producer(
self, page_token='', limit=100, ids_only=False,
mappings=None, tool_ids=None,
db=None, cur=None):
"""Returns the list of origins whose metadata contain all the terms.
Args:
page_token (str): Opaque token used for pagination.
limit (int): The maximum number of results to return
ids_only (bool): Determines whether only origin ids are returned
or the content as well
mappings (List[str]): Returns origins whose intrinsic metadata
were generated using at least one of these mappings.
Returns:
dict: dict with the following keys:
- **next_page_token** (str, optional): opaque token to be used as
`page_token` for retrieveing the next page.
- **origins** (list): list of origin url (str) if `ids_only=True`
else dictionaries with the following keys:
- **id** (str): origin urls
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
- **mappings** (List[str]): list of mappings used to translate
these metadata
"""
assert isinstance(page_token, str)
nb_results = 0
if mappings is not None:
mappings = frozenset(mappings)
if tool_ids is not None:
tool_ids = frozenset(tool_ids)
origins = []
# we go to limit+1 to check wether we should add next_page_token in
# the response
for entry in self._origin_intrinsic_metadata.get_all():
if entry['id'] <= page_token:
continue
if nb_results >= (limit + 1):
break
if mappings is not None and mappings.isdisjoint(entry['mappings']):
continue
if tool_ids is not None and entry['tool']['id'] not in tool_ids:
continue
origins.append(entry)
nb_results += 1
result = {}
if len(origins) > limit:
origins = origins[:limit]
result['next_page_token'] = origins[-1]['id']
if ids_only:
origins = [origin['id'] for origin in origins]
result['origins'] = origins
return result
def origin_intrinsic_metadata_stats(self):
    """Returns statistics on stored intrinsic metadata.

    Returns:
        dict: dictionary with keys:

          - total (int): total number of origins that were indexed
            (possibly yielding an empty metadata dictionary)
          - non_empty (int): total number of origins that we extracted
            a non-empty metadata dictionary from
          - per_mapping (dict): a dictionary with mapping names as
            keys and number of origins whose indexing used this
            mapping. Note that indexing a given origin may use
            0, 1, or many mappings.

    """
    mapping_count = dict.fromkeys(MAPPING_NAMES, 0)
    total = 0
    non_empty = 0
    for entry in self._origin_intrinsic_metadata.get_all():
        total += 1
        # a lone '@context' key does not count as real metadata
        if set(entry['metadata']) - {'@context'}:
            non_empty += 1
        for mapping in entry['mappings']:
            mapping_count[mapping] += 1
    return {
        'per_mapping': mapping_count,
        'total': total,
        'non_empty': non_empty
    }
def indexer_configuration_add(self, tools):
    """Add new tools to the storage.

    Args:
        tools ([dict]): List of dictionary representing tool to
            insert in the db. Dictionary with the following keys:

            - **tool_name** (str): tool's name
            - **tool_version** (str): tool's version
            - **tool_configuration** (dict): tool's configuration
              (free form dict)

    Returns:
        list: List of dict inserted in the db (holding the id key as
            well). The order of the list is not guaranteed to match
            the order of the initial list.

    """
    inserted = []
    for tool in tools:
        # work on a copy so the caller's dicts are left untouched
        record = dict(tool)
        record['id'] = self._tool_key(record)
        self._tools[record['id']] = record
        inserted.append(record)
    return inserted
def indexer_configuration_get(self, tool):
    """Retrieve tool information.

    Args:
        tool (dict): Dictionary representing a tool with the
            following keys:

            - **tool_name** (str): tool's name
            - **tool_version** (str): tool's version
            - **tool_configuration** (dict): tool's configuration
              (free form dict)

    Returns:
        The same dictionary with an `id` key, None otherwise.

    """
    try:
        return self._tools[self._tool_key(tool)]
    except KeyError:
        return None
def _tool_key(self, tool):
    # Derive the tool id from its identity triple (name, version,
    # canonical-JSON configuration).
    # NOTE(review): hash() of a str is randomized per interpreter run
    # (PYTHONHASHSEED), so these ids are only stable within a single
    # process — acceptable for an in-memory backend, but they must never
    # be persisted or compared across processes. Confirm intent.
    return hash((tool['tool_name'], tool['tool_version'],
                 json.dumps(tool['tool_configuration'], sort_keys=True)))
diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py
index d2f8f9f..cc6b500 100644
--- a/swh/indexer/tests/storage/conftest.py
+++ b/swh/indexer/tests/storage/conftest.py
@@ -1,125 +1,80 @@
# Copyright (C) 2015-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from os.path import join
import pytest
from . import SQL_DIR
from swh.storage.tests.conftest import postgresql_fact
from swh.indexer.storage import get_indexer_storage
from swh.model.hashutil import hash_to_bytes
-from .generate_data_test import MIMETYPE_OBJECTS, FOSSOLOGY_LICENSES
+from .generate_data_test import (
+ MIMETYPE_OBJECTS, FOSSOLOGY_LICENSES, TOOLS
+)
DUMP_FILES = join(SQL_DIR, '*.sql')
-TOOLS = [
- {
- 'tool_name': 'universal-ctags',
- 'tool_version': '~git7859817b',
- 'tool_configuration': {
- "command_line": "ctags --fields=+lnz --sort=no --links=no "
- "--output-format=json <filepath>"}
- },
- {
- 'tool_name': 'swh-metadata-translator',
- 'tool_version': '0.0.1',
- 'tool_configuration': {"type": "local", "context": "NpmMapping"},
- },
- {
- 'tool_name': 'swh-metadata-detector',
- 'tool_version': '0.0.1',
- 'tool_configuration': {
- "type": "local", "context": ["NpmMapping", "CodemetaMapping"]},
- },
- {
- 'tool_name': 'swh-metadata-detector2',
- 'tool_version': '0.0.1',
- 'tool_configuration': {
- "type": "local", "context": ["NpmMapping", "CodemetaMapping"]},
- },
- {
- 'tool_name': 'file',
- 'tool_version': '5.22',
- 'tool_configuration': {"command_line": "file --mime <filepath>"},
- },
- {
- 'tool_name': 'pygments',
- 'tool_version': '2.0.1+dfsg-1.1+deb8u1',
- 'tool_configuration': {
- "type": "library", "debian-package": "python3-pygments"},
- },
- {
- 'tool_name': 'pygments2',
- 'tool_version': '2.0.1+dfsg-1.1+deb8u1',
- 'tool_configuration': {
- "type": "library",
- "debian-package": "python3-pygments",
- "max_content_size": 10240
- },
- },
- {
- 'tool_name': 'nomos',
- 'tool_version': '3.1.0rc2-31-ga2cbb8c',
- 'tool_configuration': {"command_line": "nomossa <filepath>"},
- }
-]
-
class DataObj(dict):
    """dict whose items are also reachable as attributes (test helper)."""
    def __getattr__(self, key):
        try:
            return self.__getitem__(key)
        except KeyError as exc:
            # __getattr__ must raise AttributeError (not KeyError) so
            # hasattr() and getattr(..., default) behave correctly.
            raise AttributeError(key) from exc

    def __setattr__(self, key, value):
        return self.__setitem__(key, value)
@pytest.fixture
def swh_indexer_storage_with_data(swh_indexer_storage):
    """Return (storage, data): the storage pre-filled with test objects.

    Registers TOOLS first, then attaches each test object's
    indexer_configuration_id (only known after registration): mimetypes
    get the 'file' tool, fossology licenses the 'nomos' tool.
    """
    data = DataObj()
    tools = {
        tool['tool_name']: {
            'id': tool['id'],
            'name': tool['tool_name'],
            'version': tool['tool_version'],
            'configuration': tool['tool_configuration'],
        }
        for tool in swh_indexer_storage.indexer_configuration_add(TOOLS)}
    data.tools = tools
    data.sha1_1 = hash_to_bytes(
        '34973274ccef6ab4dfaaf86599792fa9c3fe4689')
    data.sha1_2 = hash_to_bytes(
        '61c2b3a30496d329e21af70dd2d7e097046d07b7')
    data.revision_id_1 = hash_to_bytes(
        '7026b7c1a2af56521e951c01ed20f255fa054238')
    data.revision_id_2 = hash_to_bytes(
        '7026b7c1a2af56521e9587659012345678904321')
    data.revision_id_3 = hash_to_bytes(
        '7026b7c1a2af56521e9587659012345678904320')
    data.origin_url_1 = 'file:///dev/0/zero'  # 44434341
    data.origin_url_2 = 'file:///dev/1/one'  # 44434342
    data.origin_url_3 = 'file:///dev/2/two'  # 54974445
    data.mimetypes = [
        {**mimetype_obj, 'indexer_configuration_id': tools['file']['id']}
        for mimetype_obj in MIMETYPE_OBJECTS
    ]
    swh_indexer_storage.content_mimetype_add(data.mimetypes)
    # NOTE: licenses are prepared but not inserted here; tests insert
    # them themselves when needed.
    data.fossology_licenses = [
        {**fossology_obj, 'indexer_configuration_id': tools['nomos']['id']}
        for fossology_obj in FOSSOLOGY_LICENSES
    ]
    swh_indexer_storage._test_data = data
    return (swh_indexer_storage, data)
swh_indexer_storage_postgresql = postgresql_fact(
'postgresql_proc', dump_files=DUMP_FILES)
@pytest.fixture
def swh_indexer_storage(swh_indexer_storage_postgresql):
storage_config = {
'cls': 'local',
'args': {
'db': swh_indexer_storage_postgresql.dsn,
},
}
return get_indexer_storage(**storage_config)
diff --git a/swh/indexer/tests/storage/generate_data_test.py b/swh/indexer/tests/storage/generate_data_test.py
index 5df332a..5a6798e 100644
--- a/swh/indexer/tests/storage/generate_data_test.py
+++ b/swh/indexer/tests/storage/generate_data_test.py
@@ -1,153 +1,206 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from uuid import uuid1
from swh.model.hashutil import MultiHash
from hypothesis.strategies import (composite, sets, one_of, uuids,
tuples, sampled_from)
MIMETYPES = [
b'application/json',
b'application/octet-stream',
b'application/xml',
b'text/plain',
]
ENCODINGS = [
b'iso8859-1',
b'iso8859-15',
b'latin1',
b'utf-8',
]
def gen_mimetype():
"""Generate one mimetype strategy.
"""
return one_of(sampled_from(MIMETYPES))
def gen_encoding():
"""Generate one encoding strategy.
"""
return one_of(sampled_from(ENCODINGS))
def _init_content(uuid):
"""Given a uuid, initialize a content
"""
return {
'id': MultiHash.from_data(uuid.bytes, {'sha1'}).digest()['sha1'],
'indexer_configuration_id': 1,
}
@composite
def gen_content_mimetypes(draw, *, min_size=0, max_size=100):
"""Generate valid and consistent content_mimetypes.
Context: Test purposes
Args:
**draw** (callable): Used by hypothesis to generate data
**min_size** (int): Minimal number of elements to generate
(default: 0)
**max_size** (int): Maximal number of elements to generate
(default: 100)
Returns:
List of content_mimetypes as expected by the
content_mimetype_add api endpoint.
"""
_ids = draw(
sets(
tuples(
uuids(),
gen_mimetype(),
gen_encoding()
),
min_size=min_size, max_size=max_size
)
)
content_mimetypes = []
for uuid, mimetype, encoding in _ids:
content_mimetypes.append({
**_init_content(uuid),
'mimetype': mimetype,
'encoding': encoding,
})
return content_mimetypes
+TOOLS = [
+ {
+ 'tool_name': 'universal-ctags',
+ 'tool_version': '~git7859817b',
+ 'tool_configuration': {
+ "command_line": "ctags --fields=+lnz --sort=no --links=no "
+ "--output-format=json <filepath>"}
+ },
+ {
+ 'tool_name': 'swh-metadata-translator',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {"type": "local", "context": "NpmMapping"},
+ },
+ {
+ 'tool_name': 'swh-metadata-detector',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {
+ "type": "local", "context": ["NpmMapping", "CodemetaMapping"]},
+ },
+ {
+ 'tool_name': 'swh-metadata-detector2',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {
+ "type": "local", "context": ["NpmMapping", "CodemetaMapping"]},
+ },
+ {
+ 'tool_name': 'file',
+ 'tool_version': '5.22',
+ 'tool_configuration': {"command_line": "file --mime <filepath>"},
+ },
+ {
+ 'tool_name': 'pygments',
+ 'tool_version': '2.0.1+dfsg-1.1+deb8u1',
+ 'tool_configuration': {
+ "type": "library", "debian-package": "python3-pygments"},
+ },
+ {
+ 'tool_name': 'pygments2',
+ 'tool_version': '2.0.1+dfsg-1.1+deb8u1',
+ 'tool_configuration': {
+ "type": "library",
+ "debian-package": "python3-pygments",
+ "max_content_size": 10240
+ },
+ },
+ {
+ 'tool_name': 'nomos',
+ 'tool_version': '3.1.0rc2-31-ga2cbb8c',
+ 'tool_configuration': {"command_line": "nomossa <filepath>"},
+ },
+]
+
+
MIMETYPE_OBJECTS = [
{'id': MultiHash.from_data(uuid1().bytes, {'sha1'}).digest()['sha1'],
- 'indexer_configuration_id': 1,
'mimetype': mt,
'encoding': enc,
+ # 'indexer_configuration_id' will be added after TOOLS get registered
}
for mt in MIMETYPES
for enc in ENCODINGS]
LICENSES = [
b'3DFX',
b'BSD',
b'GPL',
b'Apache2',
b'MIT',
]
FOSSOLOGY_LICENSES = [
{'id': MultiHash.from_data(uuid1().bytes, {'sha1'}).digest()['sha1'],
- 'indexer_configuration_id': 1,
'licenses': [LICENSES[i % len(LICENSES)], ],
+ # 'indexer_configuration_id' will be added after TOOLS get registered
}
for i in range(10)
]
def gen_license():
return one_of(sampled_from(LICENSES))
@composite
def gen_content_fossology_licenses(draw, *, min_size=0, max_size=100):
"""Generate valid and consistent content_fossology_licenses.
Context: Test purposes
Args:
**draw** (callable): Used by hypothesis to generate data
**min_size** (int): Minimal number of elements to generate
(default: 0)
**max_size** (int): Maximal number of elements to generate
(default: 100)
Returns:
List of content_fossology_licenses as expected by the
content_fossology_license_add api endpoint.
"""
_ids = draw(
sets(
tuples(
uuids(),
gen_license(),
),
min_size=min_size, max_size=max_size
)
)
content_licenses = []
for uuid, license in _ids:
content_licenses.append({
**_init_content(uuid),
'licenses': [license],
})
return content_licenses
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
index d49a079..8e7b0e5 100644
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -1,21 +1,20 @@
# Copyright (C) 2015-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from swh.indexer.storage import get_indexer_storage
from .test_storage import * # noqa
@pytest.fixture
def swh_indexer_storage():
    """Instantiate the in-memory indexer storage the shared tests run on.

    Overrides the postgresql-backed fixture from test_storage: the
    in-memory backend needs no database, hence the empty args.
    """
    storage_config = {
        'cls': 'memory',
        'args': {},
    }
    return get_indexer_storage(**storage_config)
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
index c20bba1..c6dbc94 100644
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -1,1852 +1,1855 @@
# Copyright (C) 2015-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import threading
import pytest
from swh.model.hashutil import hash_to_bytes
def prepare_mimetypes_from(fossology_licenses):
    """Fossology license needs some consistent data in db to run.

    Build one text/plain mimetype entry per given fossology license,
    reusing each license's id and indexer configuration.
    """
    return [
        {
            'id': license_entry['id'],
            'mimetype': 'text/plain',
            'encoding': 'utf-8',
            'indexer_configuration_id':
                license_entry['indexer_configuration_id'],
        }
        for license_entry in fossology_licenses
    ]
def endpoint(storage, endpoint_type, endpoint_name):
    """Look up the storage method named '<endpoint_type>_<endpoint_name>'."""
    return getattr(storage, f'{endpoint_type}_{endpoint_name}')
class StorageETypeTester:
    """Base class for testing a series of common behaviour between a bunch of
    endpoint types supported by an IndexerStorage.

    This is supposed to be inherited with the following class attributes:
    - endpoint_type
    - tool_name
    - example_data

    See below for example usage.
    """
    def test_missing(self, swh_indexer_storage_with_data):
        """<etype>_missing lists ids not yet indexed by the given tool."""
        storage, data = swh_indexer_storage_with_data
        etype = self.endpoint_type
        tool_id = data.tools[self.tool_name]['id']
        # given 2 (hopefully) unknown objects
        query = [
            {
                'id': data.sha1_1,
                'indexer_configuration_id': tool_id,
            },
            {
                'id': data.sha1_2,
                'indexer_configuration_id': tool_id,
            }]
        # we expect these are both returned by the xxx_missing endpoint
        actual_missing = endpoint(storage, etype, 'missing')(query)
        assert list(actual_missing) == [
            data.sha1_1,
            data.sha1_2,
        ]
        # now, when we add one of them
        endpoint(storage, etype, 'add')([{
            'id': data.sha1_2,
            **self.example_data[0],
            'indexer_configuration_id': tool_id,
        }])
        # we expect only the other one returned
        actual_missing = endpoint(storage, etype, 'missing')(query)
        assert list(actual_missing) == [data.sha1_1]

    def test_add__drop_duplicate(self, swh_indexer_storage_with_data):
        """Without conflict_update, re-adding the same id keeps the old data."""
        storage, data = swh_indexer_storage_with_data
        etype = self.endpoint_type
        tool_id = data.tools[self.tool_name]['id']
        # add the first object
        data_v1 = {
            'id': data.sha1_2,
            **self.example_data[0],
            'indexer_configuration_id': tool_id,
        }
        endpoint(storage, etype, 'add')([data_v1])
        # should be able to retrieve it
        actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2]))
        expected_data_v1 = [{
            'id': data.sha1_2,
            **self.example_data[0],
            'tool': data.tools[self.tool_name],
        }]
        assert actual_data == expected_data_v1
        # now if we add a modified version of the same object (same id)
        data_v2 = data_v1.copy()
        data_v2.update(self.example_data[1])
        endpoint(storage, etype, 'add')([data_v2])
        # we expect to retrieve the original data, not the modified one
        actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2]))
        assert actual_data == expected_data_v1

    def test_add__update_in_place_duplicate(
            self, swh_indexer_storage_with_data):
        """With conflict_update=True, re-adding the same id overwrites."""
        storage, data = swh_indexer_storage_with_data
        etype = self.endpoint_type
        tool = data.tools[self.tool_name]
        data_v1 = {
            'id': data.sha1_2,
            **self.example_data[0],
            'indexer_configuration_id': tool['id'],
        }
        # given
        endpoint(storage, etype, 'add')([data_v1])
        # when
        actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2]))
        expected_data_v1 = [{
            'id': data.sha1_2,
            **self.example_data[0],
            'tool': tool,
        }]
        # then
        assert actual_data == expected_data_v1
        # given
        data_v2 = data_v1.copy()
        data_v2.update(self.example_data[1])
        endpoint(storage, etype, 'add')([data_v2], conflict_update=True)
        actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2]))
        expected_data_v2 = [{
            'id': data.sha1_2,
            **self.example_data[1],
            'tool': tool,
        }]
        # data did change as the v2 was used to overwrite v1
        assert actual_data == expected_data_v2

    def test_add__update_in_place_deadlock(
            self, swh_indexer_storage_with_data):
        """Two overlapping conflict_update batches from concurrent threads
        must both complete (no deadlock) and leave all rows at version 2."""
        storage, data = swh_indexer_storage_with_data
        etype = self.endpoint_type
        tool = data.tools[self.tool_name]
        hashes = [
            hash_to_bytes(
                '34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}'.format(i))
            for i in range(1000)]
        data_v1 = [
            {
                'id': hash_,
                **self.example_data[0],
                'indexer_configuration_id': tool['id'],
            }
            for hash_ in hashes
        ]
        data_v2 = [
            {
                'id': hash_,
                **self.example_data[1],
                'indexer_configuration_id': tool['id'],
            }
            for hash_ in hashes
        ]
        # Remove one item from each, so that both queries have to succeed for
        # all items to be in the DB.
        data_v2a = data_v2[1:]
        data_v2b = list(reversed(data_v2[0:-1]))
        # given
        endpoint(storage, etype, 'add')(data_v1)
        # when
        actual_data = list(endpoint(storage, etype, 'get')(hashes))
        expected_data_v1 = [
            {
                'id': hash_,
                **self.example_data[0],
                'tool': tool,
            }
            for hash_ in hashes
        ]
        # then
        assert actual_data == expected_data_v1

        # given: the two batches overlap and iterate in opposite orders, the
        # classic recipe for a lock-ordering deadlock in a DB backend
        def f1():
            endpoint(storage, etype, 'add')(data_v2a, conflict_update=True)

        def f2():
            endpoint(storage, etype, 'add')(data_v2b, conflict_update=True)

        t1 = threading.Thread(target=f1)
        t2 = threading.Thread(target=f2)
        t2.start()
        t1.start()
        t1.join()
        t2.join()
        actual_data = sorted(endpoint(storage, etype, 'get')(hashes),
                             key=lambda x: x['id'])
        expected_data_v2 = [
            {
                'id': hash_,
                **self.example_data[1],
                'tool': tool,
            }
            for hash_ in hashes
        ]
        assert actual_data == expected_data_v2

    def test_add__duplicate_twice(self, swh_indexer_storage_with_data):
        """Adding the same id twice in one batch raises ValueError and does
        not modify the stored data."""
        storage, data = swh_indexer_storage_with_data
        etype = self.endpoint_type
        tool = data.tools[self.tool_name]
        data_rev1 = {
            'id': data.revision_id_2,
            **self.example_data[0],
            'indexer_configuration_id': tool['id']
        }
        data_rev2 = {
            'id': data.revision_id_2,
            **self.example_data[1],
            'indexer_configuration_id': tool['id']
        }
        # when
        endpoint(storage, etype, 'add')([data_rev1])
        with pytest.raises(ValueError):
            endpoint(storage, etype, 'add')(
                [data_rev2, data_rev2],
                conflict_update=True)
        # then
        actual_data = list(endpoint(storage, etype, 'get')(
            [data.revision_id_2, data.revision_id_1]))
        expected_data = [{
            'id': data.revision_id_2,
            **self.example_data[0],
            'tool': tool,
        }]
        assert actual_data == expected_data

    def test_get(self, swh_indexer_storage_with_data):
        """<etype>_get returns only known ids, with the tool dict expanded."""
        storage, data = swh_indexer_storage_with_data
        etype = self.endpoint_type
        tool = data.tools[self.tool_name]
        query = [data.sha1_2, data.sha1_1]
        data1 = {
            'id': data.sha1_2,
            **self.example_data[0],
            'indexer_configuration_id': tool['id'],
        }
        # when
        endpoint(storage, etype, 'add')([data1])
        # then
        actual_data = list(endpoint(storage, etype, 'get')(query))
        # then
        expected_data = [{
            'id': data.sha1_2,
            **self.example_data[0],
            'tool': tool,
        }]
        assert actual_data == expected_data
class TestIndexerStorageContentMimetypes(StorageETypeTester):
    """Test Indexer Storage content_mimetype related methods
    """
    endpoint_type = 'content_mimetype'
    tool_name = 'file'
    example_data = [
        {
            'mimetype': 'text/plain',
            'encoding': 'utf-8',
        },
        {
            'mimetype': 'text/html',
            'encoding': 'us-ascii',
        },
    ]

    def test_generate_content_mimetype_get_range_limit_none(
            self, swh_indexer_storage):
        """mimetype_get_range call with wrong limit input should fail"""
        storage = swh_indexer_storage
        with pytest.raises(ValueError) as e:
            storage.content_mimetype_get_range(
                start=None, end=None, indexer_configuration_id=None,
                limit=None)
        assert e.value.args == (
            'Development error: limit should not be None',)

    def test_generate_content_mimetype_get_range_no_limit(
            self, swh_indexer_storage_with_data):
        """mimetype_get_range returns mimetypes within range provided"""
        storage, data = swh_indexer_storage_with_data
        mimetypes = data.mimetypes
        # All ids from the db
        content_ids = sorted([c['id'] for c in mimetypes])
        start = content_ids[0]
        end = content_ids[-1]
        # retrieve mimetypes
        tool_id = mimetypes[0]['indexer_configuration_id']
        actual_result = storage.content_mimetype_get_range(
            start, end, indexer_configuration_id=tool_id)
        actual_ids = actual_result['ids']
        actual_next = actual_result['next']
        assert len(mimetypes) == len(actual_ids)
        assert actual_next is None
        assert content_ids == actual_ids

    def test_generate_content_mimetype_get_range_limit(
            self, swh_indexer_storage_with_data):
        """mimetype_get_range paginates results if limit exceeded"""
        storage, data = swh_indexer_storage_with_data
        # use the registered 'file' tool id instead of a hard-coded value
        indexer_configuration_id = data.tools['file']['id']

        # input the list of sha1s we want from storage
        content_ids = sorted(
            [c['id'] for c in data.mimetypes])
        mimetypes = list(storage.content_mimetype_get(content_ids))
        assert len(mimetypes) == len(data.mimetypes)
        start = content_ids[0]
        end = content_ids[-1]
        # retrieve mimetypes limited to 10 results
        actual_result = storage.content_mimetype_get_range(
            start, end,
            indexer_configuration_id=indexer_configuration_id,
            limit=10)
        assert actual_result
        assert set(actual_result.keys()) == {'ids', 'next'}
        actual_ids = actual_result['ids']
        actual_next = actual_result['next']
        assert len(actual_ids) == 10
        assert actual_next is not None
        assert actual_next == content_ids[10]
        expected_mimetypes = content_ids[:10]
        assert expected_mimetypes == actual_ids
        # retrieve next part
        actual_result = storage.content_mimetype_get_range(
            start=end, end=end,
            indexer_configuration_id=indexer_configuration_id)
        assert set(actual_result.keys()) == {'ids', 'next'}
        actual_ids = actual_result['ids']
        actual_next = actual_result['next']
        assert actual_next is None
        expected_mimetypes = [content_ids[-1]]
        assert expected_mimetypes == actual_ids
class TestIndexerStorageContentLanguage(StorageETypeTester):
    """Test Indexer Storage content_language related methods
    """
    # configuration consumed by the inherited StorageETypeTester tests
    endpoint_type = 'content_language'
    tool_name = 'pygments'
    example_data = [
        {
            'lang': 'haskell',
        },
        {
            'lang': 'common-lisp',
        },
    ]
class TestIndexerStorageContentCTags(StorageETypeTester):
    """Test Indexer Storage content_ctags related methods
    """
    endpoint_type = 'content_ctags'
    tool_name = 'universal-ctags'
    example_data = [
        {
            'ctags': [{
                'name': 'done',
                'kind': 'variable',
                'line': 119,
                'lang': 'OCaml',
            }]
        },
        {
            'ctags': [
                {
                    'name': 'done',
                    'kind': 'variable',
                    'line': 100,
                    'lang': 'Python',
                },
                {
                    'name': 'main',
                    'kind': 'function',
                    'line': 119,
                    'lang': 'Python',
                }]
        },
    ]

    # the following tests are disabled because CTAGS behaves differently
    @pytest.mark.skip
    def test_add__drop_duplicate(self):
        pass

    @pytest.mark.skip
    def test_add__update_in_place_duplicate(self):
        pass

    @pytest.mark.skip
    def test_add__update_in_place_deadlock(self):
        pass

    @pytest.mark.skip
    def test_add__duplicate_twice(self):
        pass

    @pytest.mark.skip
    def test_get(self):
        pass

    def test_content_ctags_search(self, swh_indexer_storage_with_data):
        """content_ctags_search returns flattened matching symbols, honouring
        limit and last_sha1 pagination."""
        storage, data = swh_indexer_storage_with_data
        # 1. given
        tool = data.tools['universal-ctags']
        tool_id = tool['id']
        ctag1 = {
            'id': data.sha1_1,
            'indexer_configuration_id': tool_id,
            'ctags': [
                {
                    'name': 'hello',
                    'kind': 'function',
                    'line': 133,
                    'lang': 'Python',
                },
                {
                    'name': 'counter',
                    'kind': 'variable',
                    'line': 119,
                    'lang': 'Python',
                },
                {
                    'name': 'hello',
                    'kind': 'variable',
                    'line': 210,
                    'lang': 'Python',
                },
            ]
        }
        ctag2 = {
            'id': data.sha1_2,
            'indexer_configuration_id': tool_id,
            'ctags': [
                {
                    'name': 'hello',
                    'kind': 'variable',
                    'line': 100,
                    'lang': 'C',
                },
                {
                    'name': 'result',
                    'kind': 'variable',
                    'line': 120,
                    'lang': 'C',
                },
            ]
        }
        storage.content_ctags_add([ctag1, ctag2])
        # 1. when
        actual_ctags = list(storage.content_ctags_search('hello', limit=1))
        # 1. then
        assert actual_ctags == [
            {
                'id': ctag1['id'],
                'tool': tool,
                'name': 'hello',
                'kind': 'function',
                'line': 133,
                'lang': 'Python',
            }
        ]
        # 2. when: paginate past the first sha1
        actual_ctags = list(storage.content_ctags_search(
            'hello',
            limit=1,
            last_sha1=ctag1['id']))
        # 2. then
        assert actual_ctags == [
            {
                'id': ctag2['id'],
                'tool': tool,
                'name': 'hello',
                'kind': 'variable',
                'line': 100,
                'lang': 'C',
            }
        ]
        # 3. when: no limit returns every 'hello' occurrence
        actual_ctags = list(storage.content_ctags_search('hello'))
        # 3. then
        assert actual_ctags == [
            {
                'id': ctag1['id'],
                'tool': tool,
                'name': 'hello',
                'kind': 'function',
                'line': 133,
                'lang': 'Python',
            },
            {
                'id': ctag1['id'],
                'tool': tool,
                'name': 'hello',
                'kind': 'variable',
                'line': 210,
                'lang': 'Python',
            },
            {
                'id': ctag2['id'],
                'tool': tool,
                'name': 'hello',
                'kind': 'variable',
                'line': 100,
                'lang': 'C',
            },
        ]
        # 4. when
        actual_ctags = list(storage.content_ctags_search('counter'))
        # then
        assert actual_ctags == [{
            'id': ctag1['id'],
            'tool': tool,
            'name': 'counter',
            'kind': 'variable',
            'line': 119,
            'lang': 'Python',
        }]
        # 5. when
        actual_ctags = list(storage.content_ctags_search('result', limit=1))
        # then
        assert actual_ctags == [{
            'id': ctag2['id'],
            'tool': tool,
            'name': 'result',
            'kind': 'variable',
            'line': 120,
            'lang': 'C',
        }]

    def test_content_ctags_search_no_result(self, swh_indexer_storage):
        """Searching an empty storage yields no ctags."""
        storage = swh_indexer_storage
        actual_ctags = list(storage.content_ctags_search('counter'))
        assert not actual_ctags

    def test_content_ctags_add__add_new_ctags_added(
            self, swh_indexer_storage_with_data):
        """content_ctags_add appends new symbols for an id; re-adding the
        same payload is a no-op."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool = data.tools['universal-ctags']
        tool_id = tool['id']
        ctag_v1 = {
            'id': data.sha1_2,
            'indexer_configuration_id': tool_id,
            'ctags': [{
                'name': 'done',
                'kind': 'variable',
                'line': 100,
                'lang': 'Scheme',
            }]
        }
        # given
        storage.content_ctags_add([ctag_v1])
        storage.content_ctags_add([ctag_v1])  # conflict does nothing
        # when
        actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
        # then
        expected_ctags = [{
            'id': data.sha1_2,
            'name': 'done',
            'kind': 'variable',
            'line': 100,
            'lang': 'Scheme',
            'tool': tool,
        }]
        assert actual_ctags == expected_ctags
        # given: a second payload with a different symbol gets appended
        ctag_v2 = ctag_v1.copy()
        ctag_v2.update({
            'ctags': [
                {
                    'name': 'defn',
                    'kind': 'function',
                    'line': 120,
                    'lang': 'Scheme',
                }
            ]
        })
        storage.content_ctags_add([ctag_v2])
        expected_ctags = [
            {
                'id': data.sha1_2,
                'name': 'done',
                'kind': 'variable',
                'line': 100,
                'lang': 'Scheme',
                'tool': tool,
            }, {
                'id': data.sha1_2,
                'name': 'defn',
                'kind': 'function',
                'line': 120,
                'lang': 'Scheme',
                'tool': tool,
            }
        ]
        actual_ctags = list(storage.content_ctags_get(
            [data.sha1_2]))
        assert actual_ctags == expected_ctags

    def test_content_ctags_add__update_in_place(
            self, swh_indexer_storage_with_data):
        """With conflict_update=True the stored symbol list is replaced."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool = data.tools['universal-ctags']
        tool_id = tool['id']
        ctag_v1 = {
            'id': data.sha1_2,
            'indexer_configuration_id': tool_id,
            'ctags': [{
                'name': 'done',
                'kind': 'variable',
                'line': 100,
                'lang': 'Scheme',
            }]
        }
        # given
        storage.content_ctags_add([ctag_v1])
        # when
        actual_ctags = list(storage.content_ctags_get(
            [data.sha1_2]))
        # then
        expected_ctags = [
            {
                'id': data.sha1_2,
                'name': 'done',
                'kind': 'variable',
                'line': 100,
                'lang': 'Scheme',
                'tool': tool
            }
        ]
        assert actual_ctags == expected_ctags
        # given
        ctag_v2 = ctag_v1.copy()
        ctag_v2.update({
            'ctags': [
                {
                    'name': 'done',
                    'kind': 'variable',
                    'line': 100,
                    'lang': 'Scheme',
                },
                {
                    'name': 'defn',
                    'kind': 'function',
                    'line': 120,
                    'lang': 'Scheme',
                }
            ]
        })
        storage.content_ctags_add([ctag_v2], conflict_update=True)
        actual_ctags = list(storage.content_ctags_get(
            [data.sha1_2]))
        # ctag did change as the v2 was used to overwrite v1
        expected_ctags = [
            {
                'id': data.sha1_2,
                'name': 'done',
                'kind': 'variable',
                'line': 100,
                'lang': 'Scheme',
                'tool': tool,
            },
            {
                'id': data.sha1_2,
                'name': 'defn',
                'kind': 'function',
                'line': 120,
                'lang': 'Scheme',
                'tool': tool,
            }
        ]
        assert actual_ctags == expected_ctags
class TestIndexerStorageContentMetadata(StorageETypeTester):
    """Test Indexer Storage content_metadata related methods
    """
    # configuration consumed by the inherited StorageETypeTester tests
    tool_name = 'swh-metadata-detector'
    endpoint_type = 'content_metadata'
    example_data = [
        {
            'metadata': {
                'other': {},
                'codeRepository': {
                    'type': 'git',
                    'url': 'https://github.com/moranegg/metadata_test'
                },
                'description': 'Simple package.json test for indexer',
                'name': 'test_metadata',
                'version': '0.0.1'
            },
        },
        {
            'metadata': {
                'other': {},
                'name': 'test_metadata',
                'version': '0.0.1'
            },
        },
    ]
class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester):
    """Test Indexer Storage revision_intrinsic_metadata related methods
    """
    tool_name = 'swh-metadata-detector'
    endpoint_type = 'revision_intrinsic_metadata'
    example_data = [
        {
            'metadata': {
                'other': {},
                'codeRepository': {
                    'type': 'git',
                    'url': 'https://github.com/moranegg/metadata_test'
                },
                'description': 'Simple package.json test for indexer',
                'name': 'test_metadata',
                'version': '0.0.1'
            },
            'mappings': ['mapping1'],
        },
        {
            'metadata': {
                'other': {},
                'name': 'test_metadata',
                'version': '0.0.1'
            },
            'mappings': ['mapping2'],
        },
    ]

    def test_revision_intrinsic_metadata_delete(
            self, swh_indexer_storage_with_data):
        """Deleting an added entry removes it from subsequent gets."""
        storage, data = swh_indexer_storage_with_data
        etype = self.endpoint_type
        tool = data.tools[self.tool_name]
        query = [data.sha1_2, data.sha1_1]
        data1 = {
            'id': data.sha1_2,
            **self.example_data[0],
            'indexer_configuration_id': tool['id'],
        }
        # when
        endpoint(storage, etype, 'add')([data1])
        endpoint(storage, etype, 'delete')([
            {
                'id': data.sha1_2,
                'indexer_configuration_id': tool['id'],
            }
        ])
        # then
        actual_data = list(endpoint(storage, etype, 'get')(query))
        # then
        assert not actual_data

    def test_revision_intrinsic_metadata_delete_nonexisting(
            self, swh_indexer_storage_with_data):
        """Deleting a non-existing entry must not raise."""
        storage, data = swh_indexer_storage_with_data
        etype = self.endpoint_type
        tool = data.tools[self.tool_name]
        endpoint(storage, etype, 'delete')([
            {
                'id': data.sha1_2,
                'indexer_configuration_id': tool['id'],
            }
        ])
class TestIndexerStorageContentFossologyLicence:
    """Test Indexer Storage content_fossology_license related methods."""
    # NOTE(review): class name spells 'Licence'; kept as-is to avoid breaking
    # any external test selection.
    def test_content_fossology_license_add__new_license_added(
            self, swh_indexer_storage_with_data):
        """Re-adding without conflict_update accumulates distinct licenses
        rather than overwriting."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool = data.tools['nomos']
        tool_id = tool['id']
        license_v1 = {
            'id': data.sha1_1,
            'licenses': ['Apache-2.0'],
            'indexer_configuration_id': tool_id,
        }
        # given
        storage.content_fossology_license_add([license_v1])
        # conflict does nothing
        storage.content_fossology_license_add([license_v1])
        # when
        actual_licenses = list(storage.content_fossology_license_get(
            [data.sha1_1]))
        # then
        expected_license = {
            data.sha1_1: [{
                'licenses': ['Apache-2.0'],
                'tool': tool,
            }]
        }
        assert actual_licenses == [expected_license]
        # given
        license_v2 = license_v1.copy()
        license_v2.update({
            'licenses': ['BSD-2-Clause'],
        })
        storage.content_fossology_license_add([license_v2])
        actual_licenses = list(storage.content_fossology_license_get(
            [data.sha1_1]))
        expected_license = {
            data.sha1_1: [{
                'licenses': ['Apache-2.0', 'BSD-2-Clause'],
                'tool': tool
            }]
        }
        # license did not change as the v2 was dropped.
        assert actual_licenses == [expected_license]

    def test_generate_content_fossology_license_get_range_limit_none(
            self, swh_indexer_storage_with_data):
        """license_get_range call with wrong limit input should fail"""
        storage, data = swh_indexer_storage_with_data
        with pytest.raises(ValueError) as e:
            storage.content_fossology_license_get_range(
                start=None, end=None, indexer_configuration_id=None,
                limit=None)
        assert e.value.args == (
            'Development error: limit should not be None',)

    def test_generate_content_fossology_license_get_range_no_limit(
            self, swh_indexer_storage_with_data):
        """license_get_range returns licenses within range provided"""
        storage, data = swh_indexer_storage_with_data
        # craft some consistent mimetypes
        fossology_licenses = data.fossology_licenses
        mimetypes = prepare_mimetypes_from(fossology_licenses)
        storage.content_mimetype_add(mimetypes, conflict_update=True)
        # add fossology_licenses to storage
        storage.content_fossology_license_add(fossology_licenses)
        # All ids from the db
        content_ids = sorted([c['id'] for c in fossology_licenses])
        start = content_ids[0]
        end = content_ids[-1]
        # retrieve fossology_licenses
        tool_id = fossology_licenses[0]['indexer_configuration_id']
        actual_result = storage.content_fossology_license_get_range(
            start, end, indexer_configuration_id=tool_id)
        actual_ids = actual_result['ids']
        actual_next = actual_result['next']
        assert len(fossology_licenses) == len(actual_ids)
        assert actual_next is None
        assert content_ids == actual_ids

    def test_generate_content_fossology_license_get_range_no_limit_with_filter(
            self, swh_indexer_storage_with_data):
        """This filters non textual, then returns results within range"""
        storage, data = swh_indexer_storage_with_data
        fossology_licenses = data.fossology_licenses
        mimetypes = data.mimetypes
        # craft some consistent mimetypes
        _mimetypes = prepare_mimetypes_from(fossology_licenses)
        # add binary mimetypes which will get filtered out in results
        for m in mimetypes:
            _mimetypes.append({
                'mimetype': 'binary',
                **m,
            })
        storage.content_mimetype_add(_mimetypes, conflict_update=True)
        # add fossology_licenses to storage
        storage.content_fossology_license_add(fossology_licenses)
        # All ids from the db
        content_ids = sorted([c['id'] for c in fossology_licenses])
        start = content_ids[0]
        end = content_ids[-1]
        # retrieve fossology_licenses
        tool_id = fossology_licenses[0]['indexer_configuration_id']
        actual_result = storage.content_fossology_license_get_range(
            start, end, indexer_configuration_id=tool_id)
        actual_ids = actual_result['ids']
        actual_next = actual_result['next']
        assert len(fossology_licenses) == len(actual_ids)
        assert actual_next is None
        assert content_ids == actual_ids

    def test_generate_fossology_license_get_range_limit(
            self, swh_indexer_storage_with_data):
        """fossology_license_get_range paginates results if limit exceeded"""
        storage, data = swh_indexer_storage_with_data
        fossology_licenses = data.fossology_licenses
        # craft some consistent mimetypes
        mimetypes = prepare_mimetypes_from(fossology_licenses)
        # add fossology_licenses to storage
        storage.content_mimetype_add(mimetypes, conflict_update=True)
        storage.content_fossology_license_add(fossology_licenses)
        # input the list of sha1s we want from storage
        content_ids = sorted([c['id'] for c in fossology_licenses])
        start = content_ids[0]
        end = content_ids[-1]
        # retrieve fossology_licenses limited to 3 results
        limited_results = len(fossology_licenses) - 1
        tool_id = fossology_licenses[0]['indexer_configuration_id']
        actual_result = storage.content_fossology_license_get_range(
            start, end,
            indexer_configuration_id=tool_id, limit=limited_results)
        actual_ids = actual_result['ids']
        actual_next = actual_result['next']
        assert limited_results == len(actual_ids)
        assert actual_next is not None
        assert actual_next == content_ids[-1]
        expected_fossology_licenses = content_ids[:-1]
        assert expected_fossology_licenses == actual_ids
        # retrieve next part
        actual_results2 = storage.content_fossology_license_get_range(
            start=end, end=end, indexer_configuration_id=tool_id)
        actual_ids2 = actual_results2['ids']
        actual_next2 = actual_results2['next']
        assert actual_next2 is None
        expected_fossology_licenses2 = [content_ids[-1]]
        assert expected_fossology_licenses2 == actual_ids2
class TestIndexerStorageOriginIntrinsicMetadata:
    def test_origin_intrinsic_metadata_get(
            self, swh_indexer_storage_with_data):
        """Added origin metadata is retrievable; unknown origins are skipped."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool_id = data.tools['swh-metadata-detector']['id']
        metadata = {
            'version': None,
            'name': None,
        }
        metadata_rev = {
            'id': data.revision_id_2,
            'metadata': metadata,
            'mappings': ['mapping1'],
            'indexer_configuration_id': tool_id,
        }
        metadata_origin = {
            'id': data.origin_url_1,
            'metadata': metadata,
            'indexer_configuration_id': tool_id,
            'mappings': ['mapping1'],
            'from_revision': data.revision_id_2,
        }
        # when: revision metadata must exist before origin metadata references it
        storage.revision_intrinsic_metadata_add([metadata_rev])
        storage.origin_intrinsic_metadata_add([metadata_origin])
        # then
        actual_metadata = list(storage.origin_intrinsic_metadata_get(
            [data.origin_url_1, 'no://where']))
        expected_metadata = [{
            'id': data.origin_url_1,
            'metadata': metadata,
            'tool': data.tools['swh-metadata-detector'],
            'from_revision': data.revision_id_2,
            'mappings': ['mapping1'],
        }]
        assert actual_metadata == expected_metadata
    def test_origin_intrinsic_metadata_delete(
            self, swh_indexer_storage_with_data):
        """Deleting one origin's metadata leaves other origins untouched."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool_id = data.tools['swh-metadata-detector']['id']
        metadata = {
            'version': None,
            'name': None,
        }
        metadata_rev = {
            'id': data.revision_id_2,
            'metadata': metadata,
            'mappings': ['mapping1'],
            'indexer_configuration_id': tool_id,
        }
        metadata_origin = {
            'id': data.origin_url_1,
            'metadata': metadata,
            'indexer_configuration_id': tool_id,
            'mappings': ['mapping1'],
            'from_revision': data.revision_id_2,
        }
        metadata_origin2 = metadata_origin.copy()
        metadata_origin2['id'] = data.origin_url_2
        # when
        storage.revision_intrinsic_metadata_add([metadata_rev])
        storage.origin_intrinsic_metadata_add([
            metadata_origin, metadata_origin2])
        storage.origin_intrinsic_metadata_delete([
            {
                'id': data.origin_url_1,
                'indexer_configuration_id': tool_id
            }
        ])
        # then
        actual_metadata = list(storage.origin_intrinsic_metadata_get(
            [data.origin_url_1, data.origin_url_2, 'no://where']))
        # normalize the expanded 'tool' dict back to its id for comparison
        for item in actual_metadata:
            item['indexer_configuration_id'] = item.pop('tool')['id']
        assert actual_metadata == [metadata_origin2]
def test_origin_intrinsic_metadata_delete_nonexisting(
self, swh_indexer_storage_with_data):
storage, data = swh_indexer_storage_with_data
tool_id = data.tools['swh-metadata-detector']['id']
storage.origin_intrinsic_metadata_delete([
{
'id': data.origin_url_1,
'indexer_configuration_id': tool_id
}
])
    def test_origin_intrinsic_metadata_add_drop_duplicate(
            self, swh_indexer_storage_with_data):
        """Without conflict_update, re-adding the same origin keeps v1."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool_id = data.tools['swh-metadata-detector']['id']
        metadata_v1 = {
            'version': None,
            'name': None,
        }
        metadata_rev_v1 = {
            'id': data.revision_id_1,
            'metadata': metadata_v1.copy(),
            'mappings': [],
            'indexer_configuration_id': tool_id,
        }
        metadata_origin_v1 = {
            'id': data.origin_url_1,
            'metadata': metadata_v1.copy(),
            'indexer_configuration_id': tool_id,
            'mappings': [],
            'from_revision': data.revision_id_1,
        }
        # given
        storage.revision_intrinsic_metadata_add([metadata_rev_v1])
        storage.origin_intrinsic_metadata_add([metadata_origin_v1])
        # when
        actual_metadata = list(storage.origin_intrinsic_metadata_get(
            [data.origin_url_1, 'no://where']))
        expected_metadata_v1 = [{
            'id': data.origin_url_1,
            'metadata': metadata_v1,
            'tool': data.tools['swh-metadata-detector'],
            'from_revision': data.revision_id_1,
            'mappings': [],
        }]
        assert actual_metadata == expected_metadata_v1
        # given
        metadata_v2 = metadata_v1.copy()
        metadata_v2.update({
            'name': 'test_metadata',
            'author': 'MG',
        })
        metadata_rev_v2 = metadata_rev_v1.copy()
        metadata_origin_v2 = metadata_origin_v1.copy()
        metadata_rev_v2['metadata'] = metadata_v2
        metadata_origin_v2['metadata'] = metadata_v2
        storage.revision_intrinsic_metadata_add([metadata_rev_v2])
        storage.origin_intrinsic_metadata_add([metadata_origin_v2])
        # then
        actual_metadata = list(storage.origin_intrinsic_metadata_get(
            [data.origin_url_1]))
        # metadata did not change as the v2 was dropped.
        assert actual_metadata == expected_metadata_v1
    def test_origin_intrinsic_metadata_add_update_in_place_duplicate(
            self, swh_indexer_storage_with_data):
        """With conflict_update=True, re-adding the same origin overwrites."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool_id = data.tools['swh-metadata-detector']['id']
        metadata_v1 = {
            'version': None,
            'name': None,
        }
        metadata_rev_v1 = {
            'id': data.revision_id_2,
            'metadata': metadata_v1,
            'mappings': [],
            'indexer_configuration_id': tool_id,
        }
        metadata_origin_v1 = {
            'id': data.origin_url_1,
            'metadata': metadata_v1.copy(),
            'indexer_configuration_id': tool_id,
            'mappings': [],
            'from_revision': data.revision_id_2,
        }
        # given
        storage.revision_intrinsic_metadata_add([metadata_rev_v1])
        storage.origin_intrinsic_metadata_add([metadata_origin_v1])
        # when
        actual_metadata = list(storage.origin_intrinsic_metadata_get(
            [data.origin_url_1]))
        # then
        expected_metadata_v1 = [{
            'id': data.origin_url_1,
            'metadata': metadata_v1,
            'tool': data.tools['swh-metadata-detector'],
            'from_revision': data.revision_id_2,
            'mappings': [],
        }]
        assert actual_metadata == expected_metadata_v1
        # given
        metadata_v2 = metadata_v1.copy()
        metadata_v2.update({
            'name': 'test_update_duplicated_metadata',
            'author': 'MG',
        })
        metadata_rev_v2 = metadata_rev_v1.copy()
        metadata_origin_v2 = metadata_origin_v1.copy()
        metadata_rev_v2['metadata'] = metadata_v2
        # v2 also changes mappings and from_revision
        metadata_origin_v2 = {
            'id': data.origin_url_1,
            'metadata': metadata_v2.copy(),
            'indexer_configuration_id': tool_id,
            'mappings': ['npm'],
            'from_revision': data.revision_id_1,
        }
        storage.revision_intrinsic_metadata_add(
            [metadata_rev_v2], conflict_update=True)
        storage.origin_intrinsic_metadata_add(
            [metadata_origin_v2], conflict_update=True)
        actual_metadata = list(storage.origin_intrinsic_metadata_get(
            [data.origin_url_1]))
        expected_metadata_v2 = [{
            'id': data.origin_url_1,
            'metadata': metadata_v2,
            'tool': data.tools['swh-metadata-detector'],
            'from_revision': data.revision_id_1,
            'mappings': ['npm'],
        }]
        # metadata did change as the v2 was used to overwrite v1
        assert actual_metadata == expected_metadata_v2
    def test_origin_intrinsic_metadata_add__update_in_place_deadlock(
            self, swh_indexer_storage_with_data):
        """Two overlapping conflict_update batches from concurrent threads
        must both complete (no deadlock) and leave all rows at version 2."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool_id = data.tools['swh-metadata-detector']['id']
        ids = list(range(10))
        example_data1 = {
            'metadata': {
                'version': None,
                'name': None,
            },
            'mappings': [],
        }
        example_data2 = {
            'metadata': {
                'version': 'v1.1.1',
                'name': 'foo',
            },
            'mappings': [],
        }
        metadata_rev_v1 = {
            'id': data.revision_id_2,
            'metadata': {
                'version': None,
                'name': None,
            },
            'mappings': [],
            'indexer_configuration_id': tool_id,
        }
        data_v1 = [
            {
                'id': 'file:///tmp/origin%d' % id_,
                'from_revision': data.revision_id_2,
                **example_data1,
                'indexer_configuration_id': tool_id,
            }
            for id_ in ids
        ]
        data_v2 = [
            {
                'id': 'file:///tmp/origin%d' % id_,
                'from_revision': data.revision_id_2,
                **example_data2,
                'indexer_configuration_id': tool_id,
            }
            for id_ in ids
        ]
        # Remove one item from each, so that both queries have to succeed for
        # all items to be in the DB.
        data_v2a = data_v2[1:]
        data_v2b = list(reversed(data_v2[0:-1]))
        # given
        storage.revision_intrinsic_metadata_add([metadata_rev_v1])
        storage.origin_intrinsic_metadata_add(data_v1)
        # when
        origins = ['file:///tmp/origin%d' % i for i in ids]
        actual_data = list(storage.origin_intrinsic_metadata_get(origins))
        expected_data_v1 = [
            {
                'id': 'file:///tmp/origin%d' % id_,
                'from_revision': data.revision_id_2,
                **example_data1,
                'tool': data.tools['swh-metadata-detector'],
            }
            for id_ in ids
        ]
        # then
        assert actual_data == expected_data_v1

        # given: opposite iteration orders, the classic lock-ordering trap
        def f1():
            storage.origin_intrinsic_metadata_add(
                data_v2a, conflict_update=True)

        def f2():
            storage.origin_intrinsic_metadata_add(
                data_v2b, conflict_update=True)

        t1 = threading.Thread(target=f1)
        t2 = threading.Thread(target=f2)
        t2.start()
        t1.start()
        t1.join()
        t2.join()
        actual_data = list(storage.origin_intrinsic_metadata_get(origins))
        expected_data_v2 = [
            {
                'id': 'file:///tmp/origin%d' % id_,
                'from_revision': data.revision_id_2,
                **example_data2,
                'tool': data.tools['swh-metadata-detector'],
            }
            for id_ in ids
        ]
        assert len(actual_data) == len(expected_data_v2)
        assert sorted(actual_data, key=lambda x: x['id']) == expected_data_v2
    def test_origin_intrinsic_metadata_add__duplicate_twice(
            self, swh_indexer_storage_with_data):
        """Adding the same origin twice in one batch raises ValueError."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool_id = data.tools['swh-metadata-detector']['id']
        metadata = {
            'developmentStatus': None,
            'name': None,
        }
        metadata_rev = {
            'id': data.revision_id_2,
            'metadata': metadata,
            'mappings': ['mapping1'],
            'indexer_configuration_id': tool_id,
        }
        metadata_origin = {
            'id': data.origin_url_1,
            'metadata': metadata,
            'indexer_configuration_id': tool_id,
            'mappings': ['mapping1'],
            'from_revision': data.revision_id_2,
        }
        # when
        storage.revision_intrinsic_metadata_add([metadata_rev])
        with pytest.raises(ValueError):
            storage.origin_intrinsic_metadata_add([
                metadata_origin, metadata_origin])
    def test_origin_intrinsic_metadata_search_fulltext(
            self, swh_indexer_storage_with_data):
        """Fulltext search matches all given terms conjunctively."""
        storage, data = swh_indexer_storage_with_data
        # given
        tool_id = data.tools['swh-metadata-detector']['id']
        metadata1 = {
            'author': 'John Doe',
        }
        metadata1_rev = {
            'id': data.revision_id_1,
            'metadata': metadata1,
            'mappings': [],
            'indexer_configuration_id': tool_id,
        }
        metadata1_origin = {
            'id': data.origin_url_1,
            'metadata': metadata1,
            'mappings': [],
            'indexer_configuration_id': tool_id,
            'from_revision': data.revision_id_1,
        }
        metadata2 = {
            'author': 'Jane Doe',
        }
        metadata2_rev = {
            'id': data.revision_id_2,
            'metadata': metadata2,
            'mappings': [],
            'indexer_configuration_id': tool_id,
        }
        metadata2_origin = {
            'id': data.origin_url_2,
            'metadata': metadata2,
            'mappings': [],
            'indexer_configuration_id': tool_id,
            'from_revision': data.revision_id_2,
        }
        # when
        storage.revision_intrinsic_metadata_add([metadata1_rev])
        storage.origin_intrinsic_metadata_add([metadata1_origin])
        storage.revision_intrinsic_metadata_add([metadata2_rev])
        storage.origin_intrinsic_metadata_add([metadata2_origin])
        # then
        search = storage.origin_intrinsic_metadata_search_fulltext
        assert set([res['id'] for res in search(['Doe'])]) \
            == set([data.origin_url_1, data.origin_url_2])
        assert [res['id'] for res in search(['John', 'Doe'])] \
            == [data.origin_url_1]
        assert [res['id'] for res in search(['John'])] \
            == [data.origin_url_1]
        # no single origin matches both first names
        assert not list(search(['John', 'Jane']))
def test_origin_intrinsic_metadata_search_fulltext_rank(
self, swh_indexer_storage_with_data):
storage, data = swh_indexer_storage_with_data
# given
tool_id = data.tools['swh-metadata-detector']['id']
# The following authors have "Random Person" to add some more content
# to the JSON data, to work around normalization quirks when there
# are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words
# for small values of nb_words).
metadata1 = {
'author': [
'Random Person',
'John Doe',
'Jane Doe',
]
}
metadata1_rev = {
'id': data.revision_id_1,
'metadata': metadata1,
'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata1_origin = {
'id': data.origin_url_1,
'metadata': metadata1,
'mappings': [],
'indexer_configuration_id': tool_id,
'from_revision': data.revision_id_1,
}
metadata2 = {
'author': [
'Random Person',
'Jane Doe',
]
}
metadata2_rev = {
'id': data.revision_id_2,
'metadata': metadata2,
'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata2_origin = {
'id': data.origin_url_2,
'metadata': metadata2,
'mappings': [],
'indexer_configuration_id': tool_id,
'from_revision': data.revision_id_2,
}
# when
storage.revision_intrinsic_metadata_add([metadata1_rev])
storage.origin_intrinsic_metadata_add([metadata1_origin])
storage.revision_intrinsic_metadata_add([metadata2_rev])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
search = storage.origin_intrinsic_metadata_search_fulltext
assert [res['id'] for res in search(['Doe'])] \
== [data.origin_url_1, data.origin_url_2]
assert [res['id'] for res in search(['Doe'], limit=1)] \
== [data.origin_url_1]
assert [res['id'] for res in search(['John'])] \
== [data.origin_url_1]
assert [res['id'] for res in search(['Jane'])] \
== [data.origin_url_2, data.origin_url_1]
assert [res['id'] for res in search(['John', 'Jane'])] \
== [data.origin_url_1]
def _fill_origin_intrinsic_metadata(
self, swh_indexer_storage_with_data):
storage, data = swh_indexer_storage_with_data
tool1_id = data.tools['swh-metadata-detector']['id']
tool2_id = data.tools['swh-metadata-detector2']['id']
metadata1 = {
'@context': 'foo',
'author': 'John Doe',
}
metadata1_rev = {
'id': data.revision_id_1,
'metadata': metadata1,
'mappings': ['npm'],
'indexer_configuration_id': tool1_id,
}
metadata1_origin = {
'id': data.origin_url_1,
'metadata': metadata1,
'mappings': ['npm'],
'indexer_configuration_id': tool1_id,
'from_revision': data.revision_id_1,
}
metadata2 = {
'@context': 'foo',
'author': 'Jane Doe',
}
metadata2_rev = {
'id': data.revision_id_2,
'metadata': metadata2,
'mappings': ['npm', 'gemspec'],
'indexer_configuration_id': tool2_id,
}
metadata2_origin = {
'id': data.origin_url_2,
'metadata': metadata2,
'mappings': ['npm', 'gemspec'],
'indexer_configuration_id': tool2_id,
'from_revision': data.revision_id_2,
}
metadata3 = {
'@context': 'foo',
}
metadata3_rev = {
'id': data.revision_id_3,
'metadata': metadata3,
'mappings': ['npm', 'gemspec'],
'indexer_configuration_id': tool2_id,
}
metadata3_origin = {
'id': data.origin_url_3,
'metadata': metadata3,
'mappings': ['pkg-info'],
'indexer_configuration_id': tool2_id,
'from_revision': data.revision_id_3,
}
storage.revision_intrinsic_metadata_add([metadata1_rev])
storage.origin_intrinsic_metadata_add([metadata1_origin])
storage.revision_intrinsic_metadata_add([metadata2_rev])
storage.origin_intrinsic_metadata_add([metadata2_origin])
storage.revision_intrinsic_metadata_add([metadata3_rev])
storage.origin_intrinsic_metadata_add([metadata3_origin])
def test_origin_intrinsic_metadata_search_by_producer(
self, swh_indexer_storage_with_data):
storage, data = swh_indexer_storage_with_data
self._fill_origin_intrinsic_metadata(
swh_indexer_storage_with_data)
tool1 = data.tools['swh-metadata-detector']
tool2 = data.tools['swh-metadata-detector2']
endpoint = storage.origin_intrinsic_metadata_search_by_producer
# test pagination
# no 'page_token' param, return all origins
result = endpoint(ids_only=True)
assert result['origins'] \
== [data.origin_url_1, data.origin_url_2, data.origin_url_3]
assert 'next_page_token' not in result
# 'page_token' is < than origin_1, return everything
result = endpoint(page_token=data.origin_url_1[:-1], ids_only=True)
assert result['origins'] \
== [data.origin_url_1, data.origin_url_2, data.origin_url_3]
assert 'next_page_token' not in result
# 'page_token' is origin_3, return nothing
result = endpoint(page_token=data.origin_url_3, ids_only=True)
assert not result['origins']
assert 'next_page_token' not in result
# test limit argument
result = endpoint(page_token=data.origin_url_1[:-1],
limit=2, ids_only=True)
assert result['origins'] == [data.origin_url_1, data.origin_url_2]
assert result['next_page_token'] == result['origins'][-1]
result = endpoint(page_token=data.origin_url_1, limit=2, ids_only=True)
assert result['origins'] == [data.origin_url_2, data.origin_url_3]
assert 'next_page_token' not in result
result = endpoint(page_token=data.origin_url_2, limit=2, ids_only=True)
assert result['origins'] == [data.origin_url_3]
assert 'next_page_token' not in result
# test mappings filtering
result = endpoint(mappings=['npm'], ids_only=True)
assert result['origins'] == [data.origin_url_1, data.origin_url_2]
assert 'next_page_token' not in result
result = endpoint(mappings=['npm', 'gemspec'], ids_only=True)
assert result['origins'] == [data.origin_url_1, data.origin_url_2]
assert 'next_page_token' not in result
result = endpoint(mappings=['gemspec'], ids_only=True)
assert result['origins'] == [data.origin_url_2]
assert 'next_page_token' not in result
result = endpoint(mappings=['pkg-info'], ids_only=True)
assert result['origins'] == [data.origin_url_3]
assert 'next_page_token' not in result
result = endpoint(mappings=['foobar'], ids_only=True)
assert not result['origins']
assert 'next_page_token' not in result
# test pagination + mappings
result = endpoint(mappings=['npm'], limit=1, ids_only=True)
assert result['origins'] == [data.origin_url_1]
assert result['next_page_token'] == result['origins'][-1]
# test tool filtering
result = endpoint(tool_ids=[tool1['id']], ids_only=True)
assert result['origins'] == [data.origin_url_1]
assert 'next_page_token' not in result
result = endpoint(tool_ids=[tool2['id']], ids_only=True)
assert sorted(result['origins']) \
== [data.origin_url_2, data.origin_url_3]
assert 'next_page_token' not in result
result = endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True)
assert sorted(result['origins']) \
== [data.origin_url_1, data.origin_url_2, data.origin_url_3]
assert 'next_page_token' not in result
# test ids_only=False
assert endpoint(mappings=['gemspec'])['origins'] \
== [{
'id': data.origin_url_2,
'metadata': {
'@context': 'foo',
'author': 'Jane Doe',
},
'mappings': ['npm', 'gemspec'],
'tool': tool2,
'from_revision': data.revision_id_2,
}]
def test_origin_intrinsic_metadata_stats(
self, swh_indexer_storage_with_data):
storage, data = swh_indexer_storage_with_data
self._fill_origin_intrinsic_metadata(
swh_indexer_storage_with_data)
result = storage.origin_intrinsic_metadata_stats()
assert result == {
'per_mapping': {
'gemspec': 1,
'npm': 2,
'pkg-info': 1,
'codemeta': 0,
'maven': 0,
},
'total': 3,
'non_empty': 2,
}
class TestIndexerStorageIndexerCondifuration:
    """Tests for the indexer_configuration_* endpoints."""
    # NOTE(review): the class name carries a typo ("Condifuration");
    # kept as-is to avoid disturbing anything referencing this name.

    def test_indexer_configuration_add(
            self, swh_indexer_storage_with_data):
        """Adding the same tool twice is idempotent and returns the
        same identifier both times."""
        storage, data = swh_indexer_storage_with_data
        tool = {
            'tool_name': 'some-unknown-tool',
            'tool_version': 'some-version',
            'tool_configuration': {"debian-package": "some-package"},
        }
        # the tool is unknown at first
        assert storage.indexer_configuration_get(tool) is None

        # first insertion creates the record
        first_batch = list(storage.indexer_configuration_add([tool]))
        assert len(first_batch) == 1
        created = first_batch[0]
        assert created is not None
        first_id = created.pop('id')
        assert created == tool

        # re-inserting yields the same record with the same id
        second_batch = list(storage.indexer_configuration_add([tool]))
        duplicate = second_batch[0]
        assert duplicate is not None
        second_id = duplicate.pop('id')
        assert first_id == second_id
        assert created == duplicate

    def test_indexer_configuration_add_multiple(
            self, swh_indexer_storage_with_data):
        """A batch may mix an already-known tool with a new one."""
        storage, data = swh_indexer_storage_with_data
        known_tool = {
            'tool_name': 'some-unknown-tool',
            'tool_version': 'some-version',
            'tool_configuration': {"debian-package": "some-package"},
        }
        inserted = list(storage.indexer_configuration_add([known_tool]))
        assert len(inserted) == 1

        batch = [known_tool, {
            'tool_name': 'yet-another-tool',
            'tool_version': 'version',
            'tool_configuration': {},
        }]
        returned = list(storage.indexer_configuration_add(batch))
        assert len(returned) == 2
        # the endpoint does not guarantee ordering, so check membership
        for entry in returned:
            assert entry.pop('id') is not None
            assert entry in batch

    def test_indexer_configuration_get_missing(
            self, swh_indexer_storage_with_data):
        """Looking up a tool that was never added yields None."""
        storage, data = swh_indexer_storage_with_data
        unknown = {
            'tool_name': 'unknown-tool',
            'tool_version': '3.1.0rc2-31-ga2cbb8c',
            'tool_configuration': {"command_line": "nomossa <filepath>"},
        }
        assert storage.indexer_configuration_get(unknown) is None

    def test_indexer_configuration_get(
            self, swh_indexer_storage_with_data):
        """A pre-loaded tool is retrievable; modulo the generated 'id',
        the record matches the query."""
        storage, data = swh_indexer_storage_with_data
        query = {
            'tool_name': 'nomos',
            'tool_version': '3.1.0rc2-31-ga2cbb8c',
            'tool_configuration': {"command_line": "nomossa <filepath>"},
        }
        found = storage.indexer_configuration_get(query)
        assert found
        del found['id']
        assert found == query

    def test_indexer_configuration_metadata_get_missing_context(
            self, swh_indexer_storage_with_data):
        """A translator tool with an unknown context is not found."""
        storage, data = swh_indexer_storage_with_data
        unknown = {
            'tool_name': 'swh-metadata-translator',
            'tool_version': '0.0.1',
            'tool_configuration': {"context": "unknown-context"},
        }
        assert storage.indexer_configuration_get(unknown) is None

    def test_indexer_configuration_metadata_get(
            self, swh_indexer_storage_with_data):
        """A translator tool added with a context is fetched back,
        id included."""
        storage, data = swh_indexer_storage_with_data
        tool = {
            'tool_name': 'swh-metadata-translator',
            'tool_version': '0.0.1',
            'tool_configuration': {"type": "local", "context": "NpmMapping"},
        }
        storage.indexer_configuration_add([tool])
        fetched = storage.indexer_configuration_get(tool)
        assert fetched
        expected = dict(tool, id=fetched['id'])
        assert fetched == expected
class TestIndexerStorageMisc:
    """Misc endpoints tests for the IndexerStorage.
    """

    def test_check_config(self, swh_indexer_storage):
        """check_config succeeds whether or not write access is asked."""
        storage = swh_indexer_storage
        for check_write in (True, False):
            assert storage.check_config(check_write=check_write)

File Metadata

Mime Type
text/x-diff
Expires
Jun 4 2025, 7:43 PM (10 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3284723

Event Timeline