Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/storage/in_memory.py
# Copyright (C) 2018 The Software Heritage developers | # Copyright (C) 2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import bisect | |||||
from collections import defaultdict | from collections import defaultdict | ||||
import json | import json | ||||
SHA1_DIGEST_SIZE = 160 | SHA1_DIGEST_SIZE = 160 | ||||
def _transform_tool(tool): | def _transform_tool(tool): | ||||
return { | return { | ||||
'id': tool['id'], | 'id': tool['id'], | ||||
'name': tool['tool_name'], | 'name': tool['tool_name'], | ||||
'version': tool['tool_version'], | 'version': tool['tool_version'], | ||||
'configuration': tool['tool_configuration'], | 'configuration': tool['tool_configuration'], | ||||
} | } | ||||
class SubStorage: | class SubStorage: | ||||
"""Implements common missing/get/add logic for each indexer type.""" | """Implements common missing/get/add logic for each indexer type.""" | ||||
def __init__(self, tools): | def __init__(self, tools): | ||||
self._tools = tools | self._tools = tools | ||||
self._sorted_ids = [] | |||||
self._data = {} # map (id_, tool_id) -> metadata_dict | self._data = {} # map (id_, tool_id) -> metadata_dict | ||||
self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] | self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] | ||||
def missing(self, ids): | def missing(self, ids): | ||||
"""List data missing from storage. | """List data missing from storage. | ||||
Args: | Args: | ||||
data (iterable): dictionaries with keys: | data (iterable): dictionaries with keys: | ||||
Show All 30 Lines | def get(self, ids): | ||||
for tool_id in self._tools_per_id.get(id_, set()): | for tool_id in self._tools_per_id.get(id_, set()): | ||||
key = (id_, tool_id) | key = (id_, tool_id) | ||||
yield { | yield { | ||||
'id': id_, | 'id': id_, | ||||
'tool': _transform_tool(self._tools[tool_id]), | 'tool': _transform_tool(self._tools[tool_id]), | ||||
**self._data[key], | **self._data[key], | ||||
} | } | ||||
def get_range(self, start, end, indexer_configuration_id, limit): | |||||
"""Retrieve data within range [start, end] bound by limit. | |||||
Args: | |||||
**start** (bytes): Starting identifier range (expected smaller | |||||
than end) | |||||
**end** (bytes): Ending identifier range (expected larger | |||||
than start) | |||||
**indexer_configuration_id** (int): The tool used to index data | |||||
**limit** (int): Limit result | |||||
Raises: | |||||
ValueError for limit to None | |||||
Returns: | |||||
a dict with keys: | |||||
- **ids** [bytes]: iterable of content ids within the range. | |||||
- **next** (Optional[bytes]): The next range of sha1 starts at | |||||
this sha1 if any | |||||
""" | |||||
if limit is None: | |||||
raise ValueError('Development error: limit should not be None') | |||||
from_index = bisect.bisect_left(self._sorted_ids, start) | |||||
to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index) | |||||
if to_index - from_index >= limit: | |||||
return { | |||||
'ids': self._sorted_ids[from_index:from_index+limit], | |||||
'next': self._sorted_ids[from_index+limit], | |||||
} | |||||
else: | |||||
return { | |||||
'ids': self._sorted_ids[from_index:to_index], | |||||
'next': None, | |||||
} | |||||
def add(self, data, conflict_update): | def add(self, data, conflict_update): | ||||
"""Add data not present in storage. | """Add data not present in storage. | ||||
Args: | Args: | ||||
data (iterable): dictionaries with keys: | data (iterable): dictionaries with keys: | ||||
- **id**: sha1 | - **id**: sha1 | ||||
- **indexer_configuration_id**: tool used to compute the | - **indexer_configuration_id**: tool used to compute the | ||||
Show All 11 Lines | def add(self, data, conflict_update): | ||||
data = item | data = item | ||||
if not conflict_update and \ | if not conflict_update and \ | ||||
tool_id in self._tools_per_id.get(id_, set()): | tool_id in self._tools_per_id.get(id_, set()): | ||||
# Duplicate, should not be updated | # Duplicate, should not be updated | ||||
continue | continue | ||||
key = (id_, tool_id) | key = (id_, tool_id) | ||||
self._data[key] = data | self._data[key] = data | ||||
self._tools_per_id[id_].add(tool_id) | self._tools_per_id[id_].add(tool_id) | ||||
if id_ not in self._sorted_ids: | |||||
bisect.insort(self._sorted_ids, id_) | |||||
def add_merge(self, new_data, conflict_update, merged_key): | |||||
for new_item in new_data: | |||||
id_ = new_item['id'] | |||||
tool_id = new_item['indexer_configuration_id'] | |||||
if conflict_update: | |||||
all_subitems = [] | |||||
else: | |||||
existing = list(self.get([id_])) | |||||
all_subitems = [ | |||||
old_subitem | |||||
for existing_item in existing | |||||
if existing_item['tool']['id'] == tool_id | |||||
for old_subitem in existing_item[merged_key] | |||||
] | |||||
for new_subitem in new_item[merged_key]: | |||||
if new_subitem not in all_subitems: | |||||
all_subitems.append(new_subitem) | |||||
self.add([ | |||||
{ | |||||
'id': id_, | |||||
'indexer_configuration_id': tool_id, | |||||
merged_key: all_subitems, | |||||
} | |||||
], conflict_update=True) | |||||
if id_ not in self._sorted_ids: | |||||
bisect.insort(self._sorted_ids, id_) | |||||
ardumont: That's not supposed to be merge.
As i mention early in our discussion [1]
Another point to… | |||||
class IndexerStorage: | class IndexerStorage: | ||||
"""In-memory SWH indexer storage.""" | """In-memory SWH indexer storage.""" | ||||
def __init__(self): | def __init__(self): | ||||
Not Done Inline ActionsMy point is, if you add a simple add scenario here, everything should still be green. ardumont: My point is, if you add a simple add scenario here, everything should still be green. | |||||
Not Done Inline ActionsOk, so everything is fine in the end. ardumont: Ok, so everything is fine in the end.
It's also an implementation detail of the in-memory… | |||||
self._tools = {} | self._tools = {} | ||||
self._mimetypes = SubStorage(self._tools) | self._mimetypes = SubStorage(self._tools) | ||||
self._content_ctags = SubStorage(self._tools) | self._content_ctags = SubStorage(self._tools) | ||||
self._licenses = SubStorage(self._tools) | |||||
self._content_metadata = SubStorage(self._tools) | self._content_metadata = SubStorage(self._tools) | ||||
self._revision_metadata = SubStorage(self._tools) | self._revision_metadata = SubStorage(self._tools) | ||||
def content_mimetype_missing(self, mimetypes): | def content_mimetype_missing(self, mimetypes): | ||||
"""Generate mimetypes missing from storage. | """Generate mimetypes missing from storage. | ||||
Args: | Args: | ||||
mimetypes (iterable): iterable of dict with keys: | mimetypes (iterable): iterable of dict with keys: | ||||
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines | def content_ctags_get(self, ids): | ||||
} | } | ||||
def content_ctags_add(self, ctags, conflict_update=False): | def content_ctags_add(self, ctags, conflict_update=False): | ||||
"""Add ctags not present in storage | """Add ctags not present in storage | ||||
Args: | Args: | ||||
ctags (iterable): dictionaries with keys: | ctags (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1 | - **id** (bytes): sha1 | ||||
Not Done Inline ActionsThat's actually the other way around as you mentioned in T1443. ardumont: That's actually the other way around as you mentioned in T1443. | |||||
Not Done Inline Actionsfixed in another diff. ardumont: fixed in another diff. | |||||
- **ctags** ([list): List of dictionary with keys: name, kind, | - **ctags** ([list): List of dictionary with keys: name, kind, | ||||
line, lang | line, lang | ||||
- **indexer_configuration_id**: tool used to compute the | - **indexer_configuration_id**: tool used to compute the | ||||
results | results | ||||
""" | """ | ||||
for item in ctags: | self._content_ctags.add_merge(ctags, conflict_update, 'ctags') | ||||
tool_id = item['indexer_configuration_id'] | |||||
if conflict_update: | |||||
item_ctags = [] | |||||
else: | |||||
# merge old ctags with new ctags | |||||
existing = list(self._content_ctags.get([item['id']])) | |||||
item_ctags = [ | |||||
{ | |||||
key: ctags_item[key] | |||||
for key in ('name', 'kind', 'line', 'lang') | |||||
} | |||||
for existing_item in existing | |||||
if existing_item['tool']['id'] == tool_id | |||||
for ctags_item in existing_item['ctags'] | |||||
] | |||||
for new_item_ctags in item['ctags']: | |||||
if new_item_ctags not in item_ctags: | |||||
item_ctags.append(new_item_ctags) | |||||
self._content_ctags.add([ | |||||
{ | |||||
'id': item['id'], | |||||
'indexer_configuration_id': tool_id, | |||||
'ctags': item_ctags, | |||||
} | |||||
], conflict_update=True) | |||||
def content_ctags_search(self, expression, | def content_ctags_search(self, expression, | ||||
limit=10, last_sha1=None, db=None, cur=None): | limit=10, last_sha1=None, db=None, cur=None): | ||||
"""Search through content's raw ctags symbols. | """Search through content's raw ctags symbols. | ||||
Args: | Args: | ||||
expression (str): Expression to search for | expression (str): Expression to search for | ||||
limit (int): Number of rows to return (default to 10). | limit (int): Number of rows to return (default to 10). | ||||
Show All 15 Lines | def content_ctags_search(self, expression, | ||||
yield { | yield { | ||||
'id': id_, | 'id': id_, | ||||
'tool': _transform_tool(self._tools[tool_id]), | 'tool': _transform_tool(self._tools[tool_id]), | ||||
**ctags_item | **ctags_item | ||||
} | } | ||||
if nb_matches >= limit: | if nb_matches >= limit: | ||||
return | return | ||||
def content_fossology_license_get(self, ids): | |||||
"""Retrieve licenses per id. | |||||
Args: | |||||
ids (iterable): sha1 checksums | |||||
Yields: | |||||
`{id: facts}` where `facts` is a dict with the following keys: | |||||
- **licenses** ([str]): associated licenses for that content | |||||
- **tool** (dict): Tool used to compute the license | |||||
""" | |||||
# TODO: remove this reformatting in order to yield items with the | |||||
# same format as other _get methods. | |||||
res = {} | |||||
for d in self._licenses.get(ids): | |||||
res.setdefault(d.pop('id'), []).append(d) | |||||
for (id_, facts) in res.items(): | |||||
yield {id_: facts} | |||||
def content_fossology_license_add(self, licenses, conflict_update=False): | |||||
"""Add licenses not present in storage. | |||||
Args: | |||||
licenses (iterable): dictionaries with keys: | |||||
- **id**: sha1 | |||||
- **licenses** ([bytes]): List of licenses associated to sha1 | |||||
- **tool** (str): nomossa | |||||
conflict_update: Flag to determine if we want to overwrite (true) | |||||
or skip duplicates (false, the default) | |||||
Returns: | |||||
list: content_license entries which failed due to unknown licenses | |||||
""" | |||||
self._licenses.add_merge(licenses, conflict_update, 'licenses') | |||||
def content_fossology_license_get_range( | |||||
self, start, end, indexer_configuration_id, limit=1000): | |||||
"""Retrieve licenses within range [start, end] bound by limit. | |||||
Args: | |||||
**start** (bytes): Starting identifier range (expected smaller | |||||
than end) | |||||
**end** (bytes): Ending identifier range (expected larger | |||||
than start) | |||||
**indexer_configuration_id** (int): The tool used to index data | |||||
**limit** (int): Limit result (default to 1000) | |||||
Raises: | |||||
ValueError for limit to None | |||||
Returns: | |||||
a dict with keys: | |||||
- **ids** [bytes]: iterable of content ids within the range. | |||||
- **next** (Optional[bytes]): The next range of sha1 starts at | |||||
this sha1 if any | |||||
""" | |||||
return self._licenses.get_range( | |||||
start, end, indexer_configuration_id, limit) | |||||
def content_metadata_missing(self, metadata): | def content_metadata_missing(self, metadata): | ||||
"""List metadata missing from storage. | """List metadata missing from storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1 identifier | - **id** (bytes): sha1 identifier | ||||
- **indexer_configuration_id** (int): tool used to compute | - **indexer_configuration_id** (int): tool used to compute | ||||
▲ Show 20 Lines • Show All 137 Lines • Show Last 20 Lines |
That's not supposed to be a merge.
As I mentioned earlier in our discussion [1].
Another point, in addition to what I said earlier: there is no merge scenario in the indexer storage tests.
Knowing me, I would have added one if I had initially added this behavior.
[1] D783#inline-4276