# swh/indexer/storage/in_memory.py
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
import json
SHA1_DIGEST_SIZE = 160 | |||||
class MetadataStorage: | |||||
"""Implements missing/get/add logic for both content_metadata and | |||||
revision_metadata.""" | |||||
def __init__(self, tools): | |||||
self._tools = tools | |||||
self._metadata = {} # map (id_, tool_id) -> metadata_dict | |||||
self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] | |||||
def _transform_tool(self, tool): | def _transform_tool(tool): | ||||
return { | return { | ||||
'id': tool['id'], | 'id': tool['id'], | ||||
'name': tool['tool_name'], | 'name': tool['tool_name'], | ||||
'version': tool['tool_version'], | 'version': tool['tool_version'], | ||||
'configuration': tool['tool_configuration'], | 'configuration': tool['tool_configuration'], | ||||
} | } | ||||
class SubStorage: | |||||
"""Implements common missing/get/add logic for each indexer type.""" | |||||
def __init__(self, tools): | |||||
self._tools = tools | |||||
self._data = {} # map (id_, tool_id) -> metadata_dict | |||||
self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] | |||||
def missing(self, ids): | def missing(self, ids): | ||||
"""List metadata missing from storage. | """List data missing from storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | data (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1 identifier | - **id** (bytes): sha1 identifier | ||||
- **indexer_configuration_id** (int): tool used to compute | - **indexer_configuration_id** (int): tool used to compute | ||||
the results | the results | ||||
Yields: | Yields: | ||||
missing sha1s | missing sha1s | ||||
""" | """ | ||||
for id_ in ids: | for id_ in ids: | ||||
tool_id = id_['indexer_configuration_id'] | tool_id = id_['indexer_configuration_id'] | ||||
id_ = id_['id'] | id_ = id_['id'] | ||||
if tool_id not in self._tools_per_id.get(id_, set()): | if tool_id not in self._tools_per_id.get(id_, set()): | ||||
yield id_ | yield id_ | ||||
def get(self, ids): | def get(self, ids): | ||||
"""Retrieve metadata per id. | """Retrieve data per id. | ||||
Args: | Args: | ||||
ids (iterable): sha1 checksums | ids (iterable): sha1 checksums | ||||
Yields: | Yields: | ||||
dict: dictionaries with the following keys: | dict: dictionaries with the following keys: | ||||
- **id** (bytes) | - **id** (bytes) | ||||
- **translated_metadata** (str): associated metadata | |||||
- **tool** (dict): tool used to compute metadata | - **tool** (dict): tool used to compute metadata | ||||
- arbitrary data (as provided to `add`) | |||||
""" | """ | ||||
for id_ in ids: | for id_ in ids: | ||||
for tool_id in self._tools_per_id.get(id_, set()): | for tool_id in self._tools_per_id.get(id_, set()): | ||||
key = (id_, tool_id) | key = (id_, tool_id) | ||||
yield { | yield { | ||||
'id': id_, | 'id': id_, | ||||
'tool': self._transform_tool(self._tools[tool_id]), | 'tool': _transform_tool(self._tools[tool_id]), | ||||
'translated_metadata': self._metadata[key], | **self._data[key], | ||||
} | } | ||||
def add(self, metadata, conflict_update): | def add(self, data, conflict_update): | ||||
"""Add metadata not present in storage. | """Add data not present in storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | data (iterable): dictionaries with keys: | ||||
- **id**: sha1 | - **id**: sha1 | ||||
- **translated_metadata**: arbitrary dict | |||||
- **indexer_configuration_id**: tool used to compute the | - **indexer_configuration_id**: tool used to compute the | ||||
results | results | ||||
- arbitrary data | |||||
conflict_update (bool): Flag to determine if we want to overwrite | conflict_update (bool): Flag to determine if we want to overwrite | ||||
(true) or skip duplicates (false) | (true) or skip duplicates (false) | ||||
""" | """ | ||||
for item in metadata: | for item in data: | ||||
tool_id = item['indexer_configuration_id'] | item = item.copy() | ||||
data = item['translated_metadata'] | tool_id = item.pop('indexer_configuration_id') | ||||
id_ = item['id'] | id_ = item.pop('id') | ||||
data = item | |||||
if not conflict_update and \ | if not conflict_update and \ | ||||
tool_id in self._tools_per_id.get(id_, set()): | tool_id in self._tools_per_id.get(id_, set()): | ||||
# Duplicate, should not be updated | # Duplicate, should not be updated | ||||
continue | continue | ||||
key = (id_, tool_id) | key = (id_, tool_id) | ||||
self._metadata[key] = data | self._data[key] = data | ||||
self._tools_per_id[id_].add(tool_id) | self._tools_per_id[id_].add(tool_id) | ||||
class IndexerStorage: | class IndexerStorage: | ||||
"""In-memory SWH indexer storage.""" | """In-memory SWH indexer storage.""" | ||||
def __init__(self): | def __init__(self): | ||||
self._tools = {} | self._tools = {} | ||||
self._content_metadata = MetadataStorage(self._tools) | self._mimetypes = SubStorage(self._tools) | ||||
self._revision_metadata = MetadataStorage(self._tools) | self._content_ctags = SubStorage(self._tools) | ||||
self._content_metadata = SubStorage(self._tools) | |||||
self._revision_metadata = SubStorage(self._tools) | |||||
def content_mimetype_missing(self, mimetypes): | |||||
"""Generate mimetypes missing from storage. | |||||
Args: | |||||
mimetypes (iterable): iterable of dict with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **indexer_configuration_id** (int): tool used to compute the | |||||
results | |||||
Yields: | |||||
tuple (id, indexer_configuration_id): missing id | |||||
""" | |||||
yield from self._mimetypes.missing(mimetypes) | |||||
def content_mimetype_add(self, mimetypes, conflict_update=False): | |||||
"""Add mimetypes not present in storage. | |||||
Args: | |||||
mimetypes (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **mimetype** (bytes): raw content's mimetype | |||||
- **encoding** (bytes): raw content's encoding | |||||
- **indexer_configuration_id** (int): tool's id used to | |||||
compute the results | |||||
- **conflict_update** (bool): Flag to determine if we want to | |||||
overwrite (``True``) or skip duplicates (``False``, the | |||||
default) | |||||
""" | |||||
self._mimetypes.add(mimetypes, conflict_update) | |||||
def content_mimetype_get(self, ids, db=None, cur=None): | |||||
"""Retrieve full content mimetype per ids. | |||||
Args: | |||||
ids (iterable): sha1 identifier | |||||
Yields: | |||||
mimetypes (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **mimetype** (bytes): raw content's mimetype | |||||
- **encoding** (bytes): raw content's encoding | |||||
- **tool** (dict): Tool used to compute the language | |||||
""" | |||||
yield from self._mimetypes.get(ids) | |||||
def content_ctags_missing(self, ctags): | |||||
"""List ctags missing from storage. | |||||
Args: | |||||
ctags (iterable): dicts with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **indexer_configuration_id** (int): tool used to compute | |||||
the results | |||||
Yields: | |||||
an iterable of missing id for the tuple (id, | |||||
indexer_configuration_id) | |||||
""" | |||||
yield from self._content_ctags.missing(ctags) | |||||
def content_ctags_get(self, ids): | |||||
"""Retrieve ctags per id. | |||||
Args: | |||||
ids (iterable): sha1 checksums | |||||
Yields: | |||||
Dictionaries with keys: | |||||
- **id** (bytes): content's identifier | |||||
- **name** (str): symbol's name | |||||
- **kind** (str): symbol's kind | |||||
- **lang** (str): language for that content | |||||
- **tool** (dict): tool used to compute the ctags' info | |||||
""" | |||||
for item in self._content_ctags.get(ids): | |||||
for item_ctags_item in item['ctags']: | |||||
yield { | |||||
'id': item['id'], | |||||
'tool': item['tool'], | |||||
**item_ctags_item | |||||
} | |||||
def content_ctags_add(self, ctags, conflict_update=False): | |||||
"""Add ctags not present in storage | |||||
Args: | |||||
ctags (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 | |||||
- **ctags** ([list): List of dictionary with keys: name, kind, | |||||
line, lang | |||||
- **indexer_configuration_id**: tool used to compute the | |||||
results | |||||
""" | |||||
for item in ctags: | |||||
tool_id = item['indexer_configuration_id'] | |||||
if conflict_update: | |||||
item_ctags = [] | |||||
else: | |||||
# merge old ctags with new ctags | |||||
existing = list(self._content_ctags.get([item['id']])) | |||||
item_ctags = [ | |||||
{ | |||||
key: ctags_item[key] | |||||
for key in ('name', 'kind', 'line', 'lang') | |||||
} | |||||
for existing_item in existing | |||||
if existing_item['tool']['id'] == tool_id | |||||
for ctags_item in existing_item['ctags'] | |||||
] | |||||
for new_item_ctags in item['ctags']: | |||||
if new_item_ctags not in item_ctags: | |||||
item_ctags.append(new_item_ctags) | |||||
self._content_ctags.add([ | |||||
{ | |||||
'id': item['id'], | |||||
'indexer_configuration_id': tool_id, | |||||
'ctags': item_ctags, | |||||
} | |||||
], conflict_update=True) | |||||
def content_ctags_search(self, expression, | |||||
limit=10, last_sha1=None, db=None, cur=None): | |||||
"""Search through content's raw ctags symbols. | |||||
Args: | |||||
expression (str): Expression to search for | |||||
limit (int): Number of rows to return (default to 10). | |||||
last_sha1 (str): Offset from which retrieving data (default to ''). | |||||
Yields: | |||||
rows of ctags including id, name, lang, kind, line, etc... | |||||
""" | |||||
nb_matches = 0 | |||||
for ((id_, tool_id), item) in \ | |||||
sorted(self._content_ctags._data.items()): | |||||
if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): | |||||
continue | |||||
nb_matches += 1 | |||||
for ctags_item in item['ctags']: | |||||
if ctags_item['name'] != expression: | |||||
continue | |||||
yield { | |||||
'id': id_, | |||||
'tool': _transform_tool(self._tools[tool_id]), | |||||
**ctags_item | |||||
} | |||||
if nb_matches >= limit: | |||||
return | |||||
def content_metadata_missing(self, metadata): | def content_metadata_missing(self, metadata): | ||||
"""List metadata missing from storage. | """List metadata missing from storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1 identifier | - **id** (bytes): sha1 identifier | ||||
▲ Show 20 Lines • Show All 138 Lines • Show Last 20 Lines |