swh/indexer/storage/in_memory.py
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import bisect
from collections import defaultdict, Counter
import itertools
import json
import operator
import math
import re

SHA1_DIGEST_SIZE = 160

def _transform_tool(tool):
    return {
        'id': tool['id'],
        'name': tool['tool_name'],
# [... 48 lines not shown ...]

    def get(self, ids):
        for id_ in ids:
            for tool_id in self._tools_per_id.get(id_, set()):
                key = (id_, tool_id)
                yield {
                    'id': id_,
                    'tool': _transform_tool(self._tools[tool_id]),
                    **self._data[key],
                }
    def get_all(self):
        yield from self.get(list(self._tools_per_id))
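Note (illustrative, not part of this changeset): `get_all` simply re-runs `get` over every id seen so far, and each yielded entry merges the stored row with the tool resolved through `_transform_tool`. A rough sketch, where the tool id, the content id and the payload keys are all made up for the example:
```
# hypothetical SubStorage instance `sub`, one row added for tool id 7:
next(sub.get([b'\x12' * 20]))
# -> {'id': b'\x12' * 20,
#     'tool': {'id': 7, 'name': 'file', ...},          # via _transform_tool()
#     'mimetype': 'text/plain', 'encoding': 'us-ascii'}  # the stored payload
```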
    def get_range(self, start, end, indexer_configuration_id, limit):
        """Retrieve data within range [start, end] bound by limit.

        Args:
            **start** (bytes): Starting identifier range (expected smaller
                than end)
            **end** (bytes): Ending identifier range (expected larger
                than start)

# [... 89 lines not shown ...]


class IndexerStorage:
    def __init__(self):
        self._tools = {}
        self._mimetypes = SubStorage(self._tools)
        self._languages = SubStorage(self._tools)
        self._content_ctags = SubStorage(self._tools)
        self._licenses = SubStorage(self._tools)
        self._content_metadata = SubStorage(self._tools)
        self._revision_metadata = SubStorage(self._tools)
        self._origin_intrinsic_metadata = SubStorage(self._tools)
    def content_mimetype_missing(self, mimetypes):
        """Generate mimetypes missing from storage.

        Args:
            mimetypes (iterable): iterable of dict with keys:

                - **id** (bytes): sha1 identifier
    # [... 369 lines not shown ...]

    def revision_metadata_add(self, metadata, conflict_update=False):
        """[...]
            conflict_update: Flag to determine if we want to overwrite (true)
                or skip duplicates (false, the default)

        """
        if not all(isinstance(x['id'], bytes) for x in metadata):
            raise TypeError('identifiers must be bytes.')
        self._revision_metadata.add(metadata, conflict_update)
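Illustrative usage, not part of the changeset; the metadata field names and `tool_id` below are assumptions:
```
storage.revision_metadata_add([{
    'id': b'\x03' * 20,                    # revision id: must be bytes
    'translated_metadata': {'name': 'example-project'},   # assumed field name
    'indexer_configuration_id': tool_id,
}])
# A textual identifier is rejected up front, before anything is stored:
# storage.revision_metadata_add([{'id': '03' * 20}])  -> TypeError
```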
    def origin_intrinsic_metadata_get(self, ids):
        """Retrieve origin metadata per id.

        Args:
            ids (iterable): origin identifiers

        Yields:
            list: dictionaries with the following keys:

                - **origin_id** (int)
                - **translated_metadata** (str): associated metadata
                - **tool** (dict): tool used to compute metadata

        """
        for item in self._origin_intrinsic_metadata.get(ids):
            item['origin_id'] = item.pop('id')
            yield item
    def origin_intrinsic_metadata_add(self, metadata,
                                      conflict_update=False):
        """Add origin metadata not present in storage.

        Args:
            metadata (iterable): dictionaries with keys:

                - **origin_id**: origin identifier
                - **from_revision**: sha1 id of the revision used to generate
                  these metadata.
                - **metadata**: arbitrary dict
                - **indexer_configuration_id**: tool used to compute metadata

            conflict_update: Flag to determine if we want to overwrite (true)
                or skip duplicates (false, the default)

        """
        for item in metadata:
            item = item.copy()
            item['id'] = item.pop('origin_id')
            self._origin_intrinsic_metadata.add([item], conflict_update)
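Sketch of the intended round trip (illustrative values only, not part of the changeset):
```
storage.origin_intrinsic_metadata_add([{
    'origin_id': 42,
    'from_revision': b'\x03' * 20,
    'metadata': {'name': 'example-project', 'license': 'GPL-3.0'},
    'indexer_configuration_id': tool_id,
}])
results = list(storage.origin_intrinsic_metadata_get([42]))
# Each result mirrors what was added, with 'origin_id' restored from the
# internal 'id' key and the tool dict resolved from its configuration id.
```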
    def origin_intrinsic_metadata_search_fulltext(
            self, conjunction, limit=100):
        """Returns the list of origins whose metadata contain all the terms.

        Args:
            conjunction (List[str]): List of terms to be searched for.
            limit (int): The maximum number of results to return

        Yields:
            list: dictionaries with the following keys:

                - **origin_id** (int)
                - **metadata** (str): associated metadata
                - **tool** (dict): tool used to compute metadata

        """
        # A very crude fulltext search implementation, but that's enough
        # to work on English metadata
        tokens_re = re.compile('[a-zA-Z0-9]+')
        search_tokens = list(itertools.chain(
            *map(tokens_re.findall, conjunction)))

        def rank(data):
            # Tokenize the metadata
            text = json.dumps(data['metadata'])
            text_tokens = tokens_re.findall(text)
            text_token_occurences = Counter(text_tokens)

            # Count the number of occurrences of search tokens in the text
            score = 0
            for search_token in search_tokens:
                if text_token_occurences[search_token] == 0:
                    # Search token is not in the text.
                    return 0
                score += text_token_occurences[search_token]

            # Normalize according to the text's length
            return score / math.log(len(text_tokens))
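For intuition (illustrative numbers, not part of the changeset): the score is the total number of hits across all search terms, normalized by the log of the document length, and a single missing term disqualifies the document:
```
# two search terms, found 3 + 1 times in a 100-token JSON document:
(3 + 1) / math.log(100)    # ~= 0.87; if either term were absent: score 0
```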
        results = [(rank(data), data)
                   for data in self._origin_intrinsic_metadata.get_all()]
        results = [(rank_, data) for (rank_, data) in results if rank_ > 0]
        results.sort(key=operator.itemgetter(0),  # Don't try to order 'data'
                     reverse=True)

        for (rank_, result) in results[:limit]:
ardumont: why not merge those 2 lines?
```
for (_, data) in results[:limit]:
```
            result = result.copy()
            result['origin_id'] = result.pop('id')
            yield result
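Illustrative call, not part of the changeset; it assumes data like the round-trip example above was added, and note that matching is case-sensitive with this tokenizer:
```
hits = list(storage.origin_intrinsic_metadata_search_fulltext(
    ['GPL', 'example'], limit=10))
# Best-ranked origins come first; an origin whose metadata lacks either
# term is not returned at all.
```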
    def indexer_configuration_add(self, tools):
        """Add new tools to the storage.

        Args:
            tools ([dict]): List of dictionaries representing the tools to
                insert in the db. Each dictionary has the following keys:

                - **tool_name** (str): tool's name
    # [... 40 lines not shown ...]