Differential D695 Diff 2241 swh/storage/in_memory.py

Changeset View

Standalone View

swh/storage/in_memory.py

Show All 11 Lines
import datetime		import datetime
import itertools		import itertools
import random		import random
import warnings		import warnings

from swh.model.hashutil import DEFAULT_ALGORITHMS		from swh.model.hashutil import DEFAULT_ALGORITHMS
from swh.model.identifiers import normalize_timestamp		from swh.model.identifiers import normalize_timestamp

		# Max block size of contents to return
		BULK_BLOCK_CONTENT_LEN_MAX = 10000


def now():		def now():
return datetime.datetime.now(tz=datetime.timezone.utc)		return datetime.datetime.now(tz=datetime.timezone.utc)


OriginVisitKey = collections.namedtuple('OriginVisitKey', 'origin date')		OriginVisitKey = collections.namedtuple('OriginVisitKey', 'origin date')


Show All 9 Lines	def __init__(self):
self._snapshots = {}		self._snapshots = {}
self._origins = {}		self._origins = {}
self._origin_visits = {}		self._origin_visits = {}
self._origin_metadata = defaultdict(list)		self._origin_metadata = defaultdict(list)
self._tools = {}		self._tools = {}
self._metadata_providers = {}		self._metadata_providers = {}
self._objects = defaultdict(list)		self._objects = defaultdict(list)

		# ideally we would want a skip list for both fast inserts and searches
		self._sorted_sha1s = []

def check_config(self, *, check_write):		def check_config(self, *, check_write):
"""Check that the storage is configured and ready to go."""		"""Check that the storage is configured and ready to go."""
return True		return True

def content_add(self, contents):		def content_add(self, contents):
"""Add content blobs to the storage		"""Add content blobs to the storage

Args:		Args:
Show All 20 Lines	def content_add(self, contents):
if content[algorithm] in self._content_indexes[algorithm]:		if content[algorithm] in self._content_indexes[algorithm]:
from . import HashCollision		from . import HashCollision
raise HashCollision(algorithm, content[algorithm], key)		raise HashCollision(algorithm, content[algorithm], key)
for algorithm in DEFAULT_ALGORITHMS:		for algorithm in DEFAULT_ALGORITHMS:
self._content_indexes[algorithm][content[algorithm]].add(key)		self._content_indexes[algorithm][content[algorithm]].add(key)
self._objects[content['sha1_git']].append(		self._objects[content['sha1_git']].append(
('content', content['sha1']))		('content', content['sha1']))
self._contents[key] = copy.deepcopy(content)		self._contents[key] = copy.deepcopy(content)
self._contents[key]['ctime'] = now()		self._contents[key]['ctime'] = now()
		ardumontUnsubmitted Not Done Inline Actions Why? A comment would help. ardumont: Why? A comment would help.
		vlorentzAuthorUnsubmitted Done Inline Actions Why what? It just inserts the new sha1 in the list vlorentz: Why what? It just inserts the new sha1 in the list
		ardumontUnsubmitted Not Done Inline Actions You change something in the function content_add. Your diff is about content_get and content_get_range. It's not immediately apparent what this does. Now it's clearer ;) ardumont: You change something in the function content_add. Your diff is about content_get and…
		vlorentzAuthorUnsubmitted Done Inline Actions Yeah, I added `self._sorted_sha1` to make searches run in logtime. vlorentz: Yeah, I added `self._sorted_sha1` to make searches run in logtime.
		bisect.insort(self._sorted_sha1s, content['sha1'])
if self._contents[key]['status'] == 'visible':		if self._contents[key]['status'] == 'visible':
self._contents_data[key] = self._contents[key].pop('data')		self._contents_data[key] = self._contents[key].pop('data')

		def content_get(self, ids):
		"""Retrieve in bulk contents and their data.

		This function may yield more blobs than provided sha1 identifiers,
		in case they collide.

		Args:
		content: iterables of sha1

		Yields:
		ardumontUnsubmitted Done Inline Actions Dict[bytes, bytes]? ardumont: Dict[bytes, bytes]?
		ardumontUnsubmitted Not Done Inline Actions Why is the key str? What did i miss? ardumont: Why is the key str? What did i miss?
		vlorentzAuthorUnsubmitted Done Inline Actions Keys are `'sha1'` and `'data'`. vlorentz: Keys are `'sha1'` and `'data'`.
		ardumontUnsubmitted Not Done Inline Actions right! ardumont: right!
		Dict[str, bytes]: Generates streams of contents as dict with their
		raw data:
		ardumontUnsubmitted Done Inline Actions sha1 (bytes): content id ardumont: sha1 (bytes): content id

		ardumontUnsubmitted Done Inline Actions data (bytes): content's raw data ardumont: data (bytes): content's raw data
		- sha1 (bytes): content id
		- data (bytes): content's raw data

		Raises:
		ValueError in case of too much contents are required.
		cf. BULK_BLOCK_CONTENT_LEN_MAX

		ardumontUnsubmitted Not Done Inline Actions remove irrelevant fixme. You mean irrelevant for the in-memory usage (test) i guess. Explaining why i ask. For the initial storage, as we starting developing paginated endpoints, i'm wondering if it would not be better to align the endpoint as well. If it's possible that is (i'm not sure we enforce an order for example). That would be out of scope for that diff ;) The question might be worth discussing (the conclusion could also be to drop the conditional altogether ;). ardumont: > remove irrelevant fixme. You mean irrelevant for the in-memory usage (test) i guess.
		vlorentzAuthorUnsubmitted Done Inline Actions nvm, I misunderstood that comment vlorentz: nvm, I misunderstood that comment
		ardumontUnsubmitted Not Done Inline Actions Since you touch the comment, you can now fix it to make it clearer then ;) ardumont: Since you touch the comment, you can now fix it to make it clearer then ;)
		"""
		# FIXME: Make this method support slicing the `data`.
		if len(ids) > BULK_BLOCK_CONTENT_LEN_MAX:
		raise ValueError(
		"Sending at most %s contents." % BULK_BLOCK_CONTENT_LEN_MAX)
		for id_ in ids:
		for key in self._content_indexes['sha1'][id_]:
		yield {
		'sha1': id_,
		'data': self._contents_data[key],
		}

		def content_get_range(self, start, end, limit=1000, db=None, cur=None):
		"""Retrieve contents within range [start, end] bound by limit.
		ardumontUnsubmitted Done Inline Actions Can you please clarify what you mean by 'enforced with multiplicity'? ardumont: Can you please clarify what you mean by 'enforced with multiplicity'?

		Note that this function may return more than one blob per hash. The
		ardumontUnsubmitted Not Done Inline Actions Can't we skip those? Pushing filtering on the endpoint consumer feels bad to me. (and if in the main storage there is such a case, that's an out of scope issue, an issue nonetheless). ardumont: Can't we skip those? Pushing filtering on the endpoint consumer feels bad to me. (and if in…
		vlorentzAuthorUnsubmitted Done Inline Actions That sentence was wrong, `content_get_range` returns no `None` item. vlorentz: That sentence was wrong, `content_get_range` returns no `None` item.
		limit is enforced with multiplicity (ie. two blobs with the same hash
		will count twice toward the limit).

		Args:
		start (bytes): Starting identifier range (expected smaller
		than end)
		end (bytes): Ending identifier range (expected larger
		than start)
		limit (int): Limit result (default to 1000)

		Returns:
		a dict with keys:
		- contents [dict]: iterable of contents in between the range.
		- next (bytes): There remains content in the range
		starting from this next sha1

		"""
		if limit is None:
		raise ValueError('Development error: limit should not be None')
		from_index = bisect.bisect_left(self._sorted_sha1s, start)
		sha1s = itertools.islice(self._sorted_sha1s, from_index, None)
		sha1s = ((sha1, content_key)
		for sha1 in sha1s
		for content_key in self._content_indexes['sha1'][sha1])
		matched = []
		for sha1, key in sha1s:
		if sha1 > end:
		break
		ardumontUnsubmitted Done Inline Actions I never said it before. We are growing inconsistent with the formatting. That seems like nothing but that can end up doing some irrelevant noise in commits. So this should be avoided. Naturally, that's up for discussion but in the mean time, that'd be great to align to current conventions. So. in this case, ending character aligned at the beginning of the initial keyword. return { 'contents': matched, 'next': sha1, } As per tool, that'd be great not to bother with this and have some tool doing that for us. @zack mentioned black 'recently' but we never (re)acted on that matter... ardumont: I never said it before. We are growing inconsistent with the formatting. That seems like…
		if len(matched) >= limit:
		return {
		'contents': matched,
		'next': sha1,
		}
		matched.append({
		'data': self._contents_data[key],
		**self._contents[key],
		douarddaUnsubmitted Not Done Inline Actions Same as a previous comment, prefer a single return statement when possible/not too obfuscating. Here, something like: from itertools import chain, islice next = None, contents = [] sha1s = islice(self._sorted_sha1s, 0, from_index) it_contents = (((sha1, v) for v in self._content_indexes['sha1'][sha1]) for sha1 in sha1s) for sha1, key in islice(chain.from_iterable(it_contents), limit): if sha1 > end: break contents.append({ 'data': self._contents_data[key], self._contents[key], }) else: next = sha1 return dict(next=next, contents=contents) douardda:** Same as a previous comment, prefer a single return statement when possible/not too obfuscating.
		vlorentzAuthorUnsubmitted Done Inline Actions You can't do that, because it assigns the last returned value to `next`, instead of the first non-returned one. As a consequence, `UnboundLocalError` when there is no result because the `for` loop does not run. So the `for` loop needs to be rewritten like this: next = None for sha1, key in chain.from_iterable(it_contents): if sha1 > end: break if len(contents) >= limit: next = sha1 break contents.append({ 'data': self._contents_data[key], self._contents[key], }) return dict(next=next, contents=contents) Which I find slightly less readable than this: for sha1, key in chain.from_iterable(it_contents): if sha1 > end: break if len(contents) >= limit: return dict(next=sha1, contents=contents) contents.append({ 'data': self._contents_data[key], self._contents[key], }) return dict(next=None, contents=contents) because the former uses an extra `next` variable to do the same thing (with a "break then immediately return" instead of just a return). vlorentz: You can't do that, because it assigns the last returned value to `next`, instead of the first…
		ardumontUnsubmitted Not Done Inline Actions Yes, i was wondering whether what david proposed would work. Pick either one of the last one you proposed. They sound both fine to me. ardumont: Yes, i was wondering whether what david proposed would work. Pick either one of the last one…
		douarddaUnsubmitted Not Done Inline Actions Indeed I changed the behavior in my snippet, but a small fix could be: for sha1, key in islice(chain.from_iterable(it_contents), limit+1): #[...] return dict(next=next, contents=contents[:limit]) would do the trick, but again, it's readability is debatable. Whatever, I still don't like your return "hidden" in the middle of the loop, but I won't fight. douardda: Indeed I changed the behavior in my snippet, but a small fix could be: ``` for sha1, key in…
		})
		return {
		'contents': matched,
		'next': None,
		}

def content_get_metadata(self, sha1s):		def content_get_metadata(self, sha1s):
"""Retrieve content metadata in bulk		"""Retrieve content metadata in bulk

Args:		Args:
content: iterable of content identifiers (sha1)		content: iterable of content identifiers (sha1)

Returns:		Returns:
an iterable with content metadata corresponding to the given ids		an iterable with content metadata corresponding to the given ids
▲ Show 20 Lines • Show All 1,017 Lines • Show Last 20 Lines