Differential D2987 Diff 10845 swh/storage/in_memory.py

Changeset View

Standalone View

swh/storage/in_memory.py

# Copyright (C) 2015-2020 The Software Heritage developers		# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

import re		import re
import bisect		import bisect
import dateutil		import dateutil
import collections		import collections
import copy		import copy
import datetime		import datetime
import itertools		import itertools
import random		import random

from collections import defaultdict		from collections import defaultdict
from datetime import timedelta		from datetime import timedelta
from typing import Any, Dict, Iterable, List, Optional, Union		from typing import (
		Any,
		Callable,
		Dict,
		Generic,
		Iterable,
		Iterator,
		List,
		Optional,
		Tuple,
		TypeVar,
		Union,
		)

import attr		import attr

from swh.model.model import (		from swh.model.model import (
BaseContent,		BaseContent,
Content,		Content,
SkippedContent,		SkippedContent,
Directory,		Directory,
Show All 14 Lines
from .converters import origin_url_to_sha1		from .converters import origin_url_to_sha1
from .utils import get_partition_bounds_bytes		from .utils import get_partition_bounds_bytes
from .writer import JournalWriter		from .writer import JournalWriter

# Max block size of contents to return		# Max block size of contents to return
BULK_BLOCK_CONTENT_LEN_MAX = 10000		BULK_BLOCK_CONTENT_LEN_MAX = 10000


		SortedListItem = TypeVar("SortedListItem")
		SortedListKey = TypeVar("SortedListKey")


		class SortedList(collections.UserList, Generic[SortedListKey, SortedListItem]):
		ardumontUnsubmitted Not Done Inline Actions why not Tuple[SortedListKey, ...]? ardumont: why not Tuple[SortedListKey, ...]?
		ardumontUnsubmitted Not Done Inline Actions i'll be explicit to avoid confusion, i meant why not `List[Tuple[SortedListKey, SortedListItem]]`? ardumont: i'll be explicit to avoid confusion, i meant why not `List[Tuple[SortedListKey…
		vlorentzAuthorUnsubmitted Done Inline Actions oh yes, indeed vlorentz: oh yes, indeed
		data: List[Tuple[SortedListKey, SortedListItem]]

		# https://github.com/python/mypy/issues/708
		# key: Callable[[SortedListItem], SortedListKey]

		def __init__(
		self,
		data: List[SortedListItem] = None,
		key: Optional[Callable[[SortedListItem], SortedListKey]] = None,
		):
		if key is None:

		def key(item):
		return item

		assert key is not None # for mypy
		super().__init__(sorted((key(x), x) for x in data or []))

		self.key: Callable[[SortedListItem], SortedListKey] = key

		def add(self, item: SortedListItem):
		k = self.key(item)
		bisect.insort(self.data, (k, item))

		def __iter__(self) -> Iterator[SortedListItem]:
		for (k, item) in self.data:
		yield item

		def iter_from(self, start_key: SortedListKey) -> Iterator[SortedListItem]:
		"""Returns an iterator over all the elements whose key is greater
		or equal to `start_key`.
		(This is an efficient equivalent to:
		`(x for x in L if key(x) >= start_key)`)
		"""
		from_index = bisect.bisect_left(self.data, (start_key,))
		for (k, item) in itertools.islice(self.data, from_index, None):
		yield item


class InMemoryStorage:		class InMemoryStorage:
def __init__(self, journal_writer=None):		def __init__(self, journal_writer=None):

self.reset()		self.reset()
self.journal_writer = JournalWriter(journal_writer)		self.journal_writer = JournalWriter(journal_writer)

def reset(self):		def reset(self):
self._contents = {}		self._contents = {}
Show All 9 Lines	def reset(self):
self._origins_by_sha1 = {}		self._origins_by_sha1 = {}
self._origin_visits = {}		self._origin_visits = {}
self._persons = []		self._persons = []
self._origin_metadata = defaultdict(list)		self._origin_metadata = defaultdict(list)
self._tools = {}		self._tools = {}
self._metadata_providers = {}		self._metadata_providers = {}
self._objects = defaultdict(list)		self._objects = defaultdict(list)

# ideally we would want a skip list for both fast inserts and searches		self._sorted_sha1s = SortedList[bytes, bytes]()
		anlambertUnsubmitted Not Done Inline Actions You can also write: self._sorted_sha1s = SortedList[bytes, bytes]() anlambert: You can also write: ```lang=python self._sorted_sha1s = SortedList[bytes, bytes]() ```
		vlorentzAuthorUnsubmitted Done Inline Actions TIL, thanks vlorentz: TIL, thanks
self._sorted_sha1s = []

self.objstorage = ObjStorage({"cls": "memory", "args": {}})		self.objstorage = ObjStorage({"cls": "memory", "args": {}})

def check_config(self, *, check_write):		def check_config(self, *, check_write):
return True		return True

def _content_add(self, contents: Iterable[Content], with_data: bool) -> Dict:		def _content_add(self, contents: Iterable[Content], with_data: bool) -> Dict:
self.journal_writer.content_add(contents)		self.journal_writer.content_add(contents)
Show All 23 Lines	def _content_add(self, contents: Iterable[Content], with_data: bool) -> Dict:
# Add the new colliding content		# Add the new colliding content
colliding_content_hashes.append(content.hashes())		colliding_content_hashes.append(content.hashes())
raise HashCollision(algorithm, hash_, colliding_content_hashes)		raise HashCollision(algorithm, hash_, colliding_content_hashes)
for algorithm in DEFAULT_ALGORITHMS:		for algorithm in DEFAULT_ALGORITHMS:
hash_ = content.get_hash(algorithm)		hash_ = content.get_hash(algorithm)
self._content_indexes[algorithm][hash_].add(key)		self._content_indexes[algorithm][hash_].add(key)
self._objects[content.sha1_git].append(("content", content.sha1))		self._objects[content.sha1_git].append(("content", content.sha1))
self._contents[key] = content		self._contents[key] = content
bisect.insort(self._sorted_sha1s, content.sha1)		self._sorted_sha1s.add(content.sha1)
self._contents[key] = attr.evolve(self._contents[key], data=None)		self._contents[key] = attr.evolve(self._contents[key], data=None)
content_add += 1		content_add += 1

summary = {		summary = {
"content:add": content_add,		"content:add": content_add,
}		}
if with_data:		if with_data:
summary["content:add:bytes"] = content_add_bytes		summary["content:add:bytes"] = content_add_bytes
Show All 35 Lines	def content_get(self, content):
raise StorageArgumentException(		raise StorageArgumentException(
"Sending at most %s contents." % BULK_BLOCK_CONTENT_LEN_MAX		"Sending at most %s contents." % BULK_BLOCK_CONTENT_LEN_MAX
)		)
yield from self.objstorage.content_get(content)		yield from self.objstorage.content_get(content)

def content_get_range(self, start, end, limit=1000):		def content_get_range(self, start, end, limit=1000):
if limit is None:		if limit is None:
raise StorageArgumentException("limit should not be None")		raise StorageArgumentException("limit should not be None")
from_index = bisect.bisect_left(self._sorted_sha1s, start)
sha1s = itertools.islice(self._sorted_sha1s, from_index, None)
sha1s = (		sha1s = (
(sha1, content_key)		(sha1, content_key)
for sha1 in sha1s		for sha1 in self._sorted_sha1s.iter_from(start)
for content_key in self._content_indexes["sha1"][sha1]		for content_key in self._content_indexes["sha1"][sha1]
)		)
matched = []		matched = []
next_content = None		next_content = None
for sha1, key in sha1s:		for sha1, key in sha1s:
if sha1 > end:		if sha1 > end:
break		break
if len(matched) >= limit:		if len(matched) >= limit:
▲ Show 20 Lines • Show All 849 Lines • Show Last 20 Lines