Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/in_memory.py
# Copyright (C) 2015-2020 The Software Heritage developers | # Copyright (C) 2015-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import re | import re | ||||
import bisect | import bisect | ||||
import dateutil | import dateutil | ||||
import collections | import collections | ||||
import copy | import copy | ||||
import datetime | import datetime | ||||
import itertools | import itertools | ||||
import random | import random | ||||
from collections import defaultdict | from collections import defaultdict | ||||
from datetime import timedelta | from datetime import timedelta | ||||
from typing import Any, Dict, Iterable, List, Optional, Union | from typing import ( | ||||
Any, | |||||
Callable, | |||||
Dict, | |||||
Generic, | |||||
Iterable, | |||||
Iterator, | |||||
List, | |||||
Optional, | |||||
Tuple, | |||||
TypeVar, | |||||
Union, | |||||
) | |||||
import attr | import attr | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
BaseContent, | BaseContent, | ||||
Content, | Content, | ||||
SkippedContent, | SkippedContent, | ||||
Directory, | Directory, | ||||
Show All 14 Lines | |||||
from .converters import origin_url_to_sha1 | from .converters import origin_url_to_sha1 | ||||
from .utils import get_partition_bounds_bytes | from .utils import get_partition_bounds_bytes | ||||
from .writer import JournalWriter | from .writer import JournalWriter | ||||
# Max block size of contents to return | # Max block size of contents to return | ||||
BULK_BLOCK_CONTENT_LEN_MAX = 10000 | BULK_BLOCK_CONTENT_LEN_MAX = 10000 | ||||
SortedListItem = TypeVar("SortedListItem")
SortedListKey = TypeVar("SortedListKey")


class SortedList(collections.UserList, Generic[SortedListKey, SortedListItem]):
    """A list of items kept sorted by a key computed from each item.

    Ordering only ever compares keys, never the items themselves, so items
    do not need to be orderable (or even comparable). Items whose keys are
    equal keep their insertion order.

    Iterating over the list yields the items. Other operations inherited
    from ``UserList`` (indexing, membership tests, ...) act on the
    underlying ``(key, item)`` pairs stored in ``data``.

    Args:
        data: initial items, in any order; ``None`` means empty.
        key: function computing the sort key of an item; defaults to the
            item itself.
    """

    # Stored as (key, item) pairs rather than bare items, so that searches
    # can bisect on the precomputed keys.
    data: List[Tuple[SortedListKey, SortedListItem]]

    # https://github.com/python/mypy/issues/708
    # key: Callable[[SortedListItem], SortedListKey]

    def __init__(
        self,
        data: Optional[Iterable[SortedListItem]] = None,
        key: Optional[Callable[[SortedListItem], SortedListKey]] = None,
    ):
        if key is None:

            def key(item):
                return item

        assert key is not None  # for mypy
        items = () if data is None else data
        # Sort on the key alone: sorting the bare tuples would fall back to
        # comparing the items whenever two keys are equal.
        pairs = sorted(((key(x), x) for x in items), key=lambda pair: pair[0])
        super().__init__(pairs)
        self.key: Callable[[SortedListItem], SortedListKey] = key

    def add(self, item: SortedListItem) -> None:
        """Insert `item` at its sorted position, after any items whose key
        is equal to its own."""
        # Typed as Any because SortedListKey is an unbounded TypeVar, which
        # says nothing about supporting `<`.
        k: Any = self.key(item)
        # Binary search for the first entry whose key is strictly greater
        # than `k`, comparing keys only.
        lo, hi = 0, len(self.data)
        while lo < hi:
            mid = (lo + hi) // 2
            if k < self.data[mid][0]:
                hi = mid
            else:
                lo = mid + 1
        self.data.insert(lo, (k, item))

    def __iter__(self) -> Iterator[SortedListItem]:
        """Yield the items (without their keys) in ascending key order."""
        for (_, item) in self.data:
            yield item

    def iter_from(self, start_key: SortedListKey) -> Iterator[SortedListItem]:
        """Returns an iterator over all the elements whose key is greater
        or equal to `start_key`.

        (This is an efficient equivalent to:
        `(x for x in L if key(x) >= start_key)`)
        """
        # The 1-tuple `(start_key,)` sorts before every `(start_key, item)`
        # pair, so bisect_left lands on the first entry with that key
        # without ever comparing items.
        from_index = bisect.bisect_left(self.data, (start_key,))
        for (_, item) in itertools.islice(self.data, from_index, None):
            yield item
class InMemoryStorage: | class InMemoryStorage: | ||||
def __init__(self, journal_writer=None): | def __init__(self, journal_writer=None): | ||||
self.reset() | self.reset() | ||||
self.journal_writer = JournalWriter(journal_writer) | self.journal_writer = JournalWriter(journal_writer) | ||||
def reset(self): | def reset(self): | ||||
self._contents = {} | self._contents = {} | ||||
Show All 9 Lines | def reset(self): | ||||
self._origins_by_sha1 = {} | self._origins_by_sha1 = {} | ||||
self._origin_visits = {} | self._origin_visits = {} | ||||
self._persons = [] | self._persons = [] | ||||
self._origin_metadata = defaultdict(list) | self._origin_metadata = defaultdict(list) | ||||
self._tools = {} | self._tools = {} | ||||
self._metadata_providers = {} | self._metadata_providers = {} | ||||
self._objects = defaultdict(list) | self._objects = defaultdict(list) | ||||
# ideally we would want a skip list for both fast inserts and searches | self._sorted_sha1s = SortedList[bytes, bytes]() | ||||
Not Done Inline Actions You can also write: self._sorted_sha1s = SortedList[bytes, bytes]() anlambert: You can also write:
```lang=python
self._sorted_sha1s = SortedList[bytes, bytes]()
``` | |||||
Done Inline ActionsTIL, thanks vlorentz: TIL, thanks | |||||
self._sorted_sha1s = [] | |||||
self.objstorage = ObjStorage({"cls": "memory", "args": {}}) | self.objstorage = ObjStorage({"cls": "memory", "args": {}}) | ||||
def check_config(self, *, check_write): | def check_config(self, *, check_write): | ||||
return True | return True | ||||
def _content_add(self, contents: Iterable[Content], with_data: bool) -> Dict: | def _content_add(self, contents: Iterable[Content], with_data: bool) -> Dict: | ||||
self.journal_writer.content_add(contents) | self.journal_writer.content_add(contents) | ||||
Show All 23 Lines | def _content_add(self, contents: Iterable[Content], with_data: bool) -> Dict: | ||||
# Add the new colliding content | # Add the new colliding content | ||||
colliding_content_hashes.append(content.hashes()) | colliding_content_hashes.append(content.hashes()) | ||||
raise HashCollision(algorithm, hash_, colliding_content_hashes) | raise HashCollision(algorithm, hash_, colliding_content_hashes) | ||||
for algorithm in DEFAULT_ALGORITHMS: | for algorithm in DEFAULT_ALGORITHMS: | ||||
hash_ = content.get_hash(algorithm) | hash_ = content.get_hash(algorithm) | ||||
self._content_indexes[algorithm][hash_].add(key) | self._content_indexes[algorithm][hash_].add(key) | ||||
self._objects[content.sha1_git].append(("content", content.sha1)) | self._objects[content.sha1_git].append(("content", content.sha1)) | ||||
self._contents[key] = content | self._contents[key] = content | ||||
bisect.insort(self._sorted_sha1s, content.sha1) | self._sorted_sha1s.add(content.sha1) | ||||
self._contents[key] = attr.evolve(self._contents[key], data=None) | self._contents[key] = attr.evolve(self._contents[key], data=None) | ||||
content_add += 1 | content_add += 1 | ||||
summary = { | summary = { | ||||
"content:add": content_add, | "content:add": content_add, | ||||
} | } | ||||
if with_data: | if with_data: | ||||
summary["content:add:bytes"] = content_add_bytes | summary["content:add:bytes"] = content_add_bytes | ||||
Show All 35 Lines | def content_get(self, content): | ||||
raise StorageArgumentException( | raise StorageArgumentException( | ||||
"Sending at most %s contents." % BULK_BLOCK_CONTENT_LEN_MAX | "Sending at most %s contents." % BULK_BLOCK_CONTENT_LEN_MAX | ||||
) | ) | ||||
yield from self.objstorage.content_get(content) | yield from self.objstorage.content_get(content) | ||||
def content_get_range(self, start, end, limit=1000): | def content_get_range(self, start, end, limit=1000): | ||||
if limit is None: | if limit is None: | ||||
raise StorageArgumentException("limit should not be None") | raise StorageArgumentException("limit should not be None") | ||||
from_index = bisect.bisect_left(self._sorted_sha1s, start) | |||||
sha1s = itertools.islice(self._sorted_sha1s, from_index, None) | |||||
sha1s = ( | sha1s = ( | ||||
(sha1, content_key) | (sha1, content_key) | ||||
for sha1 in sha1s | for sha1 in self._sorted_sha1s.iter_from(start) | ||||
for content_key in self._content_indexes["sha1"][sha1] | for content_key in self._content_indexes["sha1"][sha1] | ||||
) | ) | ||||
matched = [] | matched = [] | ||||
next_content = None | next_content = None | ||||
for sha1, key in sha1s: | for sha1, key in sha1s: | ||||
if sha1 > end: | if sha1 > end: | ||||
break | break | ||||
if len(matched) >= limit: | if len(matched) >= limit: | ||||
▲ Show 20 Lines • Show All 849 Lines • Show Last 20 Lines |
why not Tuple[SortedListKey, ...]?