Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/storage/in_memory.py
# Copyright (C) 2018-2020 The Software Heritage developers | # Copyright (C) 2018-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import Counter, defaultdict | from collections import Counter, defaultdict | ||||
import itertools | import itertools | ||||
import json | import json | ||||
import math | import math | ||||
import operator | import operator | ||||
import re | import re | ||||
from typing import ( | from typing import ( | ||||
Any, | Any, | ||||
Dict, | Dict, | ||||
Generic, | Generic, | ||||
Iterable, | Iterable, | ||||
Iterator, | |||||
List, | List, | ||||
Optional, | Optional, | ||||
Set, | Set, | ||||
Tuple, | Tuple, | ||||
Type, | Type, | ||||
TypeVar, | TypeVar, | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Lines | def __init__(self, row_class: Type[TValue], tools): | ||||
self._data = defaultdict(dict) | self._data = defaultdict(dict) | ||||
self._tools_per_id = defaultdict(set) | self._tools_per_id = defaultdict(set) | ||||
def _key_from_dict(self, d) -> Tuple: | def _key_from_dict(self, d) -> Tuple: | ||||
"""Like the global _key_from_dict, but filters out dict keys that don't | """Like the global _key_from_dict, but filters out dict keys that don't | ||||
belong in the unique key.""" | belong in the unique key.""" | ||||
return _key_from_dict({k: d[k] for k in self.row_class.UNIQUE_KEY_FIELDS}) | return _key_from_dict({k: d[k] for k in self.row_class.UNIQUE_KEY_FIELDS}) | ||||
def missing(self, keys: Iterable[Dict]) -> Iterator[Sha1]: | def missing(self, keys: Iterable[Dict]) -> List[Sha1]: | ||||
"""List data missing from storage. | """List data missing from storage. | ||||
Args: | Args: | ||||
data (iterable): dictionaries with keys: | data (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1 identifier | - **id** (bytes): sha1 identifier | ||||
- **indexer_configuration_id** (int): tool used to compute | - **indexer_configuration_id** (int): tool used to compute | ||||
the results | the results | ||||
Yields: | Yields: | ||||
missing sha1s | missing sha1s | ||||
""" | """ | ||||
results = [] | |||||
for key in keys: | for key in keys: | ||||
tool_id = key["indexer_configuration_id"] | tool_id = key["indexer_configuration_id"] | ||||
id_ = key["id"] | id_ = key["id"] | ||||
if tool_id not in self._tools_per_id.get(id_, set()): | if tool_id not in self._tools_per_id.get(id_, set()): | ||||
yield id_ | results.append(id_) | ||||
return results | |||||
def get(self, ids: Iterable[Sha1]) -> Iterator[TValue]: | def get(self, ids: Iterable[Sha1]) -> List[TValue]: | ||||
"""Retrieve data per id. | """Retrieve data per id. | ||||
Args: | Args: | ||||
ids (iterable): sha1 checksums | ids (iterable): sha1 checksums | ||||
Yields: | Yields: | ||||
dict: dictionaries with the following keys: | dict: dictionaries with the following keys: | ||||
- **id** (bytes) | - **id** (bytes) | ||||
- **tool** (dict): tool used to compute metadata | - **tool** (dict): tool used to compute metadata | ||||
- arbitrary data (as provided to `add`) | - arbitrary data (as provided to `add`) | ||||
""" | """ | ||||
results = [] | |||||
for id_ in ids: | for id_ in ids: | ||||
for entry in self._data[id_].values(): | for entry in self._data[id_].values(): | ||||
entry = entry.copy() | entry = entry.copy() | ||||
tool_id = entry.pop("indexer_configuration_id") | tool_id = entry.pop("indexer_configuration_id") | ||||
yield self.row_class( | results.append( | ||||
self.row_class( | |||||
id=id_, tool=_transform_tool(self._tools[tool_id]), **entry, | id=id_, tool=_transform_tool(self._tools[tool_id]), **entry, | ||||
) | ) | ||||
) | |||||
return results | |||||
def get_all(self) -> Iterator[TValue]: | def get_all(self) -> List[TValue]: | ||||
yield from self.get(self._sorted_ids) | return self.get(self._sorted_ids) | ||||
def get_partition( | def get_partition( | ||||
self, | self, | ||||
indexer_configuration_id: int, | indexer_configuration_id: int, | ||||
partition_id: int, | partition_id: int, | ||||
nb_partitions: int, | nb_partitions: int, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 1000, | limit: int = 1000, | ||||
▲ Show 20 Lines • Show All 113 Lines • ▼ Show 20 Lines | def __init__(self): | ||||
OriginIntrinsicMetadataRow, self._tools | OriginIntrinsicMetadataRow, self._tools | ||||
) | ) | ||||
def check_config(self, *, check_write): | def check_config(self, *, check_write): | ||||
return True | return True | ||||
def content_mimetype_missing( | def content_mimetype_missing( | ||||
self, mimetypes: Iterable[Dict] | self, mimetypes: Iterable[Dict] | ||||
) -> Iterable[Tuple[Sha1, int]]: | ) -> List[Tuple[Sha1, int]]: | ||||
yield from self._mimetypes.missing(mimetypes) | return self._mimetypes.missing(mimetypes) | ||||
def content_mimetype_get_partition( | def content_mimetype_get_partition( | ||||
self, | self, | ||||
indexer_configuration_id: int, | indexer_configuration_id: int, | ||||
partition_id: int, | partition_id: int, | ||||
nb_partitions: int, | nb_partitions: int, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 1000, | limit: int = 1000, | ||||
) -> PagedResult[Sha1]: | ) -> PagedResult[Sha1]: | ||||
return self._mimetypes.get_partition( | return self._mimetypes.get_partition( | ||||
indexer_configuration_id, partition_id, nb_partitions, page_token, limit | indexer_configuration_id, partition_id, nb_partitions, page_token, limit | ||||
) | ) | ||||
def content_mimetype_add( | def content_mimetype_add( | ||||
self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False | self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
added = self._mimetypes.add(mimetypes, conflict_update) | added = self._mimetypes.add(mimetypes, conflict_update) | ||||
return {"content_mimetype:add": added} | return {"content_mimetype:add": added} | ||||
def content_mimetype_get(self, ids: Iterable[Sha1]) -> Iterable[ContentMimetypeRow]: | def content_mimetype_get(self, ids: Iterable[Sha1]) -> List[ContentMimetypeRow]: | ||||
yield from self._mimetypes.get(ids) | return self._mimetypes.get(ids) | ||||
def content_language_missing(self, languages): | def content_language_missing(self, languages): | ||||
yield from self._languages.missing(languages) | return self._languages.missing(languages) | ||||
def content_language_get(self, ids): | def content_language_get(self, ids): | ||||
yield from (obj.to_dict() for obj in self._languages.get(ids)) | return [obj.to_dict() for obj in self._languages.get(ids)] | ||||
def content_language_add( | def content_language_add( | ||||
self, languages: List[Dict], conflict_update: bool = False | self, languages: List[Dict], conflict_update: bool = False | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
check_id_types(languages) | check_id_types(languages) | ||||
added = self._languages.add( | added = self._languages.add( | ||||
map(ContentLanguageRow.from_dict, languages), conflict_update | map(ContentLanguageRow.from_dict, languages), conflict_update | ||||
) | ) | ||||
return {"content_language:add": added} | return {"content_language:add": added} | ||||
def content_ctags_missing(self, ctags): | def content_ctags_missing(self, ctags): | ||||
yield from self._content_ctags.missing(ctags) | return self._content_ctags.missing(ctags) | ||||
def content_ctags_get(self, ids): | def content_ctags_get(self, ids): | ||||
results = [] | |||||
for item in self._content_ctags.get(ids): | for item in self._content_ctags.get(ids): | ||||
yield {"id": item.id, "tool": item.tool, **item.to_dict()} | results.append({"id": item.id, "tool": item.tool, **item.to_dict()}) | ||||
return results | |||||
def content_ctags_add( | def content_ctags_add( | ||||
self, ctags: List[Dict], conflict_update: bool = False | self, ctags: List[Dict], conflict_update: bool = False | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
check_id_types(ctags) | check_id_types(ctags) | ||||
added = self._content_ctags.add( | added = self._content_ctags.add( | ||||
map( | map( | ||||
ContentCtagsRow.from_dict, | ContentCtagsRow.from_dict, | ||||
itertools.chain.from_iterable(map(converters.ctags_to_db, ctags)), | itertools.chain.from_iterable(map(converters.ctags_to_db, ctags)), | ||||
), | ), | ||||
conflict_update, | conflict_update, | ||||
) | ) | ||||
return {"content_ctags:add": added} | return {"content_ctags:add": added} | ||||
def content_ctags_search(self, expression, limit=10, last_sha1=None): | def content_ctags_search(self, expression, limit=10, last_sha1=None): | ||||
nb_matches = 0 | nb_matches = 0 | ||||
items_per_id: Dict[Tuple[Sha1Git, ToolId], List[ContentCtagsRow]] = {} | items_per_id: Dict[Tuple[Sha1Git, ToolId], List[ContentCtagsRow]] = {} | ||||
for item in sorted(self._content_ctags.get_all()): | for item in sorted(self._content_ctags.get_all()): | ||||
if item.id <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): | if item.id <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): | ||||
continue | continue | ||||
items_per_id.setdefault( | items_per_id.setdefault( | ||||
(item.id, item.indexer_configuration_id), [] | (item.id, item.indexer_configuration_id), [] | ||||
).append(item) | ).append(item) | ||||
results = [] | |||||
for items in items_per_id.values(): | for items in items_per_id.values(): | ||||
ctags = [] | ctags = [] | ||||
for item in items: | for item in items: | ||||
if item.name != expression: | if item.name != expression: | ||||
continue | continue | ||||
nb_matches += 1 | nb_matches += 1 | ||||
if nb_matches > limit: | if nb_matches > limit: | ||||
break | break | ||||
item_dict = item.to_dict() | item_dict = item.to_dict() | ||||
id_ = item_dict.pop("id") | id_ = item_dict.pop("id") | ||||
tool = item_dict.pop("tool") | tool = item_dict.pop("tool") | ||||
ctags.append(item_dict) | ctags.append(item_dict) | ||||
if ctags: | if ctags: | ||||
for ctag in ctags: | for ctag in ctags: | ||||
yield {"id": id_, "tool": tool, **ctag} | results.append({"id": id_, "tool": tool, **ctag}) | ||||
return results | |||||
def content_fossology_license_get( | def content_fossology_license_get( | ||||
self, ids: Iterable[Sha1] | self, ids: Iterable[Sha1] | ||||
) -> Iterable[ContentLicenseRow]: | ) -> List[ContentLicenseRow]: | ||||
return self._licenses.get(ids) | return self._licenses.get(ids) | ||||
def content_fossology_license_add( | def content_fossology_license_add( | ||||
self, licenses: List[ContentLicenseRow], conflict_update: bool = False | self, licenses: List[ContentLicenseRow], conflict_update: bool = False | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
added = self._licenses.add(licenses, conflict_update) | added = self._licenses.add(licenses, conflict_update) | ||||
return {"content_fossology_license:add": added} | return {"content_fossology_license:add": added} | ||||
def content_fossology_license_get_partition( | def content_fossology_license_get_partition( | ||||
self, | self, | ||||
indexer_configuration_id: int, | indexer_configuration_id: int, | ||||
partition_id: int, | partition_id: int, | ||||
nb_partitions: int, | nb_partitions: int, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 1000, | limit: int = 1000, | ||||
) -> PagedResult[Sha1]: | ) -> PagedResult[Sha1]: | ||||
return self._licenses.get_partition( | return self._licenses.get_partition( | ||||
indexer_configuration_id, partition_id, nb_partitions, page_token, limit | indexer_configuration_id, partition_id, nb_partitions, page_token, limit | ||||
) | ) | ||||
def content_metadata_missing(self, metadata): | def content_metadata_missing(self, metadata): | ||||
yield from self._content_metadata.missing(metadata) | return self._content_metadata.missing(metadata) | ||||
def content_metadata_get(self, ids): | def content_metadata_get(self, ids): | ||||
yield from (obj.to_dict() for obj in self._content_metadata.get(ids)) | return [obj.to_dict() for obj in self._content_metadata.get(ids)] | ||||
def content_metadata_add( | def content_metadata_add( | ||||
self, metadata: List[Dict], conflict_update: bool = False | self, metadata: List[Dict], conflict_update: bool = False | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
check_id_types(metadata) | check_id_types(metadata) | ||||
added = self._content_metadata.add( | added = self._content_metadata.add( | ||||
map(ContentMetadataRow.from_dict, metadata), conflict_update | map(ContentMetadataRow.from_dict, metadata), conflict_update | ||||
) | ) | ||||
return {"content_metadata:add": added} | return {"content_metadata:add": added} | ||||
def revision_intrinsic_metadata_missing(self, metadata): | def revision_intrinsic_metadata_missing(self, metadata): | ||||
yield from self._revision_intrinsic_metadata.missing(metadata) | return self._revision_intrinsic_metadata.missing(metadata) | ||||
def revision_intrinsic_metadata_get(self, ids): | def revision_intrinsic_metadata_get(self, ids): | ||||
yield from (obj.to_dict() for obj in self._revision_intrinsic_metadata.get(ids)) | return [obj.to_dict() for obj in self._revision_intrinsic_metadata.get(ids)] | ||||
def revision_intrinsic_metadata_add( | def revision_intrinsic_metadata_add( | ||||
self, metadata: List[Dict], conflict_update: bool = False | self, metadata: List[Dict], conflict_update: bool = False | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
check_id_types(metadata) | check_id_types(metadata) | ||||
added = self._revision_intrinsic_metadata.add( | added = self._revision_intrinsic_metadata.add( | ||||
map(RevisionIntrinsicMetadataRow.from_dict, metadata), conflict_update | map(RevisionIntrinsicMetadataRow.from_dict, metadata), conflict_update | ||||
) | ) | ||||
return {"revision_intrinsic_metadata:add": added} | return {"revision_intrinsic_metadata:add": added} | ||||
def revision_intrinsic_metadata_delete(self, entries: List[Dict]) -> Dict: | def revision_intrinsic_metadata_delete(self, entries: List[Dict]) -> Dict: | ||||
deleted = self._revision_intrinsic_metadata.delete(entries) | deleted = self._revision_intrinsic_metadata.delete(entries) | ||||
return {"revision_intrinsic_metadata:del": deleted} | return {"revision_intrinsic_metadata:del": deleted} | ||||
def origin_intrinsic_metadata_get(self, ids): | def origin_intrinsic_metadata_get(self, ids): | ||||
yield from (obj.to_dict() for obj in self._origin_intrinsic_metadata.get(ids)) | return [obj.to_dict() for obj in self._origin_intrinsic_metadata.get(ids)] | ||||
def origin_intrinsic_metadata_add( | def origin_intrinsic_metadata_add( | ||||
self, metadata: List[Dict], conflict_update: bool = False | self, metadata: List[Dict], conflict_update: bool = False | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
added = self._origin_intrinsic_metadata.add( | added = self._origin_intrinsic_metadata.add( | ||||
map(OriginIntrinsicMetadataRow.from_dict, metadata), conflict_update | map(OriginIntrinsicMetadataRow.from_dict, metadata), conflict_update | ||||
) | ) | ||||
return {"origin_intrinsic_metadata:add": added} | return {"origin_intrinsic_metadata:add": added} | ||||
Show All 27 Lines | def origin_intrinsic_metadata_search_fulltext(self, conjunction, limit=100): | ||||
results = [ | results = [ | ||||
(rank(data), data) for data in self._origin_intrinsic_metadata.get_all() | (rank(data), data) for data in self._origin_intrinsic_metadata.get_all() | ||||
] | ] | ||||
results = [(rank_, data) for (rank_, data) in results if rank_ > 0] | results = [(rank_, data) for (rank_, data) in results if rank_ > 0] | ||||
results.sort( | results.sort( | ||||
key=operator.itemgetter(0), reverse=True # Don't try to order 'data' | key=operator.itemgetter(0), reverse=True # Don't try to order 'data' | ||||
) | ) | ||||
for (rank_, result) in results[:limit]: | return [result.to_dict() for (rank_, result) in results[:limit]] | ||||
yield result.to_dict() | |||||
def origin_intrinsic_metadata_search_by_producer( | def origin_intrinsic_metadata_search_by_producer( | ||||
self, page_token="", limit=100, ids_only=False, mappings=None, tool_ids=None | self, page_token="", limit=100, ids_only=False, mappings=None, tool_ids=None | ||||
): | ): | ||||
assert isinstance(page_token, str) | assert isinstance(page_token, str) | ||||
nb_results = 0 | nb_results = 0 | ||||
if mappings is not None: | if mappings is not None: | ||||
mappings = frozenset(mappings) | mappings = frozenset(mappings) | ||||
▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines |