Changeset View
Changeset View
Standalone View
Standalone View
swh/search/in_memory.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | from collections import defaultdict | ||||
import itertools | import itertools | ||||
import re | import re | ||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | from typing import Any, Dict, Iterable, Iterator, List, Optional | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import PagedResult | from swh.search.interface import MinimalOriginDict, OriginDict, PagedResult | ||||
# Matches each maximal run of word characters (letters, digits, underscore).
_words_regexp = re.compile(r"\w+")
def _dict_words_set(d): | def _dict_words_set(d): | ||||
"""Recursively extract set of words from dict content.""" | """Recursively extract set of words from dict content.""" | ||||
values = set() | values = set() | ||||
Show All 27 Lines | def initialize(self) -> None: | ||||
self._origins: Dict[str, Dict[str, Any]] = defaultdict(dict) | self._origins: Dict[str, Dict[str, Any]] = defaultdict(dict) | ||||
self._origin_ids: List[str] = [] | self._origin_ids: List[str] = [] | ||||
def flush(self) -> None:
    """Do nothing; this implementation has no work to perform on flush."""
    pass
# Splits a string on every single non-word character. Consecutive separators
# therefore yield empty-string tokens (e.g. "a://b" -> ["a", "", "", "b"]).
# Used by origin_update to tokenize origin URLs and by origin_search to
# tokenize the URL pattern, so both sides are split the same way.
_url_splitter = re.compile(r"\W")
def origin_update(self, documents: Iterable[OriginDict]) -> None:
    """Insert or merge origin documents into the in-memory store.

    For every document:

    * a plain-dict copy is made, so the caller's document is not modified;
    * its identifier is computed with ``origin_identifier``;
    * if it has a ``url``, the set of tokens obtained by splitting the URL
      with ``_url_splitter`` is stored under ``_url_tokens``;
    * if it has ``visit_types``, they are converted to a set and unioned
      with the visit types already stored for that identifier;
    * the result is merged into the stored entry with ``dict.update``, so
      keys absent from the new document keep their previous values;
    * the identifier is appended to ``_origin_ids`` the first time it is
      seen, which preserves insertion order.

    Args:
        documents: origin documents to add or merge.
    """
    for source_document in documents:
        # Copy into a loosely-typed dict: private keys (``_url_tokens``) and
        # a set-valued ``visit_types`` are added below.
        document: Dict[str, Any] = dict(source_document)
        id_ = origin_identifier(document)
        if "url" in document:
            document["_url_tokens"] = set(
                self._url_splitter.split(source_document["url"])
            )
        if "visit_types" in document:
            document["visit_types"] = set(source_document["visit_types"])
            # ``_origins`` is a defaultdict, so this lookup yields an empty
            # entry for identifiers that have not been stored yet.
            if "visit_types" in self._origins[id_]:
                document["visit_types"].update(self._origins[id_]["visit_types"])
        self._origins[id_].update(document)

        if id_ not in self._origin_ids:
            self._origin_ids.append(id_)
def origin_search( | def origin_search( | ||||
self, | self, | ||||
*, | *, | ||||
url_pattern: Optional[str] = None, | url_pattern: Optional[str] = None, | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[Dict[str, Any]]: | ) -> PagedResult[MinimalOriginDict]: | ||||
hits: Iterator[Dict[str, Any]] = ( | hits: Iterator[Dict[str, Any]] = ( | ||||
self._origins[id_] for id_ in self._origin_ids | self._origins[id_] for id_ in self._origin_ids | ||||
) | ) | ||||
if url_pattern: | if url_pattern: | ||||
tokens = set(self._url_splitter.split(url_pattern)) | tokens = set(self._url_splitter.split(url_pattern)) | ||||
def predicate(match): | def predicate(match): | ||||
▲ Show 20 Lines • Show All 60 Lines • Show Last 20 Lines |