Changeset View
Changeset View
Standalone View
Standalone View
swh/search/in_memory.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | |||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | |||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | |||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | |||||||||||
from collections import defaultdict | from collections import defaultdict | |||||||||||
import itertools | import itertools | |||||||||||
import re | import re | |||||||||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | from typing import Any, Dict, Iterable, Iterator, List, Optional | |||||||||||
Show All 24 Lines | class InMemorySearch: | |||||||||||
_url_splitter = re.compile(r"\W") | _url_splitter = re.compile(r"\W") | |||||||||||
def origin_update(self, documents: Iterable[Dict]) -> None:
    """Insert or update origin documents in the in-memory index.

    Args:
        documents: iterable of origin dicts; each must carry the fields
            ``origin_identifier`` needs to compute a stable id
            (presumably at least ``url`` -- TODO confirm against
            ``origin_identifier``'s contract).

    Documents are keyed by ``origin_identifier(document)``; repeated
    updates for the same origin are merged field by field, and the
    ``visit_types`` lists of successive updates are unioned rather
    than overwritten.
    """
    for document in documents:
        # Shallow-copy so we never mutate the caller's dict when we
        # add derived fields below.
        document = document.copy()
        id_ = origin_identifier(document)
        if "url" in document:
            # Pre-tokenize the URL on non-word characters so URL
            # searches do not have to re-split on every query.
            document["_url_tokens"] = set(
                self._url_splitter.split(document["url"])
            )
        if "visit_types" in self._origins[id_]:
            # Merge with the visit types already recorded for this
            # origin, de-duplicating via set union.  (The original
            # code made a second, redundant copy of `document` here;
            # it is already a private copy.)
            document["visit_types"] = list(
                set(self._origins[id_]["visit_types"])
                | set(document.get("visit_types", []))
            )
        self._origins[id_].update(document)
        # Preserve first-appearance order of ids; origin_search
        # paginates over this list.
        if id_ not in self._origin_ids:
            self._origin_ids.append(id_)
def origin_search( | def origin_search( | |||||||||||
self, | self, | |||||||||||
*, | *, | |||||||||||
url_pattern: Optional[str] = None, | url_pattern: Optional[str] = None, | |||||||||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | |||||||||||
with_visit: bool = False, | with_visit: bool = False, | |||||||||||
visit_types: Optional[List[str]] = None, | ||||||||||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | |||||||||||
limit: int = 50, | limit: int = 50, | |||||||||||
) -> PagedResult[Dict[str, Any]]: | ) -> PagedResult[Dict[str, Any]]: | |||||||||||
hits: Iterator[Dict[str, Any]] = ( | hits: Iterator[Dict[str, Any]] = ( | |||||||||||
self._origins[id_] for id_ in self._origin_ids | self._origins[id_] for id_ in self._origin_ids | |||||||||||
) | ) | |||||||||||
if url_pattern: | if url_pattern: | |||||||||||
Show All 25 Lines | ) -> PagedResult[Dict[str, Any]]: | |||||||||||
"At least one of url_pattern and metadata_pattern must be provided." | "At least one of url_pattern and metadata_pattern must be provided." | |||||||||||
) | ) | |||||||||||
next_page_token: Optional[str] = None | next_page_token: Optional[str] = None | |||||||||||
if with_visit: | if with_visit: | |||||||||||
hits = filter(lambda o: o.get("has_visits"), hits) | hits = filter(lambda o: o.get("has_visits"), hits) | |||||||||||
if visit_types is not None: | ||||||||||||
visit_types_set = set(visit_types) | ||||||||||||
hits = filter( | ||||||||||||
lambda o: not (visit_types_set - set(o.get("visit_types", []))), hits | ||||||||||||
Done Inline Actions
Looks like this is equivalent. (but still probably not what we want, see my comment about the semantics of origin_search) vlorentz: Looks like this is equivalent.
(but still probably not what we want, see my comment about the… | ||||||||||||
) | ||||||||||||
start_at_index = int(page_token) if page_token else 0 | start_at_index = int(page_token) if page_token else 0 | |||||||||||
origins = [ | origins = [ | |||||||||||
{"url": hit["url"]} | {"url": hit["url"]} | |||||||||||
for hit in itertools.islice(hits, start_at_index, start_at_index + limit) | for hit in itertools.islice(hits, start_at_index, start_at_index + limit) | |||||||||||
] | ] | |||||||||||
if len(origins) == limit: | if len(origins) == limit: | |||||||||||
next_page_token = str(start_at_index + limit) | next_page_token = str(start_at_index + limit) | |||||||||||
assert len(origins) <= limit | assert len(origins) <= limit | |||||||||||
return PagedResult(results=origins, next_page_token=next_page_token,) | return PagedResult(results=origins, next_page_token=next_page_token,) |
Store a set in `document["visit_types"]`; it will make `origin_search` more readable.