Changeset View
Changeset View
Standalone View
Standalone View
swh/search/in_memory.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | |||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | |||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | |||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | |||||||||||
from collections import defaultdict | from collections import defaultdict | |||||||||||
import itertools | import itertools | |||||||||||
import re | import re | |||||||||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | from typing import Any, Dict, Iterable, Iterator, List, Optional | |||||||||||
Show All 24 Lines | class InMemorySearch: | |||||||||||
_url_splitter = re.compile(r"\W") | _url_splitter = re.compile(r"\W") | |||||||||||
def origin_update(self, documents: Iterable[Dict]) -> None:
    """Insert or update origin documents in the in-memory index.

    Args:
        documents: iterable of origin dicts; each must carry the fields
            ``origin_identifier`` needs to compute a stable id
            (presumably at least ``url`` -- TODO confirm against
            ``origin_identifier``'s contract).

    Documents are keyed by ``origin_identifier(document)``; repeated
    updates for the same origin are merged field by field, and the
    ``visit_types`` lists of successive updates are unioned rather
    than overwritten.
    """
    for document in documents:
        # Shallow-copy so we never mutate the caller's dict when we
        # add derived fields below.
        document = document.copy()
        id_ = origin_identifier(document)
        if "url" in document:
            # Pre-tokenize the URL on non-word characters so URL
            # searches do not have to re-split on every query.
            document["_url_tokens"] = set(
                self._url_splitter.split(document["url"])
            )
        if "visit_types" in self._origins[id_]:
            # Merge with the visit types already recorded for this
            # origin, de-duplicating via set union.  (The original
            # code made a second, redundant copy of `document` here;
            # it is already a private copy.)
            document["visit_types"] = list(
                set(self._origins[id_]["visit_types"])
                | set(document.get("visit_types", []))
            )
        self._origins[id_].update(document)
        # Preserve first-appearance order of ids; origin_search
        # paginates over this list.
        if id_ not in self._origin_ids:
            self._origin_ids.append(id_)
def origin_search( | def origin_search( | |||||||||||
self, | self, | |||||||||||
*, | *, | |||||||||||
url_pattern: Optional[str] = None, | url_pattern: Optional[str] = None, | |||||||||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | |||||||||||
with_visit: bool = False, | with_visit: bool = False, | |||||||||||
visit_types: Optional[List[str]] = None, | ||||||||||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | |||||||||||
limit: int = 50, | limit: int = 50, | |||||||||||
) -> PagedResult[Dict[str, Any]]: | ) -> PagedResult[Dict[str, Any]]: | |||||||||||
hits: Iterator[Dict[str, Any]] = ( | hits: Iterator[Dict[str, Any]] = ( | |||||||||||
self._origins[id_] for id_ in self._origin_ids | self._origins[id_] for id_ in self._origin_ids | |||||||||||
) | ) | |||||||||||
if url_pattern: | if url_pattern: | |||||||||||
Show All 25 Lines | ) -> PagedResult[Dict[str, Any]]: | |||||||||||
"At least one of url_pattern and metadata_pattern must be provided." | "At least one of url_pattern and metadata_pattern must be provided." | |||||||||||
) | ) | |||||||||||
next_page_token: Optional[str] = None | next_page_token: Optional[str] = None | |||||||||||
if with_visit: | if with_visit: | |||||||||||
hits = filter(lambda o: o.get("has_visits"), hits) | hits = filter(lambda o: o.get("has_visits"), hits) | |||||||||||
if visit_types is not None: | ||||||||||||
visit_types_set = set(visit_types) | ||||||||||||
hits = filter( | ||||||||||||
lambda o: not (visit_types_set - set(o.get("visit_types", []))), hits | ||||||||||||
Done Inline Actions
Looks like this is equivalent. (but still probably not what we want, see my comment about the semantics of origin_search) vlorentz: Looks like this is equivalent.
(but still probably not what we want, see my comment about the… | ||||||||||||
) | ||||||||||||
start_at_index = int(page_token) if page_token else 0 | start_at_index = int(page_token) if page_token else 0 | |||||||||||
origins = [ | origins = [ | |||||||||||
{"url": hit["url"]} | {"url": hit["url"]} | |||||||||||
for hit in itertools.islice(hits, start_at_index, start_at_index + limit) | for hit in itertools.islice(hits, start_at_index, start_at_index + limit) | |||||||||||
] | ] | |||||||||||
if len(origins) == limit: | if len(origins) == limit: | |||||||||||
next_page_token = str(start_at_index + limit) | next_page_token = str(start_at_index + limit) | |||||||||||
assert len(origins) <= limit | assert len(origins) <= limit | |||||||||||
return PagedResult(results=origins, next_page_token=next_page_token,) | return PagedResult(results=origins, next_page_token=next_page_token,) |
Store a set in `document["visit_types"]`; it will make `origin_search` more readable.