Changeset View
Standalone View
swh/search/in_memory.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | from collections import defaultdict | ||||
from datetime import datetime | from datetime import datetime, timezone | ||||
import itertools | |||||
import re | import re | ||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | from typing import Any, Dict, Iterable, Iterator, List, Optional | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import MinimalOriginDict, OriginDict, PagedResult | from swh.search.interface import ( | ||||
SORT_BY_OPTIONS, | |||||
MinimalOriginDict, | |||||
OriginDict, | |||||
PagedResult, | |||||
) | |||||
_words_regexp = re.compile(r"\w+") | _words_regexp = re.compile(r"\w+") | ||||
def _dict_words_set(d): | def _dict_words_set(d): | ||||
"""Recursively extract set of words from dict content.""" | """Recursively extract set of words from dict content.""" | ||||
values = set() | values = set() | ||||
def extract(obj, words): | def extract(obj, words): | ||||
if isinstance(obj, dict): | if isinstance(obj, dict): | ||||
for k, v in obj.items(): | for k, v in obj.items(): | ||||
extract(v, words) | extract(v, words) | ||||
elif isinstance(obj, list): | elif isinstance(obj, list): | ||||
for item in obj: | for item in obj: | ||||
extract(item, words) | extract(item, words) | ||||
else: | else: | ||||
words.update(_words_regexp.findall(str(obj).lower())) | words.update(_words_regexp.findall(str(obj).lower())) | ||||
return words | return words | ||||
return extract(d, values) | return extract(d, values) | ||||
def _get_sorting_key(origin, field): | |||||
"""Get value of the field from an origin for sorting origins. | |||||
Here field should be a member of SORT_BY_OPTIONS. | |||||
If "-" is present at the start of field then invert the value | |||||
in a way that it reverses the sorting order. | |||||
""" | |||||
reversed = False | |||||
if field[0] == "-": | |||||
field = field[1:] | |||||
reversed = True | |||||
datetime_max = datetime.max.replace(tzinfo=timezone.utc) | |||||
if field in ["nb_visits"]: # unlike other options, nb_visits is of type integer | |||||
if reversed: | |||||
return -origin.get(field, 0) | |||||
else: | |||||
return origin.get(field, 0) | |||||
elif field in SORT_BY_OPTIONS: | |||||
if reversed: | |||||
return datetime_max - datetime.fromisoformat( | |||||
origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00") | |||||
) | |||||
else: | |||||
return datetime.fromisoformat( | |||||
origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00") | |||||
) | |||||
class InMemorySearch: | class InMemorySearch: | ||||
def __init__(self): | def __init__(self): | ||||
pass | pass | ||||
def check(self): | def check(self): | ||||
return True | return True | ||||
def deinitialize(self) -> None: | def deinitialize(self) -> None: | ||||
▲ Show 20 Lines • Show All 87 Lines • ▼ Show 20 Lines | def origin_search( | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
min_nb_visits: int = 0, | min_nb_visits: int = 0, | ||||
min_last_visit_date: str = "", | min_last_visit_date: str = "", | ||||
min_last_eventful_visit_date: str = "", | min_last_eventful_visit_date: str = "", | ||||
min_last_revision_date: str = "", | min_last_revision_date: str = "", | ||||
min_last_release_date: str = "", | min_last_release_date: str = "", | ||||
sort_by: List[str] = [], | |||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
hits: Iterator[Dict[str, Any]] = ( | hits: Iterator[Dict[str, Any]] = ( | ||||
self._origins[id_] | self._origins[id_] | ||||
for id_ in self._origin_ids | for id_ in self._origin_ids | ||||
if not self._origins[id_].get("blocklisted") | if not self._origins[id_].get("blocklisted") | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 87 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
if visit_types is not None: | if visit_types is not None: | ||||
visit_types_set = set(visit_types) | visit_types_set = set(visit_types) | ||||
hits = filter( | hits = filter( | ||||
lambda o: visit_types_set.intersection(o.get("visit_types", set())), | lambda o: visit_types_set.intersection(o.get("visit_types", set())), | ||||
hits, | hits, | ||||
) | ) | ||||
hits_list = sorted( | |||||
hits, key=lambda o: tuple(_get_sorting_key(o, field) for field in sort_by), | |||||
vlorentz: you can also remove this one | |||||
) | |||||
start_at_index = int(page_token) if page_token else 0 | start_at_index = int(page_token) if page_token else 0 | ||||
origins = [ | origins = [ | ||||
{"url": hit["url"]} | {"url": hit["url"]} | ||||
for hit in itertools.islice(hits, start_at_index, start_at_index + limit) | for hit in hits_list[start_at_index : start_at_index + limit] | ||||
Done Inline Actions: sorting the Iterable hits converts it into a List and hence renders itertools.islice useless, so I replaced it with [start:end]. Can we do something better than this? KShivendu: sorting the Iterable `hits` converts it into a List and hence renders itertools.islice useless… | |||||
Not Done Inline Actionswhy useless? ardumont: why useless? | |||||
Done Inline ActionsMy bad. I had some misunderstanding. I just wanted to say that, according to https://stackoverflow.com/questions/41079001/python-3-5-slice-vs-islice-vs-alternatives-efficiency-comparison , simply doing mylist[start:stop] is faster than using list(itertools.islice(..)). So it doesn't have any performance increase plus it imports itertools just for this one line. So what do you suggest ? KShivendu: My bad. I had some misunderstanding.
I just wanted to say that, according to https… | |||||
Not Done Inline Actionsyou're right, we don't need islice anymore. vlorentz: you're right, we don't need islice anymore. | |||||
Not Done Inline Actionsright, thanks for the clarification ;) ardumont: right, thanks for the clarification ;) | |||||
] | ] | ||||
Not Done Inline Actionswhy isn't there nb_visits in the sort params here? It's there in the main implementation (.../elasticsearch.py). ardumont: why isn't there nb_visits in the sort params here? It's there in the main implementation (... | |||||
if len(origins) == limit: | if len(origins) == limit: | ||||
next_page_token = str(start_at_index + limit) | next_page_token = str(start_at_index + limit) | ||||
assert len(origins) <= limit | assert len(origins) <= limit | ||||
return PagedResult(results=origins, next_page_token=next_page_token,) | return PagedResult(results=origins, next_page_token=next_page_token,) | ||||
Not Done Inline Actions: Can you move this outside the function, improve the name (if possible), and add a docstring? vlorentz: Can you move this outside the function, improve the name (if possible), and add a docstring? |
you can also remove this one