Differential D5918 Diff 21300 swh/search/in_memory.py

Changeset View

Standalone View

swh/search/in_memory.py

# Copyright (C) 2019-2021 The Software Heritage developers		# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

from collections import defaultdict		from collections import defaultdict
from datetime import datetime		from datetime import datetime, timezone
import itertools
import re		import re
from typing import Any, Dict, Iterable, Iterator, List, Optional		from typing import Any, Dict, Iterable, Iterator, List, Optional

from swh.model.identifiers import origin_identifier		from swh.model.identifiers import origin_identifier
from swh.search.interface import MinimalOriginDict, OriginDict, PagedResult		from swh.search.interface import (
		SORT_BY_OPTIONS,
		MinimalOriginDict,
		OriginDict,
		PagedResult,
		)

_words_regexp = re.compile(r"\w+")		_words_regexp = re.compile(r"\w+")


def _dict_words_set(d):		def _dict_words_set(d):
"""Recursively extract set of words from dict content."""		"""Recursively extract set of words from dict content."""
values = set()		values = set()

def extract(obj, words):		def extract(obj, words):
if isinstance(obj, dict):		if isinstance(obj, dict):
for k, v in obj.items():		for k, v in obj.items():
extract(v, words)		extract(v, words)
elif isinstance(obj, list):		elif isinstance(obj, list):
for item in obj:		for item in obj:
extract(item, words)		extract(item, words)
else:		else:
words.update(_words_regexp.findall(str(obj).lower()))		words.update(_words_regexp.findall(str(obj).lower()))
return words		return words

return extract(d, values)		return extract(d, values)


		def _get_sorting_key(origin, field):
		"""Get value of the field from an origin for sorting origins.

		Here field should be a member of SORT_BY_OPTIONS.
		If "-" is present at the start of field then invert the value
		in a way that it reverses the sorting order.
		"""
		reversed = False
		if field[0] == "-":
		field = field[1:]
		reversed = True

		datetime_max = datetime.max.replace(tzinfo=timezone.utc)

		if field in ["nb_visits"]: # unlike other options, nb_visits is of type integer
		if reversed:
		return -origin.get(field, 0)
		else:
		return origin.get(field, 0)

		elif field in SORT_BY_OPTIONS:
		if reversed:
		return datetime_max - datetime.fromisoformat(
		origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00")
		)
		else:
		return datetime.fromisoformat(
		origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00")
		)


class InMemorySearch:		class InMemorySearch:
def __init__(self):		def __init__(self):
pass		pass

def check(self):		def check(self):
return True		return True

def deinitialize(self) -> None:		def deinitialize(self) -> None:
▲ Show 20 Lines • Show All 87 Lines • ▼ Show 20 Lines	def origin_search(
with_visit: bool = False,		with_visit: bool = False,
visit_types: Optional[List[str]] = None,		visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,		page_token: Optional[str] = None,
min_nb_visits: int = 0,		min_nb_visits: int = 0,
min_last_visit_date: str = "",		min_last_visit_date: str = "",
min_last_eventful_visit_date: str = "",		min_last_eventful_visit_date: str = "",
min_last_revision_date: str = "",		min_last_revision_date: str = "",
min_last_release_date: str = "",		min_last_release_date: str = "",
		sort_by: List[str] = [],
limit: int = 50,		limit: int = 50,
) -> PagedResult[MinimalOriginDict]:		) -> PagedResult[MinimalOriginDict]:
hits: Iterator[Dict[str, Any]] = (		hits: Iterator[Dict[str, Any]] = (
self._origins[id_]		self._origins[id_]
for id_ in self._origin_ids		for id_ in self._origin_ids
if not self._origins[id_].get("blocklisted")		if not self._origins[id_].get("blocklisted")
)		)

▲ Show 20 Lines • Show All 87 Lines • ▼ Show 20 Lines	) -> PagedResult[MinimalOriginDict]:

if visit_types is not None:		if visit_types is not None:
visit_types_set = set(visit_types)		visit_types_set = set(visit_types)
hits = filter(		hits = filter(
lambda o: visit_types_set.intersection(o.get("visit_types", set())),		lambda o: visit_types_set.intersection(o.get("visit_types", set())),
hits,		hits,
)		)

		hits_list = sorted(
		hits, key=lambda o: tuple(_get_sorting_key(o, field) for field in sort_by),
		vlorentzUnsubmitted Not Done Inline Actions you can also remove this one vlorentz: you can also remove this one
		)

start_at_index = int(page_token) if page_token else 0		start_at_index = int(page_token) if page_token else 0

origins = [		origins = [
{"url": hit["url"]}		{"url": hit["url"]}
for hit in itertools.islice(hits, start_at_index, start_at_index + limit)		for hit in hits_list[start_at_index : start_at_index + limit]
		KShivenduAuthorUnsubmitted Done Inline Actions sorting the Iterable `hits` converts it into a List and hence renders itertools.islice useless so I replaced it with `[start:end]` Can we do something better than this ? KShivendu: sorting the Iterable `hits` converts it into a List and hence renders itertools.islice useless…
		ardumontUnsubmitted Not Done Inline Actions why useless? ardumont: why useless?
		KShivenduAuthorUnsubmitted Done Inline Actions My bad. I had some misunderstanding. I just wanted to say that, according to https://stackoverflow.com/questions/41079001/python-3-5-slice-vs-islice-vs-alternatives-efficiency-comparison , simply doing `mylist[start:stop]` is faster than using `list(itertools.islice(..))`. So it doesn't have any performance increase plus it imports itertools just for this one line. So what do you suggest ? KShivendu: My bad. I had some misunderstanding. I just wanted to say that, according to https…
		vlorentzUnsubmitted Not Done Inline Actions you're right, we don't need islice anymore. vlorentz: you're right, we don't need islice anymore.
		ardumontUnsubmitted Not Done Inline Actions right, thanks for the clarification ;) ardumont: right, thanks for the clarification ;)
]		]

		ardumontUnsubmitted Not Done Inline Actions why isn't there nb_visits in the sort params here? It's there in the main implementation (.../elasticsearch.py). ardumont: why isn't there nb_visits in the sort params here? It's there in the main implementation (...
if len(origins) == limit:		if len(origins) == limit:
next_page_token = str(start_at_index + limit)		next_page_token = str(start_at_index + limit)

assert len(origins) <= limit		assert len(origins) <= limit

return PagedResult(results=origins, next_page_token=next_page_token,)		return PagedResult(results=origins, next_page_token=next_page_token,)
		vlorentzUnsubmitted Not Done Inline Actions Can you move this outside the function, improve the name (if possible), and add a docstring? vlorentz: Can you move this outside the function, improve the name (if possible), and add a docstring?