Changeset View
Changeset View
Standalone View
Standalone View
swh/search/in_memory.py
Show All 9 Lines | |||||
from swh.indexer import codemeta | from swh.indexer import codemeta | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import ( | from swh.search.interface import ( | ||||
SORT_BY_OPTIONS, | SORT_BY_OPTIONS, | ||||
MinimalOriginDict, | MinimalOriginDict, | ||||
OriginDict, | OriginDict, | ||||
PagedResult, | PagedResult, | ||||
get_expansion, | |||||
) | ) | ||||
_words_regexp = re.compile(r"\w+") | _words_regexp = re.compile(r"\w+") | ||||
def _dict_words_set(d): | def _dict_words_set(d): | ||||
"""Recursively extract set of words from dict content.""" | """Recursively extract set of words from dict content.""" | ||||
values = set() | values = set() | ||||
▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines | def _get_sorting_key(origin, field): | ||||
If "-" is present at the start of field then invert the value | If "-" is present at the start of field then invert the value | ||||
in a way that it reverses the sorting order. | in a way that it reverses the sorting order. | ||||
""" | """ | ||||
reversed = False | reversed = False | ||||
if field[0] == "-": | if field[0] == "-": | ||||
field = field[1:] | field = field[1:] | ||||
reversed = True | reversed = True | ||||
if field == "score": | |||||
if reversed: | |||||
return -origin.get(field, 0) | |||||
else: | |||||
return origin.get(field, 0) | |||||
datetime_max = datetime.max.replace(tzinfo=timezone.utc) | datetime_max = datetime.max.replace(tzinfo=timezone.utc) | ||||
if field in ["nb_visits"]: # unlike other options, nb_visits is of type integer | if field in ["nb_visits"]: # unlike other options, nb_visits is of type integer | ||||
if reversed: | if reversed: | ||||
return -origin.get(field, 0) | return -origin.get(field, 0) | ||||
else: | else: | ||||
return origin.get(field, 0) | return origin.get(field, 0) | ||||
▲ Show 20 Lines • Show All 120 Lines • ▼ Show 20 Lines | class InMemorySearch: | ||||
def origin_search( | def origin_search( | ||||
self, | self, | ||||
*, | *, | ||||
url_pattern: Optional[str] = None, | url_pattern: Optional[str] = None, | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
page_token: Optional[str] = None, | |||||
min_nb_visits: int = 0, | min_nb_visits: int = 0, | ||||
min_last_visit_date: str = "", | min_last_visit_date: str = "", | ||||
min_last_eventful_visit_date: str = "", | min_last_eventful_visit_date: str = "", | ||||
min_last_revision_date: str = "", | min_last_revision_date: str = "", | ||||
min_last_release_date: str = "", | min_last_release_date: str = "", | ||||
programming_languages: List[str] = [], | programming_languages: Optional[List[str]] = None, | ||||
licenses: List[str] = [], | licenses: Optional[List[str]] = None, | ||||
sort_by: List[str] = [], | keywords: Optional[List[str]] = None, | ||||
sort_by: Optional[List[str]] = None, | |||||
page_token: Optional[str] = None, | |||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
hits: Iterator[Dict[str, Any]] = ( | hits: Iterator[Dict[str, Any]] = ( | ||||
self._origins[id_] | self._origins[id_] | ||||
for id_ in self._origin_ids | for id_ in self._origin_ids | ||||
if not self._origins[id_].get("blocklisted") | if not self._origins[id_].get("blocklisted") | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
lambda o: datetime.fromisoformat( | lambda o: datetime.fromisoformat( | ||||
o.get("last_release_date", "0001-01-01T00:00:00Z").replace( | o.get("last_release_date", "0001-01-01T00:00:00Z").replace( | ||||
"Z", "+00:00" | "Z", "+00:00" | ||||
) | ) | ||||
) | ) | ||||
>= datetime.fromisoformat(min_last_release_date), | >= datetime.fromisoformat(min_last_release_date), | ||||
hits, | hits, | ||||
) | ) | ||||
if licenses: | if licenses: | ||||
METADATA_LICENSES = [ | queried_licenses = [license_keyword.lower() for license_keyword in licenses] | ||||
"intrinsic_metadata", | |||||
"http://schema.org/license", | |||||
"@id", | |||||
] | |||||
licenses = [license_keyword.lower() for license_keyword in licenses] | |||||
hits = filter( | hits = filter( | ||||
lambda o: any( | lambda o: any( | ||||
# If any of the queried licenses are found, include the origin | # If any of the queried licenses are found, include the origin | ||||
any( | any( | ||||
# returns True if queried_license_keyword is found | # returns True if queried_license_keyword is found | ||||
# in any of the licenses of the origin | # in any of the licenses of the origin | ||||
queried_license_keyword in origin_license | queried_license_keyword in origin_license | ||||
for origin_license in _nested_get(o, METADATA_LICENSES) | for origin_license in _nested_get(o, get_expansion("licenses")) | ||||
) | ) | ||||
for queried_license_keyword in licenses | for queried_license_keyword in queried_licenses | ||||
), | ), | ||||
hits, | hits, | ||||
) | ) | ||||
if programming_languages: | if programming_languages: | ||||
METADATA_PROGRAMMING_LANGS = [ | queried_programming_languages = [ | ||||
"intrinsic_metadata", | |||||
"http://schema.org/programmingLanguage", | |||||
"@value", | |||||
] | |||||
programming_languages = [ | |||||
lang_keyword.lower() for lang_keyword in programming_languages | lang_keyword.lower() for lang_keyword in programming_languages | ||||
] | ] | ||||
hits = filter( | hits = filter( | ||||
lambda o: any( | lambda o: any( | ||||
# If any of the queried languages are found, include the origin | # If any of the queried languages are found, include the origin | ||||
any( | any( | ||||
# returns True if queried_lang_keyword is found | # returns True if queried_lang_keyword is found | ||||
# in any of the langs of the origin | # in any of the langs of the origin | ||||
queried_lang_keyword in origin_lang | queried_lang_keyword in origin_lang | ||||
for origin_lang in _nested_get(o, METADATA_PROGRAMMING_LANGS) | for origin_lang in _nested_get( | ||||
o, get_expansion("programming_languages") | |||||
) | |||||
) | ) | ||||
for queried_lang_keyword in programming_languages | for queried_lang_keyword in queried_programming_languages | ||||
), | ), | ||||
hits, | hits, | ||||
) | ) | ||||
if keywords: | |||||
if sort_by: | |||||
sort_by.append("-score") | |||||
else: | |||||
sort_by = ["-score"] | |||||
from copy import deepcopy | |||||
hits_list = deepcopy(list(hits)) | |||||
def tokenize(x): | |||||
return x.lower().replace(",", " ").split() | |||||
for origin in hits_list: | |||||
origin_keywords = [ | |||||
tokenize(keyword) | |||||
for keyword in _nested_get(origin, get_expansion("keywords")) | |||||
] | |||||
origin_descriptions = [ | |||||
tokenize(description) | |||||
for description in _nested_get( | |||||
origin, get_expansion("descriptions") | |||||
) | |||||
] | |||||
for q_keyword in keywords: | |||||
for origin_keyword_tokens in origin_keywords: | |||||
if q_keyword in origin_keyword_tokens: | |||||
origin["score"] = origin.get("score", 0) + 2 | |||||
for origin_description_token in origin_descriptions: | |||||
if q_keyword in origin_description_token: | |||||
origin["score"] = origin.get("score", 0) + 1 | |||||
hits = (origin for origin in hits_list if origin.get("score", 0) > 0) | |||||
if visit_types is not None: | if visit_types is not None: | ||||
visit_types_set = set(visit_types) | visit_types_set = set(visit_types) | ||||
hits = filter( | hits = filter( | ||||
lambda o: visit_types_set.intersection(o.get("visit_types", set())), | lambda o: visit_types_set.intersection(o.get("visit_types", set())), | ||||
hits, | hits, | ||||
) | ) | ||||
hits_list = sorted( | hits_list = list(hits) | ||||
hits, key=lambda o: tuple(_get_sorting_key(o, field) for field in sort_by), | if sort_by: | ||||
sort_by_list = list(sort_by) | |||||
hits_list.sort( | |||||
key=lambda o: tuple( | |||||
_get_sorting_key(o, field) for field in sort_by_list | |||||
) | |||||
) | ) | ||||
vlorentz: You can sort in place; there is no need to make a copy first.
start_at_index = int(page_token) if page_token else 0 | start_at_index = int(page_token) if page_token else 0 | ||||
origins = [ | origins = [ | ||||
{"url": hit["url"]} | {"url": hit["url"]} | ||||
for hit in hits_list[start_at_index : start_at_index + limit] | for hit in hits_list[start_at_index : start_at_index + limit] | ||||
] | ] | ||||
if len(origins) == limit: | if len(origins) == limit: | ||||
next_page_token = str(start_at_index + limit) | next_page_token = str(start_at_index + limit) | ||||
assert len(origins) <= limit | assert len(origins) <= limit | ||||
return PagedResult(results=origins, next_page_token=next_page_token,) | return PagedResult(results=origins, next_page_token=next_page_token,) |
You can sort in place; there is no need to make a copy first.