Differential D5963 Diff 21471 swh/search/in_memory.py

Changeset View

Standalone View

View Options

swh/search/in_memory.py

Show All 9 Lines

from swh.indexer import codemeta

from swh.model.identifiers import origin_identifier

from swh.search.interface import (

SORT_BY_OPTIONS,

MinimalOriginDict,

OriginDict,

PagedResult,

get_expansion,

)

_words_regexp = re.compile(r"\w+")

def _dict_words_set(d):

"""Recursively extract set of words from dict content."""

values = set()

▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines

def _get_sorting_key(origin, field):

If "-" is present at the start of field then invert the value

in a way that it reverses the sorting order.

"""

reversed = False

if field[0] == "-":

field = field[1:]

reversed = True

if field == "score":

if reversed:

return -origin.get(field, 0)

else:

return origin.get(field, 0)

datetime_max = datetime.max.replace(tzinfo=timezone.utc)

if field in ["nb_visits"]: # unlike other options, nb_visits is of type integer

if reversed:

return -origin.get(field, 0)

else:

return origin.get(field, 0)

▲ Show 20 Lines • Show All 120 Lines • ▼ Show 20 Lines

class InMemorySearch:

def origin_search(

self,

url_pattern: Optional[str] = None,

metadata_pattern: Optional[str] = None,

with_visit: bool = False,

visit_types: Optional[List[str]] = None,

page_token: Optional[str] = None,

min_nb_visits: int = 0,

min_last_visit_date: str = "",

min_last_eventful_visit_date: str = "",

min_last_revision_date: str = "",

min_last_release_date: str = "",

programming_languages: List[str] = [],

programming_languages: Optional[List[str]] = None,

licenses: List[str] = [],

licenses: Optional[List[str]] = None,

sort_by: List[str] = [],

keywords: Optional[List[str]] = None,

sort_by: Optional[List[str]] = None,

page_token: Optional[str] = None,

limit: int = 50,

) -> PagedResult[MinimalOriginDict]:

hits: Iterator[Dict[str, Any]] = (

self._origins[id_]

for id_ in self._origin_ids

if not self._origins[id_].get("blocklisted")

)

▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines

) -> PagedResult[MinimalOriginDict]:

lambda o: datetime.fromisoformat(

o.get("last_release_date", "0001-01-01T00:00:00Z").replace(

"Z", "+00:00"

)

>= datetime.fromisoformat(min_last_release_date),

hits,

)

if licenses:

METADATA_LICENSES = [

queried_licenses = [license_keyword.lower() for license_keyword in licenses]

"intrinsic_metadata",

"http://schema.org/license",

"@id",

]

licenses = [license_keyword.lower() for license_keyword in licenses]

hits = filter(

lambda o: any(

# If any of the queried licenses are found, include the origin

any(

# returns True if queried_license_keyword is found

# in any of the licenses of the origin

queried_license_keyword in origin_license

for origin_license in _nested_get(o, METADATA_LICENSES)

for origin_license in _nested_get(o, get_expansion("licenses"))

)

for queried_license_keyword in licenses

for queried_license_keyword in queried_licenses

hits,

)

if programming_languages:

METADATA_PROGRAMMING_LANGS = [

queried_programming_languages = [

"intrinsic_metadata",

"http://schema.org/programmingLanguage",

"@value",

]

programming_languages = [

lang_keyword.lower() for lang_keyword in programming_languages

]

hits = filter(

lambda o: any(

# If any of the queried languages are found, include the origin

any(

# returns True if queried_lang_keyword is found

# in any of the langs of the origin

queried_lang_keyword in origin_lang

for origin_lang in _nested_get(o, METADATA_PROGRAMMING_LANGS)

for origin_lang in _nested_get(

o, get_expansion("programming_languages")

)

for queried_lang_keyword in programming_languages

for queried_lang_keyword in queried_programming_languages

hits,

)

if keywords:

if sort_by:

sort_by.append("-score")

else:

sort_by = ["-score"]

from copy import deepcopy

hits_list = deepcopy(list(hits))

def tokenize(x):

return x.lower().replace(",", " ").split()

for origin in hits_list:

origin_keywords = [

tokenize(keyword)

for keyword in _nested_get(origin, get_expansion("keywords"))

]

origin_descriptions = [

tokenize(description)

for description in _nested_get(

origin, get_expansion("descriptions")

)

]

for q_keyword in keywords:

for origin_keyword_tokens in origin_keywords:

if q_keyword in origin_keyword_tokens:

origin["score"] = origin.get("score", 0) + 2

for origin_description_token in origin_descriptions:

if q_keyword in origin_description_token:

origin["score"] = origin.get("score", 0) + 1

hits = (origin for origin in hits_list if origin.get("score", 0) > 0)

if visit_types is not None:

visit_types_set = set(visit_types)

hits = filter(

lambda o: visit_types_set.intersection(o.get("visit_types", set())),

hits,

)

hits_list = sorted(

hits_list = list(hits)

hits, key=lambda o: tuple(_get_sorting_key(o, field) for field in sort_by),

if sort_by:

sort_by_list = list(sort_by)

hits_list.sort(

key=lambda o: tuple(

_get_sorting_key(o, field) for field in sort_by_list

)

vlorentzUnsubmitted

Done

sort_by_list = list(sort_by)

- hits_list = sorted(

- hits_list,

+ hits_list.sort(

key=lambda o: tuple(

_get_sorting_key(o, field) for field in sort_by_list

)

start_at_index = int(page_token) if page_token else 0

You can sort the list in place with `hits_list.sort(...)`; there is no need to create a copy with `sorted(...)`.

vlorentz: You can sort the list in place with `hits_list.sort(...)`; there is no need to create a copy with `sorted(...)`.

start_at_index = int(page_token) if page_token else 0

origins = [

{"url": hit["url"]}

for hit in hits_list[start_at_index : start_at_index + limit]

]

if len(origins) == limit:

next_page_token = str(start_at_index + limit)

assert len(origins) <= limit

return PagedResult(results=origins, next_page_token=next_page_token,)