Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
Show All 12 Lines | |||||
from swh.indexer import codemeta | from swh.indexer import codemeta | ||||
from swh.model import model | from swh.model import model | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import ( | from swh.search.interface import ( | ||||
SORT_BY_OPTIONS, | SORT_BY_OPTIONS, | ||||
MinimalOriginDict, | MinimalOriginDict, | ||||
OriginDict, | OriginDict, | ||||
PagedResult, | PagedResult, | ||||
get_expansion, | |||||
) | ) | ||||
from swh.search.metrics import send_metric, timed | from swh.search.metrics import send_metric, timed | ||||
INDEX_NAME_PARAM = "index" | INDEX_NAME_PARAM = "index" | ||||
READ_ALIAS_PARAM = "read_alias" | READ_ALIAS_PARAM = "read_alias" | ||||
WRITE_ALIAS_PARAM = "write_alias" | WRITE_ALIAS_PARAM = "write_alias" | ||||
ORIGIN_DEFAULT_CONFIG = { | ORIGIN_DEFAULT_CONFIG = { | ||||
▲ Show 20 Lines • Show All 299 Lines • ▼ Show 20 Lines | def origin_search( | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
min_nb_visits: int = 0, | min_nb_visits: int = 0, | ||||
min_last_visit_date: str = "", | min_last_visit_date: str = "", | ||||
min_last_eventful_visit_date: str = "", | min_last_eventful_visit_date: str = "", | ||||
min_last_revision_date: str = "", | min_last_revision_date: str = "", | ||||
min_last_release_date: str = "", | min_last_release_date: str = "", | ||||
programming_languages: List[str] = [], | programming_languages: Optional[List[str]] = None, | ||||
licenses: List[str] = [], | licenses: Optional[List[str]] = None, | ||||
keywords: Optional[List[str]] = None, | |||||
sort_by: Optional[List[str]] = None, | |||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
sort_by: List[str] = [], | |||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
query_clauses: List[Dict[str, Any]] = [] | query_clauses: List[Dict[str, Any]] = [] | ||||
if url_pattern: | if url_pattern: | ||||
query_clauses.append( | query_clauses.append( | ||||
{ | { | ||||
"multi_match": { | "multi_match": { | ||||
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
{ | { | ||||
"range": { | "range": { | ||||
"last_release_date": { | "last_release_date": { | ||||
"gte": min_last_release_date.replace("Z", "+00:00"), | "gte": min_last_release_date.replace("Z", "+00:00"), | ||||
} | } | ||||
} | } | ||||
} | } | ||||
) | ) | ||||
if keywords: | |||||
query_clauses.append( | |||||
{ | |||||
"nested": { | |||||
"path": "intrinsic_metadata", | |||||
"query": { | |||||
"multi_match": { | |||||
"query": " ".join(keywords), | |||||
"fields": [ | |||||
get_expansion("keywords", ".") + "^2", | |||||
get_expansion("descriptions", "."), | |||||
# "^2" boosts an origin's score by 2x | |||||
# if the queried keywords are | |||||
vlorentz: what does `^2` do? | |||||
Done Inline Actions KShivendu: Boosts a result's score by 2 times (might not be exactly 2x, but that's the idea) if the queried keywords are found in that field. | |||||
# found in its intrinsic_metadata.keywords | |||||
], | |||||
} | |||||
}, | |||||
} | |||||
} | |||||
) | |||||
if licenses or programming_languages: | intrinsic_metadata_filters: List[Dict[str, Dict]] = [] | ||||
license_filters = [] | if licenses: | ||||
license_filters: List[Dict[str, Any]] = [] | |||||
for license in licenses: | for license in licenses: | ||||
license_filters.append( | license_filters.append( | ||||
{ | {"match": {get_expansion("licenses", "."): license}} | ||||
Done Inline Actions vlorentz: why these two changes? | |||||
"match": { | |||||
( | |||||
"intrinsic_metadata" ".http://schema.org/license" ".@id" | |||||
): license | |||||
} | |||||
} | |||||
) | ) | ||||
intrinsic_metadata_filters.append({"bool": {"should": license_filters}}) | |||||
language_filters = [] | if programming_languages: | ||||
language_filters: List[Dict[str, Any]] = [] | |||||
for language in programming_languages: | for language in programming_languages: | ||||
language_filters.append( | language_filters.append( | ||||
{ | {"match": {get_expansion("programming_languages", "."): language}} | ||||
"match": { | |||||
( | |||||
"intrinsic_metadata" | |||||
".http://schema.org/programmingLanguage" | |||||
".@value" | |||||
): language | |||||
} | |||||
} | |||||
) | ) | ||||
intrinsic_metadata_filters.append({"bool": {"should": language_filters}}) | |||||
intrinsic_metadata_filters = [ | if intrinsic_metadata_filters: | ||||
{"bool": {"should": license_filters}}, | |||||
{"bool": {"should": language_filters}}, | |||||
] | |||||
query_clauses.append( | query_clauses.append( | ||||
{ | { | ||||
"nested": { | "nested": { | ||||
"path": "intrinsic_metadata", | "path": "intrinsic_metadata", | ||||
"query": {"bool": {"must": intrinsic_metadata_filters,}}, | "query": {"bool": {"must": intrinsic_metadata_filters,}}, | ||||
# "must" is equivalent to "AND" | # "must" is equivalent to "AND" | ||||
# "should" is equivalent to "OR" | # "should" is equivalent to "OR" | ||||
# Resulting origins must return true for the following: | # Resulting origins must return true for the following: | ||||
# (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..) | # (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..) | ||||
# This is equivalent to {"must": [ | # This is equivalent to {"must": [ | ||||
# {"should": [license_1,license_2] }, | # {"should": [license_1,license_2] }, | ||||
# {"should": [lang_1,lang_2]} | # {"should": [lang_1,lang_2]} | ||||
# ]} | # ]} | ||||
# Note: Usage of "bool" has been omitted for readability | # Note: Usage of "bool" has been omitted for readability | ||||
} | } | ||||
} | } | ||||
) | ) | ||||
if visit_types is not None: | if visit_types is not None: | ||||
query_clauses.append({"terms": {"visit_types": visit_types}}) | query_clauses.append({"terms": {"visit_types": visit_types}}) | ||||
sorting_params = [] | sorting_params = [] | ||||
if sort_by: | |||||
for field in sort_by: | for field in sort_by: | ||||
order = "asc" | order = "asc" | ||||
if field and field[0] == "-": | if field and field[0] == "-": | ||||
field = field[1:] | field = field[1:] | ||||
order = "desc" | order = "desc" | ||||
if field in SORT_BY_OPTIONS: | if field in SORT_BY_OPTIONS: | ||||
sorting_params.append({field: order}) | sorting_params.append({field: order}) | ||||
sorting_params.extend( | sorting_params.extend( | ||||
[{"_score": "desc"}, {"sha1": "asc"},] | [{"_score": "desc"}, {"sha1": "asc"},] | ||||
) | ) | ||||
body = { | body = { | ||||
"query": { | "query": { | ||||
"bool": { | "bool": { | ||||
Show All 39 Lines |
what does ^2 do?