Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
Show First 20 Lines • Show All 327 Lines • ▼ Show 20 Lines | def origin_search( | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
min_nb_visits: int = 0, | min_nb_visits: int = 0, | ||||
min_last_visit_date: str = "", | min_last_visit_date: str = "", | ||||
min_last_eventful_visit_date: str = "", | min_last_eventful_visit_date: str = "", | ||||
min_last_revision_date: str = "", | min_last_revision_date: str = "", | ||||
min_last_release_date: str = "", | min_last_release_date: str = "", | ||||
programming_languages: List[str] = [], | |||||
licenses: List[str] = [], | |||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
sort_by: List[str] = [], | sort_by: List[str] = [], | ||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
query_clauses: List[Dict[str, Any]] = [] | query_clauses: List[Dict[str, Any]] = [] | ||||
if url_pattern: | if url_pattern: | ||||
query_clauses.append( | query_clauses.append( | ||||
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
query_clauses.append( | query_clauses.append( | ||||
{ | { | ||||
"range": { | "range": { | ||||
"last_release_date": { | "last_release_date": { | ||||
"gte": min_last_release_date.replace("Z", "+00:00"), | "gte": min_last_release_date.replace("Z", "+00:00"), | ||||
} | } | ||||
} | } | ||||
} | } | ||||
) | ) | ||||
KShivendu: I'll add the tests for license and programming_language based on `origin_search` in the next… | |||||
if licenses or programming_languages: | |||||
intrinsic_metadata_fields = [] | |||||
vlorentzUnsubmitted Not Done Inline Actionsdo you need to make this conditional? vlorentz: do you need to make this conditional? | |||||
KShivenduAuthorUnsubmitted Done Inline ActionsI can remove it, but that would add this (even for requests without license or programmingLanguage): { "nested": { "path": "intrinsic_metadata", "query": {"bool": {"should": [],}}, } } which is an unnecessary clause that bloats the body of every request to ES. KShivendu: I can remove it. But that would add this (even for requests without `license` or… | |||||
Done Inline Actionsshould it be "term" instead of "match" ? KShivendu: should it be "term" instead of "match" ? | |||||
Not Done Inline ActionsWhat are the differences? vlorentz: What are the differences? | |||||
Done Inline ActionsAfaik, "term" = exact match. For example, "term" should be case-sensitive, but "match" should not. Also, "terms" = exact match against the elements of the given list. KShivendu: Afaik,
"term" = Exact match
"match" = Applies some preprocessing(analyzer) steps before… | |||||
if licenses: | |||||
for license in licenses: | |||||
intrinsic_metadata_fields.append( | |||||
{ | |||||
"match": { | |||||
( | |||||
"intrinsic_metadata" | |||||
".http://schema.org/license" | |||||
".@id" | |||||
): license | |||||
} | |||||
} | |||||
) | |||||
if programming_languages: | |||||
for language in programming_languages: | |||||
intrinsic_metadata_fields.append( | |||||
{ | |||||
"match": { | |||||
( | |||||
"intrinsic_metadata" | |||||
".http://schema.org/programmingLanguage" | |||||
".@value" | |||||
): language | |||||
} | |||||
} | |||||
) | |||||
vlorentzUnsubmitted Done Inline Actionsno need for if here vlorentz: no need for `if` here | |||||
query_clauses.append( | |||||
Not Done Inline ActionsCan you deduplicate these two? vlorentz: Can you deduplicate these two? | |||||
{ | |||||
"nested": { | |||||
"path": "intrinsic_metadata", | |||||
"query": {"bool": {"should": intrinsic_metadata_fields,}}, | |||||
} | |||||
} | |||||
) | |||||
vlorentzUnsubmitted Not Done Inline ActionsIs this an OR or an AND operator used here? (please write it in a comment for readers unfamiliar with ES) vlorentz: Is this an OR or an AND operator used here? (please write it in a comment for readers… | |||||
if visit_types is not None: | if visit_types is not None: | ||||
query_clauses.append({"terms": {"visit_types": visit_types}}) | query_clauses.append({"terms": {"visit_types": visit_types}}) | ||||
sorting_params = [] | sorting_params = [] | ||||
for field in sort_by: | for field in sort_by: | ||||
order = "asc" | order = "asc" | ||||
Not Done Inline Actionsperfect! vlorentz: perfect! | |||||
if field and field[0] == "-": | if field and field[0] == "-": | ||||
field = field[1:] | field = field[1:] | ||||
order = "desc" | order = "desc" | ||||
if field in SORT_BY_OPTIONS: | if field in SORT_BY_OPTIONS: | ||||
sorting_params.append({field: order}) | sorting_params.append({field: order}) | ||||
sorting_params.extend( | sorting_params.extend( | ||||
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines |
I'll add the tests for license and programming_language based on origin_search in the next draft.