Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
Show All 16 Lines | |||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import ( | from swh.search.interface import ( | ||||
SORT_BY_OPTIONS, | SORT_BY_OPTIONS, | ||||
MinimalOriginDict, | MinimalOriginDict, | ||||
OriginDict, | OriginDict, | ||||
PagedResult, | PagedResult, | ||||
) | ) | ||||
from swh.search.metrics import send_metric, timed | from swh.search.metrics import send_metric, timed | ||||
from swh.search.utils import get_expansion, is_date_parsable | from swh.search.translator import Translator | ||||
from swh.search.utils import get_expansion, is_date_parsable, to_raw | |||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
INDEX_NAME_PARAM = "index" | INDEX_NAME_PARAM = "index" | ||||
READ_ALIAS_PARAM = "read_alias" | READ_ALIAS_PARAM = "read_alias" | ||||
WRITE_ALIAS_PARAM = "write_alias" | WRITE_ALIAS_PARAM = "write_alias" | ||||
ORIGIN_DEFAULT_CONFIG = { | ORIGIN_DEFAULT_CONFIG = { | ||||
▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines | |||||
def token_decode(page_token: str) -> Dict[bytes, Any]: | def token_decode(page_token: str) -> Dict[bytes, Any]: | ||||
"""Read the page_token""" | """Read the page_token""" | ||||
return msgpack.loads(base64.b64decode(page_token.encode()), raw=True) | return msgpack.loads(base64.b64decode(page_token.encode()), raw=True) | ||||
class ElasticSearch: | class ElasticSearch: | ||||
def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}): | def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}): | ||||
self._backend = Elasticsearch(hosts=hosts) | self._backend = Elasticsearch(hosts=hosts) | ||||
self._translator = Translator() | |||||
# Merge current configuration with default values | # Merge current configuration with default values | ||||
origin_config = indexes.get("origin", {}) | origin_config = indexes.get("origin", {}) | ||||
self.origin_config = {**ORIGIN_DEFAULT_CONFIG, **origin_config} | self.origin_config = {**ORIGIN_DEFAULT_CONFIG, **origin_config} | ||||
def _get_origin_index(self) -> str: | def _get_origin_index(self) -> str: | ||||
return self.origin_config[INDEX_NAME_PARAM] | return self.origin_config[INDEX_NAME_PARAM] | ||||
▲ Show 20 Lines • Show All 233 Lines • ▼ Show 20 Lines | def origin_dump(self) -> Iterator[model.Origin]: | ||||
yield self._backend.termvectors( | yield self._backend.termvectors( | ||||
index=self._get_origin_read_alias(), id=hit["_id"], fields=["*"] | index=self._get_origin_read_alias(), id=hit["_id"], fields=["*"] | ||||
) | ) | ||||
@timed | @timed | ||||
def origin_search( | def origin_search( | ||||
self, | self, | ||||
*, | *, | ||||
query: str = "", | |||||
url_pattern: Optional[str] = None, | url_pattern: Optional[str] = None, | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
min_nb_visits: int = 0, | min_nb_visits: int = 0, | ||||
min_last_visit_date: str = "", | min_last_visit_date: str = "", | ||||
min_last_eventful_visit_date: str = "", | min_last_eventful_visit_date: str = "", | ||||
min_last_revision_date: str = "", | min_last_revision_date: str = "", | ||||
min_last_release_date: str = "", | min_last_release_date: str = "", | ||||
min_date_created: str = "", | min_date_created: str = "", | ||||
min_date_modified: str = "", | min_date_modified: str = "", | ||||
min_date_published: str = "", | min_date_published: str = "", | ||||
programming_languages: Optional[List[str]] = None, | programming_languages: Optional[List[str]] = None, | ||||
licenses: Optional[List[str]] = None, | licenses: Optional[List[str]] = None, | ||||
keywords: Optional[List[str]] = None, | keywords: Optional[List[str]] = None, | ||||
sort_by: Optional[List[str]] = None, | sort_by: Optional[List[str]] = None, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
query_clauses: List[Dict[str, Any]] = [] | query_clauses: List[Dict[str, Any]] = [] | ||||
query_filters = [] | |||||
if url_pattern: | if url_pattern: | ||||
query_clauses.append( | query_filters.append(f"origin = {to_raw(url_pattern)}") | ||||
{ | |||||
"multi_match": { | |||||
"query": url_pattern, | |||||
"type": "bool_prefix", | |||||
"operator": "and", | |||||
"fields": [ | |||||
"url.as_you_type", | |||||
"url.as_you_type._2gram", | |||||
"url.as_you_type._3gram", | |||||
], | |||||
} | |||||
} | |||||
) | |||||
if metadata_pattern: | if metadata_pattern: | ||||
query_clauses.append( | query_filters.append(f"metadata = {to_raw(metadata_pattern)}") | ||||
{ | |||||
"nested": { | |||||
"path": "intrinsic_metadata", | |||||
"query": { | |||||
"multi_match": { | |||||
"query": metadata_pattern, | |||||
# Makes it so that the "foo bar" query returns | |||||
# documents which contain "foo" in a field and "bar" | |||||
# in a different field | |||||
"type": "cross_fields", | |||||
# All keywords must be found in a document for it to | |||||
# be considered a match. | |||||
# TODO: allow missing keywords? | |||||
"operator": "and", | |||||
# Searches on all fields of the intrinsic_metadata dict, | |||||
# recursively. | |||||
"fields": ["intrinsic_metadata.*"], | |||||
# date{Created,Modified,Published} are of type date | |||||
"lenient": True, | |||||
} | |||||
}, | |||||
} | |||||
} | |||||
) | |||||
if not query_clauses: | # if not query_clauses: | ||||
raise ValueError( | # raise ValueError( | ||||
"At least one of url_pattern and metadata_pattern must be provided." | # "At least one of url_pattern and metadata_pattern must be provided." | ||||
) | # ) | ||||
if with_visit: | if with_visit: | ||||
query_clauses.append({"term": {"has_visits": True,}}) | query_filters.append(f"visited = {'true' if with_visit else 'false'}") | ||||
if min_nb_visits: | if min_nb_visits: | ||||
query_clauses.append({"range": {"nb_visits": {"gte": min_nb_visits,},}}) | query_filters.append(f"visits >= {min_nb_visits}") | ||||
if min_last_visit_date: | if min_last_visit_date: | ||||
query_clauses.append( | query_filters.append( | ||||
{ | f"last_visit >= {min_last_visit_date.replace('Z', '+00:00')}" | ||||
"range": { | |||||
"last_visit_date": { | |||||
"gte": min_last_visit_date.replace("Z", "+00:00"), | |||||
} | |||||
} | |||||
} | |||||
) | ) | ||||
if min_last_eventful_visit_date: | if min_last_eventful_visit_date: | ||||
query_clauses.append( | query_filters.append( | ||||
{ | "last_eventful_visit >= " | ||||
"range": { | f"{min_last_eventful_visit_date.replace('Z', '+00:00')}" | ||||
"last_eventful_visit_date": { | |||||
"gte": min_last_eventful_visit_date.replace("Z", "+00:00"), | |||||
} | |||||
} | |||||
} | |||||
) | ) | ||||
if min_last_revision_date: | if min_last_revision_date: | ||||
query_clauses.append( | query_filters.append( | ||||
{ | f"last_revision >= {min_last_revision_date.replace('Z', '+00:00')}" | ||||
"range": { | |||||
"last_revision_date": { | |||||
"gte": min_last_revision_date.replace("Z", "+00:00"), | |||||
} | |||||
} | |||||
} | |||||
) | ) | ||||
if min_last_release_date: | if min_last_release_date: | ||||
query_clauses.append( | query_filters.append( | ||||
{ | f"last_release >= {min_last_release_date.replace('Z', '+00:00')}" | ||||
"range": { | |||||
"last_release_date": { | |||||
"gte": min_last_release_date.replace("Z", "+00:00"), | |||||
} | |||||
} | |||||
} | |||||
) | ) | ||||
if keywords: | if keywords: | ||||
query_clauses.append( | query_filters.append(f"keyword in {to_raw(keywords)}") | ||||
{ | |||||
"nested": { | |||||
"path": "intrinsic_metadata", | |||||
"query": { | |||||
"multi_match": { | |||||
"query": " ".join(keywords), | |||||
"fields": [ | |||||
get_expansion("keywords", ".") + "^2", | |||||
get_expansion("descriptions", "."), | |||||
# "^2" boosts an origin's score by 2x | |||||
# if it the queried keywords are | |||||
# found in its intrinsic_metadata.keywords | |||||
], | |||||
} | |||||
}, | |||||
} | |||||
} | |||||
) | |||||
intrinsic_metadata_filters: List[Dict[str, Dict]] = [] | |||||
if licenses: | if licenses: | ||||
license_filters: List[Dict[str, Any]] = [] | query_filters.append(f"license in {to_raw(licenses)}") | ||||
for license in licenses: | |||||
license_filters.append( | |||||
{"match": {get_expansion("licenses", "."): license}} | |||||
) | |||||
intrinsic_metadata_filters.append({"bool": {"should": license_filters}}) | |||||
if programming_languages: | if programming_languages: | ||||
language_filters: List[Dict[str, Any]] = [] | query_filters.append(f"language in {to_raw(programming_languages)}") | ||||
for language in programming_languages: | |||||
language_filters.append( | |||||
{"match": {get_expansion("programming_languages", "."): language}} | |||||
) | |||||
intrinsic_metadata_filters.append({"bool": {"should": language_filters}}) | |||||
if min_date_created: | if min_date_created: | ||||
intrinsic_metadata_filters.append( | query_filters.append( | ||||
{ | f"created >= {min_date_created.replace('Z', '+00:00')}" | ||||
"range": { | |||||
get_expansion("date_created", "."): {"gte": min_date_created,} | |||||
} | |||||
} | |||||
) | ) | ||||
if min_date_modified: | if min_date_modified: | ||||
intrinsic_metadata_filters.append( | query_filters.append( | ||||
{ | f"modified >= {min_date_modified.replace('Z', '+00:00')}" | ||||
"range": { | |||||
get_expansion("date_modified", "."): {"gte": min_date_modified,} | |||||
} | |||||
} | |||||
) | ) | ||||
if min_date_published: | if min_date_published: | ||||
intrinsic_metadata_filters.append( | query_filters.append( | ||||
{ | f"published >= {min_date_published.replace('Z', '+00:00')}" | ||||
"range": { | |||||
get_expansion("date_published", "."): { | |||||
"gte": min_date_published, | |||||
} | |||||
} | |||||
} | |||||
) | |||||
if intrinsic_metadata_filters: | |||||
query_clauses.append( | |||||
{ | |||||
"nested": { | |||||
"path": "intrinsic_metadata", | |||||
"query": {"bool": {"must": intrinsic_metadata_filters,}}, | |||||
# "must" is equivalent to "AND" | |||||
# "should" is equivalent to "OR" | |||||
# Resulting origins must return true for the following: | |||||
# (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..) | |||||
# This is equivalent to {"must": [ | |||||
# {"should": [license_1,license_2] }, | |||||
# {"should": [lang_1,lang_2]}] } | |||||
# ]} | |||||
# Note: Usage of "bool" has been omitted for readability | |||||
} | |||||
} | |||||
) | ) | ||||
if visit_types is not None: | if visit_types is not None: | ||||
query_clauses.append({"terms": {"visit_types": visit_types}}) | query_filters.append(f"visit_type = {to_raw(visit_types)}") | ||||
combined_filters = f"({' and '.join(query_filters)})" | |||||
query = f"{combined_filters}{' and ' if query != '' else ' '}{query}" | |||||
parsed_query = self._translator.parse_query(query) | |||||
query_clauses.append(parsed_query["filters"]) | |||||
field_map = { | |||||
"visits": "nb_visits", | |||||
"last_visit": "last_visit_date", | |||||
"last_eventful_visit": "last_eventful_visit_date", | |||||
"last_revision": "last_revision_date", | |||||
"last_release": "last_release_date", | |||||
"created": "date_created", | |||||
"modified": "date_modified", | |||||
"published": "date_published", | |||||
} | |||||
if "sort_by" in parsed_query: | |||||
if sort_by is None: | |||||
sort_by = [] | |||||
for sort_by_option in parsed_query["sort_by"]: | |||||
if sort_by_option[0] == "-": | |||||
sort_by.append("-" + field_map[sort_by_option[1:]]) | |||||
else: | |||||
sort_by.append(field_map[sort_by_option]) | |||||
if parsed_query.get("limit", 0): | |||||
vlorentz: It looks like this branch (overriding `limit` with the value from the parsed query) isn't covered by tests.
limit = parsed_query["limit"] | |||||
sorting_params: List[Dict[str, Any]] = [] | sorting_params: List[Dict[str, Any]] = [] | ||||
if sort_by: | if sort_by: | ||||
for field in sort_by: | for field in sort_by: | ||||
order = "asc" | order = "asc" | ||||
if field and field[0] == "-": | if field and field[0] == "-": | ||||
field = field[1:] | field = field[1:] | ||||
▲ Show 20 Lines • Show All 64 Lines • Show Last 20 Lines |
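It looks like the `if parsed_query.get("limit", 0):` branch in `origin_search` (which overrides `limit` with the value from the parsed query) isn't covered by tests.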