Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
Show All 14 Lines | |||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
def _sanitize_origin(origin): | def _sanitize_origin(origin): | ||||
origin = origin.copy() | origin = origin.copy() | ||||
res = { | res = { | ||||
'url': origin.pop('url') | 'url': origin.pop('url') | ||||
} | } | ||||
for field_name in ('intrinsic_metadata',): | for field_name in ('intrinsic_metadata', 'has_visits'): | ||||
if field_name in origin: | if field_name in origin: | ||||
res[field_name] = origin.pop(field_name) | res[field_name] = origin.pop(field_name) | ||||
return res | return res | ||||
class ElasticSearch: | class ElasticSearch: | ||||
def __init__(self, hosts: List[str]): | def __init__(self, hosts: List[str]): | ||||
self._backend = Elasticsearch(hosts=hosts) | self._backend = Elasticsearch(hosts=hosts) | ||||
Show All 26 Lines | def initialize(self) -> None: | ||||
'analyzer': 'simple', | 'analyzer': 'simple', | ||||
'fields': { | 'fields': { | ||||
'as_you_type': { | 'as_you_type': { | ||||
'type': 'search_as_you_type', | 'type': 'search_as_you_type', | ||||
'analyzer': 'simple', | 'analyzer': 'simple', | ||||
} | } | ||||
} | } | ||||
}, | }, | ||||
'has_visits': { | |||||
'type': 'boolean', | |||||
'store': True, | |||||
}, | |||||
'intrinsic_metadata': { | 'intrinsic_metadata': { | ||||
'type': 'nested', | 'type': 'nested', | ||||
'properties': { | 'properties': { | ||||
'@context': { | '@context': { | ||||
# don't bother indexing tokens | # don't bother indexing tokens | ||||
'type': 'keyword', | 'type': 'keyword', | ||||
} | } | ||||
}, | }, | ||||
Show All 28 Lines | def origin_dump(self) -> Iterator[model.Origin]: | ||||
yield self._backend.termvectors( | yield self._backend.termvectors( | ||||
index='origin', id=hit['_id'], | index='origin', id=hit['_id'], | ||||
fields=['*']) | fields=['*']) | ||||
@remote_api_endpoint('origin/search') | @remote_api_endpoint('origin/search') | ||||
def origin_search( | def origin_search( | ||||
self, *, | self, *, | ||||
url_pattern: str = None, metadata_pattern: str = None, | url_pattern: str = None, metadata_pattern: str = None, | ||||
with_visit: bool = False, | |||||
scroll_token: str = None, count: int = 50 | scroll_token: str = None, count: int = 50 | ||||
) -> Dict[str, object]: | ) -> Dict[str, object]: | ||||
"""Searches for origins matching the `url_pattern`. | """Searches for origins matching the `url_pattern`. | ||||
Args: | Args: | ||||
url_pattern (str): Part of the URL to search for | url_pattern (str): Part of the URL to search for | ||||
scroll_token (str): `scroll_token` is an opaque value used for | with_visit (bool): Whether origins with no visit are to be | ||||
pagination. | filtered out | ||||
scroll_token (str): Opaque value used for pagination. | |||||
count (int): number of results to return. | count (int): number of results to return. | ||||
Returns: | Returns: | ||||
a dictionary with keys: | a dictionary with keys: | ||||
* `scroll_token`: | * `scroll_token`: | ||||
opaque value used for fetching more results. `None` if there | opaque value used for fetching more results. `None` if there | ||||
are no more results. | are no more results. | ||||
* `results`: | * `results`: | ||||
Show All 28 Lines | def origin_search( | ||||
} | } | ||||
}) | }) | ||||
if not query_clauses: | if not query_clauses: | ||||
raise ValueError( | raise ValueError( | ||||
'At least one of url_pattern and metadata_pattern ' | 'At least one of url_pattern and metadata_pattern ' | ||||
'must be provided.') | 'must be provided.') | ||||
if with_visit: | |||||
query_clauses.append({ | |||||
'term': { | |||||
'has_visits': True, | |||||
} | |||||
}) | |||||
body = { | body = { | ||||
'query': { | 'query': { | ||||
'bool': { | 'bool': { | ||||
'should': query_clauses, # TODO: must? | 'must': query_clauses, | ||||
} | } | ||||
}, | }, | ||||
'size': count, | 'size': count, | ||||
'sort': [ | 'sort': [ | ||||
{'_score': 'desc'}, | {'_score': 'desc'}, | ||||
{'_id': 'asc'}, | {'_id': 'asc'}, | ||||
] | ] | ||||
} | } | ||||
Show All 35 Lines |