Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
Show First 20 Lines • Show All 101 Lines • ▼ Show 20 Lines | def origin_dump(self) -> Iterator[model.Origin]: | ||||
yield self._backend.termvectors( | yield self._backend.termvectors( | ||||
index='origin', id=hit['_id'], | index='origin', id=hit['_id'], | ||||
fields=['*']) | fields=['*']) | ||||
@remote_api_endpoint('origin/search') | @remote_api_endpoint('origin/search') | ||||
def origin_search( | def origin_search( | ||||
self, *, | self, *, | ||||
url_pattern: str = None, metadata_pattern: str = None, | url_pattern: str = None, metadata_pattern: str = None, | ||||
cursor: str = None, count: int = 50 | scroll_token: str = None, count: int = 50 | ||||
) -> Dict[str, object]: | ) -> Dict[str, object]: | ||||
"""Searches for origins matching the `url_pattern`. | """Searches for origins matching the `url_pattern`. | ||||
Args: | Args: | ||||
url_pattern (str): Part of the URL to search for | url_pattern (str): Part of the URL to search for | ||||
cursor (str): `cursor` is opaque value used for pagination. | scroll_token (str): `scroll_token` is opaque value used for | ||||
pagination. | |||||
olasd: This sentence is missing word. | |||||
count (int): number of results to return. | count (int): number of results to return. | ||||
Returns: | Returns: | ||||
a dictionary with keys: | a dictionary with keys: | ||||
* `cursor`: | * `scroll_token`: | ||||
opaque value used for fetching more results. `None` if there | opaque value used for fetching more results. `None` if there | ||||
are no more results. | are no more results. | ||||
* `results`: | * `results`: | ||||
list of dictionaries with key: | list of dictionaries with key: | ||||
* `url`: URL of a matching origin | * `url`: URL of a matching origin | ||||
""" | """ | ||||
# TODO: find a better name for "cursor" | # TODO: find a better name for "scroll_token" | ||||
query_clauses = [] | query_clauses = [] | ||||
if url_pattern: | if url_pattern: | ||||
query_clauses.append({ | query_clauses.append({ | ||||
'multi_match': { | 'multi_match': { | ||||
'query': url_pattern, | 'query': url_pattern, | ||||
'type': 'bool_prefix', | 'type': 'bool_prefix', | ||||
'fields': [ | 'fields': [ | ||||
Show All 29 Lines | def origin_search( | ||||
} | } | ||||
}, | }, | ||||
'size': count, | 'size': count, | ||||
'sort': [ | 'sort': [ | ||||
{'_score': 'desc'}, | {'_score': 'desc'}, | ||||
{'_id': 'asc'}, | {'_id': 'asc'}, | ||||
] | ] | ||||
} | } | ||||
if cursor: | if scroll_token: | ||||
# TODO: use ElasticSearch's scroll API? | # TODO: use ElasticSearch's scroll API? | ||||
cursor = msgpack.loads(base64.b64decode(cursor)) | scroll_token = msgpack.loads(base64.b64decode(scroll_token)) | ||||
body['search_after'] = \ | body['search_after'] = \ | ||||
[cursor[b'score'], cursor[b'id'].decode('ascii')] | [scroll_token[b'score'], scroll_token[b'id'].decode('ascii')] | ||||
res = self._backend.search( | res = self._backend.search( | ||||
index='origin', | index='origin', | ||||
body=body, | body=body, | ||||
size=count, | size=count, | ||||
) | ) | ||||
hits = res['hits']['hits'] | hits = res['hits']['hits'] | ||||
if len(hits) == count: | if len(hits) == count: | ||||
last_hit = hits[-1] | last_hit = hits[-1] | ||||
next_cursor = { | next_scroll_token = { | ||||
b'score': last_hit['_score'], | b'score': last_hit['_score'], | ||||
b'id': last_hit['_id'], | b'id': last_hit['_id'], | ||||
} | } | ||||
next_cursor = base64.b64encode(msgpack.dumps(next_cursor)) | next_scroll_token = base64.b64encode(msgpack.dumps( | ||||
next_scroll_token)) | |||||
else: | else: | ||||
next_cursor = None | next_scroll_token = None | ||||
return { | return { | ||||
'cursor': next_cursor, | 'scroll_token': next_scroll_token, | ||||
'results': [ | 'results': [ | ||||
{ | { | ||||
# TODO: also add 'id'? | # TODO: also add 'id'? | ||||
'url': hit['_source']['url'], | 'url': hit['_source']['url'], | ||||
} | } | ||||
for hit in hits | for hit in hits | ||||
] | ] | ||||
} | } |
This sentence is missing word.