Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
Show First 20 Lines • Show All 105 Lines • ▼ Show 20 Lines | def origin_dump(self) -> Iterator[model.Origin]: | ||||
index='origin', id=hit['_id'], | index='origin', id=hit['_id'], | ||||
fields=['*']) | fields=['*']) | ||||
@remote_api_endpoint('origin/search') | @remote_api_endpoint('origin/search') | ||||
def origin_search( | def origin_search( | ||||
self, *, | self, *, | ||||
url_pattern: str = None, metadata_pattern: str = None, | url_pattern: str = None, metadata_pattern: str = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
scroll_token: str = None, count: int = 50 | page_token: str = None, count: int = 50 | ||||
) -> Dict[str, object]: | ) -> Dict[str, object]: | ||||
"""Searches for origins matching the `url_pattern`. | """Searches for origins matching the `url_pattern`. | ||||
Args: | Args: | ||||
url_pattern (str): Part of the URL to search for | url_pattern (str): Part of the URL to search for | ||||
with_visit (bool): Whether origins with no visit are to be | with_visit (bool): Whether origins with no visit are to be | ||||
filtered out | filtered out | ||||
scroll_token (str): Opaque value used for pagination. | page_token (str): Opaque value used for pagination. | ||||
count (int): number of results to return. | count (int): number of results to return. | ||||
Returns: | Returns: | ||||
a dictionary with keys: | a dictionary with keys: | ||||
* `scroll_token`: | * `next_page_token`: | ||||
opaque value used for fetching more results. `None` if there | opaque value used for fetching more results. `None` if there | ||||
are no more results. | are no more results. | ||||
* `results`: | * `results`: | ||||
list of dictionaries with key: | list of dictionaries with key: | ||||
* `url`: URL of a matching origin | * `url`: URL of a matching origin | ||||
""" | """ | ||||
query_clauses = [] # type: List[Dict[str, Any]] | query_clauses = [] # type: List[Dict[str, Any]] | ||||
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | def origin_search( | ||||
} | } | ||||
}, | }, | ||||
'size': count, | 'size': count, | ||||
'sort': [ | 'sort': [ | ||||
{'_score': 'desc'}, | {'_score': 'desc'}, | ||||
{'_id': 'asc'}, | {'_id': 'asc'}, | ||||
] | ] | ||||
} | } | ||||
if scroll_token: | if page_token: | ||||
# TODO: use ElasticSearch's scroll API? | # TODO: use ElasticSearch's scroll API? | ||||
scroll_token_content = msgpack.loads( | page_token_content = msgpack.loads( | ||||
base64.b64decode(scroll_token)) | base64.b64decode(page_token)) | ||||
body['search_after'] = \ | body['search_after'] = \ | ||||
[scroll_token_content[b'score'], | [page_token_content[b'score'], | ||||
scroll_token_content[b'id'].decode('ascii')] | page_token_content[b'id'].decode('ascii')] | ||||
res = self._backend.search( | res = self._backend.search( | ||||
index='origin', | index='origin', | ||||
body=body, | body=body, | ||||
size=count, | size=count, | ||||
) | ) | ||||
hits = res['hits']['hits'] | hits = res['hits']['hits'] | ||||
if len(hits) == count: | if len(hits) == count: | ||||
last_hit = hits[-1] | last_hit = hits[-1] | ||||
next_scroll_token_content = { | next_page_token_content = { | ||||
b'score': last_hit['_score'], | b'score': last_hit['_score'], | ||||
b'id': last_hit['_id'], | b'id': last_hit['_id'], | ||||
} | } | ||||
next_scroll_token = base64.b64encode(msgpack.dumps( | next_page_token = base64.b64encode(msgpack.dumps( | ||||
next_scroll_token_content)) # type: Optional[bytes] | next_page_token_content)) # type: Optional[bytes] | ||||
else: | else: | ||||
next_scroll_token = None | next_page_token = None | ||||
return { | return { | ||||
'scroll_token': next_scroll_token, | 'next_page_token': next_page_token, | ||||
'results': [ | 'results': [ | ||||
{ | { | ||||
# TODO: also add 'id'? | # TODO: also add 'id'? | ||||
'url': hit['_source']['url'], | 'url': hit['_source']['url'], | ||||
} | } | ||||
for hit in hits | for hit in hits | ||||
] | ] | ||||
} | } |