Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import base64 | import base64 | ||||
from collections import Counter | |||||
import logging | import logging | ||||
import pprint | import pprint | ||||
from textwrap import dedent | from textwrap import dedent | ||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | from typing import Any, Dict, Iterable, Iterator, List, Optional | ||||
from elasticsearch import Elasticsearch, helpers | from elasticsearch import Elasticsearch, helpers | ||||
import msgpack | import msgpack | ||||
▲ Show 20 Lines • Show All 507 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
next_page_token = token_encode(next_page_token_content) | next_page_token = token_encode(next_page_token_content) | ||||
assert len(hits) <= limit | assert len(hits) <= limit | ||||
return PagedResult( | return PagedResult( | ||||
results=[{"url": hit["_source"]["url"]} for hit in hits], | results=[{"url": hit["_source"]["url"]} for hit in hits], | ||||
next_page_token=next_page_token, | next_page_token=next_page_token, | ||||
) | ) | ||||
def visit_types_count(self) -> Counter:
    """Count the non-blocklisted origin documents for each visit type.

    Issues a single search on the origin read alias whose body holds a
    ``terms`` aggregation on the ``visit_types`` field, nested inside a
    ``filter`` aggregation that rejects documents whose ``blocklisted``
    field is true.

    Returns:
        A ``Counter`` mapping each aggregation bucket key to that bucket's
        ``doc_count``. The counter is empty when the response lacks the
        expected aggregation path.
    """
    # Only documents that are not flagged as blocklisted are counted.
    not_blocklisted = {"bool": {"must_not": [{"term": {"blocklisted": True}}]}}
    body = {
        "aggs": {
            "not_blocklisted": {
                "filter": not_blocklisted,
                "aggs": {
                    # The terms aggregation is asked for up to 1000 buckets.
                    "visit_types": {"terms": {"field": "visit_types", "size": 1000}}
                },
            }
        }
    }

    # size=0: only the aggregation result is read below, never the hits.
    res = self._backend.search(
        index=self._get_origin_read_alias(), body=body, size=0
    )

    # Every level falls back to an empty container so that a response
    # without the aggregation yields an empty Counter instead of a KeyError.
    buckets = (
        res.get("aggregations", {})
        .get("not_blocklisted", {})
        .get("visit_types", {})
        .get("buckets", [])
    )
    return Counter({bucket["key"]: bucket["doc_count"] for bucket in buckets})