Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
# Copyright (C) 2019-2022 The Software Heritage developers | # Copyright (C) 2019-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import base64 | import base64 | ||||
from collections import Counter | from collections import Counter | ||||
import logging | import logging | ||||
import pprint | import pprint | ||||
from textwrap import dedent | from textwrap import dedent | ||||
from typing import Any, Dict, Iterable, List, Optional, cast | from typing import Any, Dict, Iterable, List, Optional, cast | ||||
from elasticsearch import Elasticsearch, helpers | from elasticsearch import Elasticsearch, NotFoundError, helpers | ||||
import msgpack | import msgpack | ||||
from swh.indexer import codemeta | from swh.indexer import codemeta | ||||
from swh.model import model | from swh.model import model | ||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | ||||
from swh.search.interface import ( | from swh.search.interface import ( | ||||
SORT_BY_OPTIONS, | SORT_BY_OPTIONS, | ||||
MinimalOriginDict, | MinimalOriginDict, | ||||
▲ Show 20 Lines • Show All 359 Lines • ▼ Show 20 Lines | def origin_update(self, documents: Iterable[OriginDict]) -> None: | ||||
indexed_count, errors = helpers.bulk(self._backend, actions, index=write_index) | indexed_count, errors = helpers.bulk(self._backend, actions, index=write_index) | ||||
assert isinstance(errors, List) # Make mypy happy | assert isinstance(errors, List) # Make mypy happy | ||||
send_metric("document:index", count=indexed_count, method_name="origin_update") | send_metric("document:index", count=indexed_count, method_name="origin_update") | ||||
send_metric( | send_metric( | ||||
"document:index_error", count=len(errors), method_name="origin_update" | "document:index_error", count=len(errors), method_name="origin_update" | ||||
) | ) | ||||
olasd: I believe there's more than `str`s in the values of these returned dicts (e.g. I think… | |||||
def origin_get(self, url: str) -> Optional[Dict[str, Any]]:
    """Fetch the indexed document for the origin with the given URL.

    The document id is the hex form of ``model.Origin(url=url).id``; the
    lookup goes through the origin read alias.

    Args:
        url: URL of the origin to look up.

    Returns:
        The ``_source`` of the matching document, or ``None`` when the
        backend raises ``NotFoundError`` for that id.
    """
    doc_id = hash_to_hex(model.Origin(url=url).id)
    try:
        hit = self._backend.get(index=self._get_origin_read_alias(), id=doc_id)
    except NotFoundError:
        # A missing document is an expected outcome, not an error.
        return None
    return hit["_source"]
@timed | @timed | ||||
def origin_search( | def origin_search( | ||||
self, | self, | ||||
*, | *, | ||||
query: str = "", | query: str = "", | ||||
url_pattern: Optional[str] = None, | url_pattern: Optional[str] = None, | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
▲ Show 20 Lines • Show All 203 Lines • Show Last 20 Lines |
I believe there's more than `str`s in the values of these returned dicts (e.g. I think `visit_types` is a list).