Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import base64 | import base64 | ||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | from typing import Any, Dict, Iterable, Iterator, List, Optional | ||||
from elasticsearch import Elasticsearch | from elasticsearch import Elasticsearch | ||||
from elasticsearch.helpers import bulk, scan | from elasticsearch.helpers import bulk, scan | ||||
import msgpack | import msgpack | ||||
from swh.indexer import codemeta | |||||
from swh.model import model | from swh.model import model | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import PagedResult | from swh.search.interface import PagedResult | ||||
def _sanitize_origin(origin): | def _sanitize_origin(origin): | ||||
ardumont: What about:
```
from swh.indexer.codemeta import expand as normalize_intrinsic_metadata
```
?
origin = origin.copy() | origin = origin.copy() | ||||
res = {"url": origin.pop("url")} | res = {"url": origin.pop("url")} | ||||
for field_name in ("intrinsic_metadata", "has_visits"): | for field_name in ("intrinsic_metadata", "has_visits"): | ||||
if field_name in origin: | if field_name in origin: | ||||
res[field_name] = origin.pop(field_name) | res[field_name] = origin.pop(field_name) | ||||
if "intrinsic_metadata" in res: | |||||
res["intrinsic_metadata"] = codemeta.expand(res["intrinsic_metadata"]) | |||||
return res | return res | ||||
Not Done · Inline Actions
ardumont: I kinda found the following nicer to the eyes:
```
intrinsic_metadata = res.get("intrinsic_metadata")
if intrinsic_metadata:
    res["intrinsic_metadata"] = normalize_intrinsic_metadata(intrinsic_metadata)
```
What do you think?
def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str: | def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str: | ||||
"""Tokenize as string an index page result from a search | """Tokenize as string an index page result from a search | ||||
""" | """ | ||||
page_token = base64.b64encode(msgpack.dumps(index_to_tokenize)) | page_token = base64.b64encode(msgpack.dumps(index_to_tokenize)) | ||||
return page_token.decode() | return page_token.decode() | ||||
▲ Show 20 Lines • Show All 163 Lines • Show Last 20 Lines |
What about:
```
from swh.indexer.codemeta import expand as normalize_intrinsic_metadata
```
?