diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -8,14 +8,14 @@
 import logging
 import pprint
 from textwrap import dedent
-from typing import Any, Dict, Iterable, Iterator, List, Optional
+from typing import Any, Dict, Iterable, List, Optional
 
 from elasticsearch import Elasticsearch, helpers
 import msgpack
 
 from swh.indexer import codemeta
 from swh.model import model
-from swh.model.identifiers import origin_identifier
+from swh.model.hashutil import hash_to_hex
 from swh.search.interface import (
     SORT_BY_OPTIONS,
     MinimalOriginDict,
@@ -130,12 +130,12 @@
         if not self._backend.indices.exists(index=self._get_origin_index()):
             self._backend.indices.create(index=self._get_origin_index())
 
-        if not self._backend.indices.exists_alias(self._get_origin_read_alias()):
+        if not self._backend.indices.exists_alias(name=self._get_origin_read_alias()):
             self._backend.indices.put_alias(
                 index=self._get_origin_index(), name=self._get_origin_read_alias()
             )
 
-        if not self._backend.indices.exists_alias(self._get_origin_write_alias()):
+        if not self._backend.indices.exists_alias(name=self._get_origin_write_alias()):
             self._backend.indices.put_alias(
                 index=self._get_origin_index(), name=self._get_origin_write_alias()
             )
@@ -221,7 +221,8 @@
         write_index = self._get_origin_write_alias()
         documents = map(_sanitize_origin, documents)
         documents_with_sha1 = (
-            (origin_identifier(document), document) for document in documents
+            (hash_to_hex(model.Origin(url=document["url"]).id), document)
+            for document in documents
         )
         # painless script that will be executed when updating an origin document
         update_script = dedent(
@@ -341,13 +342,6 @@
                 "document:index_error", count=len(errors), method_name="origin_update"
             )
 
-    def origin_dump(self) -> Iterator[model.Origin]:
-        results = helpers.scan(self._backend, index=self._get_origin_read_alias())
-        for hit in results:
-            yield self._backend.termvectors(
-                index=self._get_origin_read_alias(), id=hit["_id"], fields=["*"]
-            )
-
     @timed
     def origin_search(
         self,
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -10,7 +10,8 @@
 from typing import Any, Dict, Iterable, Iterator, List, Optional
 
 from swh.indexer import codemeta
-from swh.model.identifiers import origin_identifier
+from swh.model import model
+from swh.model.hashutil import hash_to_hex
 from swh.search.interface import (
     SORT_BY_OPTIONS,
     MinimalOriginDict,
@@ -171,7 +172,7 @@
     def origin_update(self, documents: Iterable[OriginDict]) -> None:
         for source_document in documents:
             document: Dict[str, Any] = dict(source_document)
-            id_ = origin_identifier(document)
+            id_ = hash_to_hex(model.Origin(url=document["url"]).id)
             if "url" in document:
                 document["_url_tokens"] = set(
                     self._url_splitter.split(source_document["url"])
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -9,6 +9,7 @@
 
 import pytest
 
+from swh.model.hashutil import hash_to_bytes
 from swh.model.model import (
     ObjectType,
     Person,
@@ -20,7 +21,6 @@
     TargetType,
     Timestamp,
     TimestampWithTimezone,
-    hash_to_bytes,
 )
 from swh.search.journal_client import (
     fetch_last_revision_release_date,
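
Note on the id computation swapped in above: the patch relies on hash_to_hex(model.Origin(url=...).id) producing the same hex sha1 that the removed swh.model.identifiers.origin_identifier helper returned, so existing document ids keep matching. A minimal sketch of that assumption follows; it is not part of the patch, and the URL is made up for illustration.

# Sketch only: how the new document id is derived from an origin URL.
from swh.model import model
from swh.model.hashutil import hash_to_hex

url = "https://example.org/user/project.git"  # hypothetical origin URL
origin = model.Origin(url=url)
# Origin.id is the intrinsic sha1 identifier of the origin, derived from its
# URL; hash_to_hex turns those raw bytes into the 40-character hex string
# used as the Elasticsearch / in-memory document id.
doc_id = hash_to_hex(origin.id)
print(doc_id)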