Page MenuHomeSoftware Heritage

D2389.diff
No OneTemporary

D2389.diff

diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -46,14 +46,12 @@
index='origin',
body={
'properties': {
+ 'sha1': {
+ 'type': 'keyword',
+ 'doc_values': True,
+ },
'url': {
'type': 'text',
- # TODO: consider removing fielddata when
- # swh-storage allows querying by hash, so the
- # full URL does not have to be stored in ES'
- # memory. See:
- # https://www.elastic.co/guide/en/elasticsearch/reference/current/fielddata.html#before-enabling-fielddata
- 'fielddata': True,
# To split URLs into token on any character
# that is not alphanumerical
'analyzer': 'simple',
@@ -83,15 +81,20 @@
@remote_api_endpoint('origin/update')
def origin_update(self, documents: Iterable[dict]) -> None:
documents = map(_sanitize_origin, documents)
+ documents_with_sha1 = ((origin_identifier(document), document)
+ for document in documents)
actions = [
{
'_op_type': 'update',
- '_id': origin_identifier(document),
+ '_id': sha1,
'_index': 'origin',
- 'doc': document,
+ 'doc': {
+ **document,
+ 'sha1': sha1,
+ },
'doc_as_upsert': True,
}
- for document in documents
+ for (sha1, document) in documents_with_sha1
]
# TODO: make refresh='wait_for' configurable (we don't need it
# in production, it will probably be a performance issue)
@@ -180,7 +183,7 @@
'size': count,
'sort': [
{'_score': 'desc'},
- {'_id': 'asc'},
+ {'sha1': 'asc'},
]
}
if page_token:
@@ -189,7 +192,7 @@
base64.b64decode(page_token))
body['search_after'] = \
[page_token_content[b'score'],
- page_token_content[b'id'].decode('ascii')]
+ page_token_content[b'sha1'].decode('ascii')]
res = self._backend.search(
index='origin',
@@ -203,7 +206,7 @@
last_hit = hits[-1]
next_page_token_content = {
b'score': last_hit['_score'],
- b'id': last_hit['_id'],
+ b'sha1': last_hit['_source']['sha1'],
}
next_page_token = base64.b64encode(msgpack.dumps(
next_page_token_content)) # type: Optional[bytes]

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 4:59 PM (18 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216212

Event Timeline