Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
Show All 30 Lines | def _sanitize_origin(origin): | ||||
# Whitelist fields to be saved in Elasticsearch | # Whitelist fields to be saved in Elasticsearch | ||||
res = {"url": origin.pop("url")} | res = {"url": origin.pop("url")} | ||||
for field_name in ( | for field_name in ( | ||||
"blocklisted", | "blocklisted", | ||||
"has_visits", | "has_visits", | ||||
"intrinsic_metadata", | "intrinsic_metadata", | ||||
"visit_types", | "visit_types", | ||||
"nb_visit", | |||||
"last_visit_date", | |||||
): | ): | ||||
if field_name in origin: | if field_name in origin: | ||||
res[field_name] = origin.pop(field_name) | res[field_name] = origin.pop(field_name) | ||||
# Run the JSON-LD expansion algorithm | # Run the JSON-LD expansion algorithm | ||||
# <https://www.w3.org/TR/json-ld-api/#expansion> | # <https://www.w3.org/TR/json-ld-api/#expansion> | ||||
# to normalize the Codemeta metadata. | # to normalize the Codemeta metadata. | ||||
# This is required as Elasticsearch needs each field to have a consistent | # This is required as Elasticsearch needs each field to have a consistent | ||||
▲ Show 20 Lines • Show All 119 Lines • ▼ Show 20 Lines | def origin_update(self, documents: Iterable[OriginDict]) -> None: | ||||
documents = map(_sanitize_origin, documents) | documents = map(_sanitize_origin, documents) | ||||
documents_with_sha1 = ( | documents_with_sha1 = ( | ||||
(origin_identifier(document), document) for document in documents | (origin_identifier(document), document) for document in documents | ||||
) | ) | ||||
# painless script that will be executed when updating an origin document | # painless script that will be executed when updating an origin document | ||||
update_script = """ | update_script = """ | ||||
// backup current visit_types field value | // backup current visit_types field value | ||||
List visit_types = ctx._source.getOrDefault("visit_types", []); | List visit_types = ctx._source.getOrDefault("visit_types", []); | ||||
int nb_visit = ctx._source.getOrDefault("nb_visit", 0); | |||||
ZonedDateTime last_visit_date = ZonedDateTime.parse( | |||||
ctx._source.getOrDefault("last_visit_date", "")); | |||||
// update origin document with new field values | // update origin document with new field values | ||||
ctx._source.putAll(params); | ctx._source.putAll(params); | ||||
// restore previous visit types after visit_types field overriding | // restore previous visit types after visit_types field overriding | ||||
if (ctx._source.containsKey("visit_types")) { | if (ctx._source.containsKey("visit_types")) { | ||||
for (int i = 0; i < visit_types.length; ++i) { | for (int i = 0; i < visit_types.length; ++i) { | ||||
if (!ctx._source.visit_types.contains(visit_types[i])) { | if (!ctx._source.visit_types.contains(visit_types[i])) { | ||||
ctx._source.visit_types.add(visit_types[i]); | ctx._source.visit_types.add(visit_types[i]); | ||||
} | } | ||||
} | } | ||||
} | } | ||||
// Undo overwrite if incoming nb_visit is smaller | |||||
if (ctx._source.containsKey("nb_visit")) { | |||||
int incoming_nb_visit = ctx._source.getOrDefault("nb_visit", ""); | |||||
if(incoming_nb_visit < nb_visit){ | |||||
ctx._source.nb_visit = nb_visit; | |||||
} | |||||
} | |||||
// Undo overwrite if incoming last_visit_date is older | |||||
if (ctx._source.containsKey("last_visit_date")) { | |||||
ZonedDateTime incoming_last_visit_date = ZonedDateTime.parse( | |||||
ctx._source.getOrDefault("last_visit_date", "")); | |||||
if(incoming_last_visit_date < last_visit_date){ | |||||
ctx._source.last_visit_date = last_visit_date; | |||||
} | |||||
} | |||||
""" | """ | ||||
actions = [ | actions = [ | ||||
{ | { | ||||
"_op_type": "update", | "_op_type": "update", | ||||
"_id": sha1, | "_id": sha1, | ||||
"_index": write_index, | "_index": write_index, | ||||
"scripted_upsert": True, | "scripted_upsert": True, | ||||
▲ Show 20 Lines • Show All 131 Lines • Show Last 20 Lines |