Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
Show All 30 Lines | def _sanitize_origin(origin): | ||||
# Whitelist fields to be saved in Elasticsearch | # Whitelist fields to be saved in Elasticsearch | ||||
res = {"url": origin.pop("url")} | res = {"url": origin.pop("url")} | ||||
for field_name in ( | for field_name in ( | ||||
"blocklisted", | "blocklisted", | ||||
"has_visits", | "has_visits", | ||||
"intrinsic_metadata", | "intrinsic_metadata", | ||||
"visit_types", | "visit_types", | ||||
"nb_visit", | |||||
"last_visit_date", | |||||
): | ): | ||||
if field_name in origin: | if field_name in origin: | ||||
res[field_name] = origin.pop(field_name) | res[field_name] = origin.pop(field_name) | ||||
# Run the JSON-LD expansion algorithm | # Run the JSON-LD expansion algorithm | ||||
# <https://www.w3.org/TR/json-ld-api/#expansion> | # <https://www.w3.org/TR/json-ld-api/#expansion> | ||||
# to normalize the Codemeta metadata. | # to normalize the Codemeta metadata. | ||||
# This is required as Elasticsearch needs each field to have a consistent | # This is required as Elasticsearch needs each field to have a consistent | ||||
▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines | def initialize(self) -> None: | ||||
"type": "search_as_you_type", | "type": "search_as_you_type", | ||||
"analyzer": "simple", | "analyzer": "simple", | ||||
} | } | ||||
}, | }, | ||||
}, | }, | ||||
"visit_types": {"type": "keyword"}, | "visit_types": {"type": "keyword"}, | ||||
# used to filter out origins that were never visited | # used to filter out origins that were never visited | ||||
"has_visits": {"type": "boolean",}, | "has_visits": {"type": "boolean",}, | ||||
"nb_visit": {"type": "integer"}, | |||||
"last_visit_date": {"type": "date"}, | |||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"type": "nested", | "type": "nested", | ||||
"properties": { | "properties": { | ||||
"@context": { | "@context": { | ||||
# don't bother indexing tokens in these URIs, as they | # don't bother indexing tokens in these URIs, as they | ||||
# are used as namespaces | # are used as namespaces | ||||
"type": "keyword", | "type": "keyword", | ||||
} | } | ||||
Show All 15 Lines | def origin_update(self, documents: Iterable[OriginDict]) -> None: | ||||
documents = map(_sanitize_origin, documents) | documents = map(_sanitize_origin, documents) | ||||
documents_with_sha1 = ( | documents_with_sha1 = ( | ||||
(origin_identifier(document), document) for document in documents | (origin_identifier(document), document) for document in documents | ||||
) | ) | ||||
# painless script that will be executed when updating an origin document | # painless script that will be executed when updating an origin document | ||||
update_script = """ | update_script = """ | ||||
// backup current visit_types field value | // backup current visit_types field value | ||||
List visit_types = ctx._source.getOrDefault("visit_types", []); | List visit_types = ctx._source.getOrDefault("visit_types", []); | ||||
int nb_visit = ctx._source.getOrDefault("nb_visit", 0); | |||||
ZonedDateTime last_visit_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_visit_date", "0001-01-01T00:00:00Z")); | |||||
// update origin document with new field values | // update origin document with new field values | ||||
ctx._source.putAll(params); | ctx._source.putAll(params); | ||||
// restore previous visit types after visit_types field overriding | // restore previous visit types after visit_types field overriding | ||||
if (ctx._source.containsKey("visit_types")) { | if (ctx._source.containsKey("visit_types")) { | ||||
for (int i = 0; i < visit_types.length; ++i) { | for (int i = 0; i < visit_types.length; ++i) { | ||||
if (!ctx._source.visit_types.contains(visit_types[i])) { | if (!ctx._source.visit_types.contains(visit_types[i])) { | ||||
ctx._source.visit_types.add(visit_types[i]); | ctx._source.visit_types.add(visit_types[i]); | ||||
} | } | ||||
} | } | ||||
} | } | ||||
""" | |||||
// Undo overwrite if incoming nb_visit is smaller | |||||
if (ctx._source.containsKey("nb_visit")) { | |||||
int incoming_nb_visit = ctx._source.getOrDefault("nb_visit", 0); | |||||
if(incoming_nb_visit < nb_visit){ | |||||
ctx._source.nb_visit = nb_visit; | |||||
} | |||||
} | |||||
// Undo overwrite if incoming last_visit_date is older | |||||
if (ctx._source.containsKey("last_visit_date")) { | |||||
ZonedDateTime incoming_last_visit_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_visit_date", "0001-01-01T00:00:00Z")); | |||||
int difference = incoming_last_visit_date.compareTo(last_visit_date); // negative if earlier, zero if equal, positive if later | |||||
if(difference < 0){ | |||||
ctx._source.last_visit_date = last_visit_date; | |||||
} | |||||
} | |||||
""" # noqa | |||||
actions = [ | actions = [ | ||||
{ | { | ||||
"_op_type": "update", | "_op_type": "update", | ||||
"_id": sha1, | "_id": sha1, | ||||
"_index": write_index, | "_index": write_index, | ||||
"scripted_upsert": True, | "scripted_upsert": True, | ||||
"upsert": {**document, "sha1": sha1,}, | "upsert": {**document, "sha1": sha1,}, | ||||
Show All 24 Lines | class ElasticSearch: | ||||
@timed | @timed | ||||
def origin_search( | def origin_search( | ||||
self, | self, | ||||
*, | *, | ||||
url_pattern: Optional[str] = None, | url_pattern: Optional[str] = None, | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
nb_visit: int = None, | |||||
last_visit_date: str = None, | |||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
query_clauses: List[Dict[str, Any]] = [] | query_clauses: List[Dict[str, Any]] = [] | ||||
if url_pattern: | if url_pattern: | ||||
query_clauses.append( | query_clauses.append( | ||||
{ | { | ||||
Show All 37 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
if not query_clauses: | if not query_clauses: | ||||
raise ValueError( | raise ValueError( | ||||
"At least one of url_pattern and metadata_pattern must be provided." | "At least one of url_pattern and metadata_pattern must be provided." | ||||
) | ) | ||||
if with_visit: | if with_visit: | ||||
query_clauses.append({"term": {"has_visits": True,}}) | query_clauses.append({"term": {"has_visits": True,}}) | ||||
if nb_visit: | |||||
query_clauses.append({"term": {"nb_visit": nb_visit,}}) | |||||
if last_visit_date: | |||||
query_clauses.append( | |||||
{"term": {"last_visit_date": last_visit_date.replace("Z", "+00:00"),}} | |||||
) | |||||
if visit_types is not None: | if visit_types is not None: | ||||
query_clauses.append({"terms": {"visit_types": visit_types}}) | query_clauses.append({"terms": {"visit_types": visit_types}}) | ||||
body = { | body = { | ||||
"query": { | "query": { | ||||
"bool": { | "bool": { | ||||
"must": query_clauses, | "must": query_clauses, | ||||
Show All 37 Lines |