Changeset View
Changeset View
Standalone View
Standalone View
swh/search/elasticsearch.py
Show All 39 Lines | |||||
ORIGIN_MAPPING = { | ORIGIN_MAPPING = { | ||||
"dynamic_templates": [ | "dynamic_templates": [ | ||||
{ | { | ||||
"booleans_as_string": { | "booleans_as_string": { | ||||
# All fields stored as string in the metadata | # All fields stored as string in the metadata | ||||
# even the booleans | # even the booleans | ||||
"match_mapping_type": "boolean", | "match_mapping_type": "boolean", | ||||
"path_match": "intrinsic_metadata.*", | "path_match": "jsonld.*", | ||||
"mapping": {"type": "keyword"}, | "mapping": {"type": "keyword"}, | ||||
} | } | ||||
}, | }, | ||||
{ | { | ||||
"floats_as_string": { | "floats_as_string": { | ||||
# All fields stored as string in the metadata | # All fields stored as string in the metadata | ||||
# even the floats | # even the floats | ||||
"match_mapping_type": "double", | "match_mapping_type": "double", | ||||
"path_match": "intrinsic_metadata.*", | "path_match": "jsonld.*", | ||||
"mapping": {"type": "text"}, | "mapping": {"type": "text"}, | ||||
} | } | ||||
}, | }, | ||||
{ | { | ||||
"longs_as_string": { | "longs_as_string": { | ||||
# All fields stored as string in the metadata | # All fields stored as string in the metadata | ||||
# even the longs | # even the longs | ||||
"match_mapping_type": "long", | "match_mapping_type": "long", | ||||
"path_match": "intrinsic_metadata.*", | "path_match": "jsonld.*", | ||||
"mapping": {"type": "text"}, | "mapping": {"type": "text"}, | ||||
} | } | ||||
}, | }, | ||||
], | ], | ||||
"date_detection": False, | "date_detection": False, | ||||
"properties": { | "properties": { | ||||
# sha1 of the URL; used as the document id | # sha1 of the URL; used as the document id | ||||
"sha1": { | "sha1": { | ||||
Show All 22 Lines | "properties": { | ||||
"type": "boolean", | "type": "boolean", | ||||
}, | }, | ||||
"nb_visits": {"type": "integer"}, | "nb_visits": {"type": "integer"}, | ||||
"snapshot_id": {"type": "keyword"}, | "snapshot_id": {"type": "keyword"}, | ||||
"last_visit_date": {"type": "date"}, | "last_visit_date": {"type": "date"}, | ||||
"last_eventful_visit_date": {"type": "date"}, | "last_eventful_visit_date": {"type": "date"}, | ||||
"last_release_date": {"type": "date"}, | "last_release_date": {"type": "date"}, | ||||
"last_revision_date": {"type": "date"}, | "last_revision_date": {"type": "date"}, | ||||
"intrinsic_metadata": { | "jsonld": { | ||||
"type": "nested", | "type": "nested", | ||||
"properties": { | "properties": { | ||||
"@context": { | "@context": { | ||||
# don't bother indexing tokens in these URIs, as the | # don't bother indexing tokens in these URIs, as the | ||||
# are used as namespaces | # are used as namespaces | ||||
"type": "keyword", | "type": "keyword", | ||||
}, | }, | ||||
"http://schema": { | "http://schema": { | ||||
▲ Show 20 Lines • Show All 127 Lines • ▼ Show 20 Lines | |||||
def _sanitize_origin(origin): | def _sanitize_origin(origin): | ||||
origin = origin.copy() | origin = origin.copy() | ||||
# Whitelist fields to be saved in Elasticsearch | # Whitelist fields to be saved in Elasticsearch | ||||
res = {"url": origin.pop("url")} | res = {"url": origin.pop("url")} | ||||
for field_name in ( | for field_name in ( | ||||
"blocklisted", | "blocklisted", | ||||
"has_visits", | "has_visits", | ||||
"intrinsic_metadata", | "jsonld", | ||||
"visit_types", | "visit_types", | ||||
"nb_visits", | "nb_visits", | ||||
"snapshot_id", | "snapshot_id", | ||||
"last_visit_date", | "last_visit_date", | ||||
"last_eventful_visit_date", | "last_eventful_visit_date", | ||||
"last_revision_date", | "last_revision_date", | ||||
"last_release_date", | "last_release_date", | ||||
): | ): | ||||
if field_name in origin: | if field_name in origin: | ||||
res[field_name] = origin.pop(field_name) | res[field_name] = origin.pop(field_name) | ||||
# Run the JSON-LD expansion algorithm | # Run the JSON-LD expansion algorithm | ||||
# <https://www.w3.org/TR/json-ld-api/#expansion> | # <https://www.w3.org/TR/json-ld-api/#expansion> | ||||
# to normalize the Codemeta metadata. | # to normalize the Codemeta metadata. | ||||
# This is required as Elasticsearch will needs each field to have a consistent | # This is required as Elasticsearch will needs each field to have a consistent | ||||
# type across documents to be searchable; and non-expanded JSON-LD documents | # type across documents to be searchable; and non-expanded JSON-LD documents | ||||
# can have various types in the same field. For example, all these are | # can have various types in the same field. For example, all these are | ||||
# equivalent in JSON-LD: | # equivalent in JSON-LD: | ||||
# * {"author": "Jane Doe"} | # * {"author": "Jane Doe"} | ||||
# * {"author": ["Jane Doe"]} | # * {"author": ["Jane Doe"]} | ||||
# * {"author": {"@value": "Jane Doe"}} | # * {"author": {"@value": "Jane Doe"}} | ||||
# * {"author": [{"@value": "Jane Doe"}]} | # * {"author": [{"@value": "Jane Doe"}]} | ||||
# and JSON-LD expansion will convert them all to the last one. | # and JSON-LD expansion will convert them all to the last one. | ||||
if "intrinsic_metadata" in res: | if "jsonld" in res: | ||||
intrinsic_metadata = res["intrinsic_metadata"] | jsonld = res["jsonld"] | ||||
for date_field in ["dateCreated", "dateModified", "datePublished"]: | for date_field in ["dateCreated", "dateModified", "datePublished"]: | ||||
if date_field in intrinsic_metadata: | if date_field in jsonld: | ||||
date = intrinsic_metadata[date_field] | date = jsonld[date_field] | ||||
# If date{Created,Modified,Published} value isn't parsable | # If date{Created,Modified,Published} value isn't parsable | ||||
# It gets rejected and isn't stored (unlike other fields) | # It gets rejected and isn't stored (unlike other fields) | ||||
formatted_date = parse_and_format_date(date) | formatted_date = parse_and_format_date(date) | ||||
if formatted_date is None: | if formatted_date is None: | ||||
intrinsic_metadata.pop(date_field) | jsonld.pop(date_field) | ||||
else: | else: | ||||
intrinsic_metadata[date_field] = formatted_date | jsonld[date_field] = formatted_date | ||||
res["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) | res["jsonld"] = codemeta.expand(jsonld) | ||||
return res | return res | ||||
def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str:
    """Tokenize as string an index page result from a search"""
    # msgpack-serialize the page index, then base64 it so the token is a
    # plain ASCII string that can round-trip through clients untouched.
    return base64.b64encode(msgpack.dumps(index_to_tokenize)).decode()
▲ Show 20 Lines • Show All 212 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
if field and field[0] == "-": | if field and field[0] == "-": | ||||
field = field[1:] | field = field[1:] | ||||
order = "desc" | order = "desc" | ||||
if field in ["date_created", "date_modified", "date_published"]: | if field in ["date_created", "date_modified", "date_published"]: | ||||
sorting_params.append( | sorting_params.append( | ||||
{ | { | ||||
get_expansion(field, "."): { | get_expansion(field, "."): { | ||||
"nested_path": "intrinsic_metadata", | "nested_path": "jsonld", | ||||
"order": order, | "order": order, | ||||
} | } | ||||
} | } | ||||
) | ) | ||||
elif field in SORT_BY_OPTIONS: | elif field in SORT_BY_OPTIONS: | ||||
sorting_params.append({field: order}) | sorting_params.append({field: order}) | ||||
sorting_params.extend( | sorting_params.extend( | ||||
▲ Show 20 Lines • Show All 76 Lines • Show Last 20 Lines |