Changeset View
Standalone View
swh/search/elasticsearch.py
Show All 12 Lines | |||||
from swh.indexer import codemeta | from swh.indexer import codemeta | ||||
from swh.model import model | from swh.model import model | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import ( | from swh.search.interface import ( | ||||
SORT_BY_OPTIONS, | SORT_BY_OPTIONS, | ||||
MinimalOriginDict, | MinimalOriginDict, | ||||
OriginDict, | OriginDict, | ||||
PagedResult, | PagedResult, | ||||
get_expansion, | |||||
) | ) | ||||
from swh.search.metrics import send_metric, timed | from swh.search.metrics import send_metric, timed | ||||
from swh.search.utils import get_expansion, is_date_parsable | |||||
# Keys used in the per-index configuration dicts (see ORIGIN_DEFAULT_CONFIG
# below): the concrete index name plus its read/write aliases, so reads and
# writes can be re-pointed independently during reindexing.
INDEX_NAME_PARAM = "index"
READ_ALIAS_PARAM = "read_alias"
WRITE_ALIAS_PARAM = "write_alias"
ORIGIN_DEFAULT_CONFIG = { | ORIGIN_DEFAULT_CONFIG = { | ||||
INDEX_NAME_PARAM: "origin", | INDEX_NAME_PARAM: "origin", | ||||
READ_ALIAS_PARAM: "origin-read", | READ_ALIAS_PARAM: "origin-read", | ||||
Show All 29 Lines | def _sanitize_origin(origin): | ||||
# can have various types in the same field. For example, all these are | # can have various types in the same field. For example, all these are | ||||
# equivalent in JSON-LD: | # equivalent in JSON-LD: | ||||
# * {"author": "Jane Doe"} | # * {"author": "Jane Doe"} | ||||
# * {"author": ["Jane Doe"]} | # * {"author": ["Jane Doe"]} | ||||
# * {"author": {"@value": "Jane Doe"}} | # * {"author": {"@value": "Jane Doe"}} | ||||
# * {"author": [{"@value": "Jane Doe"}]} | # * {"author": [{"@value": "Jane Doe"}]} | ||||
# and JSON-LD expansion will convert them all to the last one. | # and JSON-LD expansion will convert them all to the last one. | ||||
if "intrinsic_metadata" in res: | if "intrinsic_metadata" in res: | ||||
res["intrinsic_metadata"] = codemeta.expand(res["intrinsic_metadata"]) | intrinsic_metadata = res["intrinsic_metadata"] | ||||
for date_field in ["dateCreated", "dateModified", "datePublished"]: | |||||
if date_field in intrinsic_metadata: | |||||
date = intrinsic_metadata[date_field] | |||||
# If date{Created,Modified,Published} value isn't parsable | |||||
# It gets rejected and isn't stored (unlike other fields) | |||||
if not is_date_parsable(date): | |||||
intrinsic_metadata.pop(date_field) | |||||
res["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) | |||||
vlorentz: Can you document this new behavior in a comment? | |||||
return res | return res | ||||
def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str:
    """Serialize an index page result from a search into an opaque
    page-token string (msgpack payload wrapped in base64)."""
    packed = msgpack.dumps(index_to_tokenize)
    return base64.b64encode(packed).decode()
def token_decode(page_token: str) -> Dict[bytes, Any]:
    """Decode an opaque page token back into the index dict produced
    by :func:`token_encode`."""
    raw_payload = base64.b64decode(page_token.encode())
    # raw=True keeps msgpack map keys/values as bytes, matching the
    # Dict[bytes, Any] shape that token_encode was given.
    return msgpack.loads(raw_payload, raw=True)
class ElasticSearch: | class ElasticSearch: | ||||
def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}): | def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}): | ||||
self._backend = Elasticsearch(hosts=hosts) | self._backend = Elasticsearch(hosts=hosts) | ||||
# Merge current configuration with default values | # Merge current configuration with default values | ||||
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | def initialize(self) -> None: | ||||
"last_revision_date": {"type": "date"}, | "last_revision_date": {"type": "date"}, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"type": "nested", | "type": "nested", | ||||
"properties": { | "properties": { | ||||
"@context": { | "@context": { | ||||
# don't bother indexing tokens in these URIs, as the | # don't bother indexing tokens in these URIs, as the | ||||
# are used as namespaces | # are used as namespaces | ||||
"type": "keyword", | "type": "keyword", | ||||
}, | |||||
"http://schema": { | |||||
"properties": { | |||||
"org/dateCreated": { | |||||
"properties": {"@value": {"type": "date",}} | |||||
}, | |||||
"org/dateModified": { | |||||
"properties": {"@value": {"type": "date",}} | |||||
}, | |||||
"org/datePublished": { | |||||
"properties": {"@value": {"type": "date",}} | |||||
}, | |||||
} | } | ||||
Done Inline Actions: Afaik, it's important to define the date type for date{Published,Modified,Created} for the filter/sorting options to work, since we've used "date_detection": False (automatic date detection won't work). But if I uncomment these lines of code, most of the test_elasticsearch.py tests start to fail. The error that gets thrown in that case looks like: https://forge.softwareheritage.org/P1089. KShivendu: Afaik, it's important to define the date type for date{Published,Modified,Created} for…
Not Done Inline ActionsActually, all the fields in the nested document need to be strings. Some tests will need to be adapted (test_origin_intrinsic_metadata_string_mapping for example) as they are trying to set random text on the dateCreated field and it will fail with the new mapping. vsellier: Actually, all the fields in the nested document need to be strings.
It seems it's possible to… | |||||
}, | }, | ||||
}, | }, | ||||
}, | |||||
# Has this origin been taken down? | # Has this origin been taken down? | ||||
"blocklisted": {"type": "boolean",}, | "blocklisted": {"type": "boolean",}, | ||||
}, | }, | ||||
}, | }, | ||||
) | ) | ||||
@timed | @timed | ||||
def flush(self) -> None: | def flush(self) -> None: | ||||
▲ Show 20 Lines • Show All 138 Lines • ▼ Show 20 Lines | def origin_search( | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
min_nb_visits: int = 0, | min_nb_visits: int = 0, | ||||
min_last_visit_date: str = "", | min_last_visit_date: str = "", | ||||
min_last_eventful_visit_date: str = "", | min_last_eventful_visit_date: str = "", | ||||
min_last_revision_date: str = "", | min_last_revision_date: str = "", | ||||
min_last_release_date: str = "", | min_last_release_date: str = "", | ||||
min_date_created: str = "", | |||||
min_date_modified: str = "", | |||||
min_date_published: str = "", | |||||
programming_languages: Optional[List[str]] = None, | programming_languages: Optional[List[str]] = None, | ||||
licenses: Optional[List[str]] = None, | licenses: Optional[List[str]] = None, | ||||
keywords: Optional[List[str]] = None, | keywords: Optional[List[str]] = None, | ||||
sort_by: Optional[List[str]] = None, | sort_by: Optional[List[str]] = None, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
query_clauses: List[Dict[str, Any]] = [] | query_clauses: List[Dict[str, Any]] = [] | ||||
Show All 28 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
"type": "cross_fields", | "type": "cross_fields", | ||||
# All keywords must be found in a document for it to | # All keywords must be found in a document for it to | ||||
# be considered a match. | # be considered a match. | ||||
# TODO: allow missing keywords? | # TODO: allow missing keywords? | ||||
"operator": "and", | "operator": "and", | ||||
# Searches on all fields of the intrinsic_metadata dict, | # Searches on all fields of the intrinsic_metadata dict, | ||||
# recursively. | # recursively. | ||||
"fields": ["intrinsic_metadata.*"], | "fields": ["intrinsic_metadata.*"], | ||||
# date{Created,Modified,Published} are of type date | |||||
"lenient": True, | |||||
} | } | ||||
}, | }, | ||||
} | } | ||||
} | } | ||||
) | ) | ||||
if not query_clauses: | if not query_clauses: | ||||
raise ValueError( | raise ValueError( | ||||
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
if programming_languages: | if programming_languages: | ||||
language_filters: List[Dict[str, Any]] = [] | language_filters: List[Dict[str, Any]] = [] | ||||
for language in programming_languages: | for language in programming_languages: | ||||
language_filters.append( | language_filters.append( | ||||
{"match": {get_expansion("programming_languages", "."): language}} | {"match": {get_expansion("programming_languages", "."): language}} | ||||
) | ) | ||||
intrinsic_metadata_filters.append({"bool": {"should": language_filters}}) | intrinsic_metadata_filters.append({"bool": {"should": language_filters}}) | ||||
if min_date_created: | |||||
intrinsic_metadata_filters.append( | |||||
{ | |||||
"range": { | |||||
get_expansion("date_created", "."): {"gte": min_date_created,} | |||||
} | |||||
} | |||||
) | |||||
if min_date_modified: | |||||
intrinsic_metadata_filters.append( | |||||
{ | |||||
"range": { | |||||
get_expansion("date_modified", "."): {"gte": min_date_modified,} | |||||
} | |||||
} | |||||
) | |||||
if min_date_published: | |||||
intrinsic_metadata_filters.append( | |||||
{ | |||||
"range": { | |||||
get_expansion("date_published", "."): { | |||||
"gte": min_date_published, | |||||
} | |||||
} | |||||
} | |||||
) | |||||
if intrinsic_metadata_filters: | if intrinsic_metadata_filters: | ||||
query_clauses.append( | query_clauses.append( | ||||
{ | { | ||||
"nested": { | "nested": { | ||||
"path": "intrinsic_metadata", | "path": "intrinsic_metadata", | ||||
"query": {"bool": {"must": intrinsic_metadata_filters,}}, | "query": {"bool": {"must": intrinsic_metadata_filters,}}, | ||||
# "must" is equivalent to "AND" | # "must" is equivalent to "AND" | ||||
# "should" is equivalent to "OR" | # "should" is equivalent to "OR" | ||||
# Resulting origins must return true for the following: | # Resulting origins must return true for the following: | ||||
# (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..) | # (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..) | ||||
# This is equivalent to {"must": [ | # This is equivalent to {"must": [ | ||||
# {"should": [license_1,license_2] }, | # {"should": [license_1,license_2] }, | ||||
# {"should": [lang_1,lang_2]}] } | # {"should": [lang_1,lang_2]}] } | ||||
# ]} | # ]} | ||||
# Note: Usage of "bool" has been omitted for readability | # Note: Usage of "bool" has been omitted for readability | ||||
} | } | ||||
} | } | ||||
) | ) | ||||
if visit_types is not None: | if visit_types is not None: | ||||
query_clauses.append({"terms": {"visit_types": visit_types}}) | query_clauses.append({"terms": {"visit_types": visit_types}}) | ||||
sorting_params = [] | sorting_params: List[Dict[str, Any]] = [] | ||||
if sort_by: | if sort_by: | ||||
for field in sort_by: | for field in sort_by: | ||||
order = "asc" | order = "asc" | ||||
if field and field[0] == "-": | if field and field[0] == "-": | ||||
field = field[1:] | field = field[1:] | ||||
order = "desc" | order = "desc" | ||||
if field in SORT_BY_OPTIONS: | if field in ["date_created", "date_modified", "date_published"]: | ||||
sorting_params.append( | |||||
{ | |||||
get_expansion(field, "."): { | |||||
"nested_path": "intrinsic_metadata", | |||||
"order": order, | |||||
} | |||||
} | |||||
) | |||||
elif field in SORT_BY_OPTIONS: | |||||
sorting_params.append({field: order}) | sorting_params.append({field: order}) | ||||
sorting_params.extend( | sorting_params.extend( | ||||
[{"_score": "desc"}, {"sha1": "asc"},] | [{"_score": "desc"}, {"sha1": "asc"},] | ||||
) | ) | ||||
body = { | body = { | ||||
"query": { | "query": { | ||||
Show All 40 Lines |
Can you document this new behavior in a comment?