Changeset View
Standalone View
swh/search/elasticsearch.py
Show All 12 Lines | |||||
from swh.indexer import codemeta | from swh.indexer import codemeta | ||||
from swh.model import model | from swh.model import model | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import ( | from swh.search.interface import ( | ||||
SORT_BY_OPTIONS, | SORT_BY_OPTIONS, | ||||
MinimalOriginDict, | MinimalOriginDict, | ||||
OriginDict, | OriginDict, | ||||
PagedResult, | PagedResult, | ||||
get_expansion, | |||||
) | ) | ||||
from swh.search.metrics import send_metric, timed | from swh.search.metrics import send_metric, timed | ||||
from swh.search.utils import get_expansion, is_date_parsable | |||||
# Keys used in the per-index configuration dicts (see ORIGIN_DEFAULT_CONFIG
# below): the concrete index name plus its read/write aliases, so reads and
# writes can be re-pointed independently during reindexing.
INDEX_NAME_PARAM = "index"
READ_ALIAS_PARAM = "read_alias"
WRITE_ALIAS_PARAM = "write_alias"
ORIGIN_DEFAULT_CONFIG = { | ORIGIN_DEFAULT_CONFIG = { | ||||
INDEX_NAME_PARAM: "origin", | INDEX_NAME_PARAM: "origin", | ||||
READ_ALIAS_PARAM: "origin-read", | READ_ALIAS_PARAM: "origin-read", | ||||
Show All 29 Lines | def _sanitize_origin(origin): | ||||
# can have various types in the same field. For example, all these are | # can have various types in the same field. For example, all these are | ||||
# equivalent in JSON-LD: | # equivalent in JSON-LD: | ||||
# * {"author": "Jane Doe"} | # * {"author": "Jane Doe"} | ||||
# * {"author": ["Jane Doe"]} | # * {"author": ["Jane Doe"]} | ||||
# * {"author": {"@value": "Jane Doe"}} | # * {"author": {"@value": "Jane Doe"}} | ||||
# * {"author": [{"@value": "Jane Doe"}]} | # * {"author": [{"@value": "Jane Doe"}]} | ||||
# and JSON-LD expansion will convert them all to the last one. | # and JSON-LD expansion will convert them all to the last one. | ||||
if "intrinsic_metadata" in res: | if "intrinsic_metadata" in res: | ||||
res["intrinsic_metadata"] = codemeta.expand(res["intrinsic_metadata"]) | intrinsic_metadata = res["intrinsic_metadata"] | ||||
for date_field in ["dateCreated", "dateModified", "datePublished"]: | |||||
if date_field in intrinsic_metadata: | |||||
date = intrinsic_metadata[date_field] | |||||
# If date{Created,Modified,Published} value isn't parsable | |||||
# It gets rejected and isn't stored (unlike other fields) | |||||
if not is_date_parsable(date): | |||||
intrinsic_metadata.pop(date_field) | |||||
res["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) | |||||
vlorentz: Can you document this new behavior in a comment? | |||||
return res | return res | ||||
def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str:
    """Serialize an index page result from a search into an opaque
    page-token string (msgpack payload wrapped in base64)."""
    packed = msgpack.dumps(index_to_tokenize)
    return base64.b64encode(packed).decode()
def token_decode(page_token: str) -> Dict[bytes, Any]:
    """Decode an opaque page token back into the index dict produced
    by :func:`token_encode`."""
    raw_payload = base64.b64decode(page_token.encode())
    # raw=True keeps msgpack map keys/values as bytes, matching the
    # Dict[bytes, Any] shape that token_encode was given.
    return msgpack.loads(raw_payload, raw=True)
class ElasticSearch: | class ElasticSearch: | ||||
def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}): | def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}): | ||||
self._backend = Elasticsearch(hosts=hosts) | self._backend = Elasticsearch(hosts=hosts) | ||||
# Merge current configuration with default values | # Merge current configuration with default values | ||||
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | def initialize(self) -> None: | ||||
"last_revision_date": {"type": "date"}, | "last_revision_date": {"type": "date"}, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"type": "nested", | "type": "nested", | ||||
"properties": { | "properties": { | ||||
"@context": { | "@context": { | ||||
# don't bother indexing tokens in these URIs, as the | # don't bother indexing tokens in these URIs, as the | ||||
# are used as namespaces | # are used as namespaces | ||||
"type": "keyword", | "type": "keyword", | ||||
}, | |||||
"http://schema": { | |||||
"properties": { | |||||
"org/dateCreated": { | |||||
"properties": {"@value": {"type": "date",}} | |||||
}, | |||||
"org/dateModified": { | |||||
"properties": {"@value": {"type": "date",}} | |||||
}, | |||||
"org/datePublished": { | |||||
"properties": {"@value": {"type": "date",}} | |||||
}, | |||||
} | } | ||||
Done Inline Actions: Afaik, it's important to define the date type for date{Published,Modified,Created} for the filter/sorting options to work, since we've used "date_detection": False (automatic date detection won't work). But if I uncomment these lines of code, most of the test_elasticsearch.py tests start to fail. The error that gets thrown in that case looks like: https://forge.softwareheritage.org/P1089. KShivendu: Afaik, it's important to define the date type for date{Published,Modified,Created} for…
Not Done Inline ActionsActually, all the fields in the nested document need to be strings. Some tests will need to be adapted (test_origin_intrinsic_metadata_string_mapping for example) as they are trying to set random text on the dateCreated field and it will fail with the new mapping. vsellier: Actually, all the fields in the nested document need to be strings.
It seems it's possible to… | |||||
}, | }, | ||||
}, | }, | ||||
}, | |||||
# Has this origin been taken down? | # Has this origin been taken down? | ||||
"blocklisted": {"type": "boolean",}, | "blocklisted": {"type": "boolean",}, | ||||
}, | }, | ||||
}, | }, | ||||
) | ) | ||||
@timed | @timed | ||||
def flush(self) -> None: | def flush(self) -> None: | ||||
▲ Show 20 Lines • Show All 138 Lines • ▼ Show 20 Lines | def origin_search( | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
min_nb_visits: int = 0, | min_nb_visits: int = 0, | ||||
min_last_visit_date: str = "", | min_last_visit_date: str = "", | ||||
min_last_eventful_visit_date: str = "", | min_last_eventful_visit_date: str = "", | ||||
min_last_revision_date: str = "", | min_last_revision_date: str = "", | ||||
min_last_release_date: str = "", | min_last_release_date: str = "", | ||||
min_date_created: str = "", | |||||
min_date_modified: str = "", | |||||
min_date_published: str = "", | |||||
programming_languages: Optional[List[str]] = None, | programming_languages: Optional[List[str]] = None, | ||||
licenses: Optional[List[str]] = None, | licenses: Optional[List[str]] = None, | ||||
keywords: Optional[List[str]] = None, | keywords: Optional[List[str]] = None, | ||||
sort_by: Optional[List[str]] = None, | sort_by: Optional[List[str]] = None, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
query_clauses: List[Dict[str, Any]] = [] | query_clauses: List[Dict[str, Any]] = [] | ||||
Show All 28 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
"type": "cross_fields", | "type": "cross_fields", | ||||
# All keywords must be found in a document for it to | # All keywords must be found in a document for it to | ||||
# be considered a match. | # be considered a match. | ||||
# TODO: allow missing keywords? | # TODO: allow missing keywords? | ||||
"operator": "and", | "operator": "and", | ||||
# Searches on all fields of the intrinsic_metadata dict, | # Searches on all fields of the intrinsic_metadata dict, | ||||
# recursively. | # recursively. | ||||
"fields": ["intrinsic_metadata.*"], | "fields": ["intrinsic_metadata.*"], | ||||
# date{Created,Modified,Published} are of type date | |||||
"lenient": True, | |||||
} | } | ||||
}, | }, | ||||
} | } | ||||
} | } | ||||
) | ) | ||||
if not query_clauses: | if not query_clauses: | ||||
raise ValueError( | raise ValueError( | ||||
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
if programming_languages: | if programming_languages: | ||||
language_filters: List[Dict[str, Any]] = [] | language_filters: List[Dict[str, Any]] = [] | ||||
for language in programming_languages: | for language in programming_languages: | ||||
language_filters.append( | language_filters.append( | ||||
{"match": {get_expansion("programming_languages", "."): language}} | {"match": {get_expansion("programming_languages", "."): language}} | ||||
) | ) | ||||
intrinsic_metadata_filters.append({"bool": {"should": language_filters}}) | intrinsic_metadata_filters.append({"bool": {"should": language_filters}}) | ||||
if min_date_created: | |||||
intrinsic_metadata_filters.append( | |||||
{ | |||||
"range": { | |||||
get_expansion("date_created", "."): {"gte": min_date_created,} | |||||
} | |||||
} | |||||
) | |||||
if min_date_modified: | |||||
intrinsic_metadata_filters.append( | |||||
{ | |||||
"range": { | |||||
get_expansion("date_modified", "."): {"gte": min_date_modified,} | |||||
} | |||||
} | |||||
) | |||||
if min_date_published: | |||||
intrinsic_metadata_filters.append( | |||||
{ | |||||
"range": { | |||||
get_expansion("date_published", "."): { | |||||
"gte": min_date_published, | |||||
} | |||||
} | |||||
} | |||||
) | |||||
if intrinsic_metadata_filters: | if intrinsic_metadata_filters: | ||||
query_clauses.append( | query_clauses.append( | ||||
{ | { | ||||
"nested": { | "nested": { | ||||
"path": "intrinsic_metadata", | "path": "intrinsic_metadata", | ||||
"query": {"bool": {"must": intrinsic_metadata_filters,}}, | "query": {"bool": {"must": intrinsic_metadata_filters,}}, | ||||
# "must" is equivalent to "AND" | # "must" is equivalent to "AND" | ||||
# "should" is equivalent to "OR" | # "should" is equivalent to "OR" | ||||
# Resulting origins must return true for the following: | # Resulting origins must return true for the following: | ||||
# (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..) | # (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..) | ||||
# This is equivalent to {"must": [ | # This is equivalent to {"must": [ | ||||
# {"should": [license_1,license_2] }, | # {"should": [license_1,license_2] }, | ||||
# {"should": [lang_1,lang_2]}] } | # {"should": [lang_1,lang_2]}] } | ||||
# ]} | # ]} | ||||
# Note: Usage of "bool" has been omitted for readability | # Note: Usage of "bool" has been omitted for readability | ||||
} | } | ||||
} | } | ||||
) | ) | ||||
if visit_types is not None: | if visit_types is not None: | ||||
query_clauses.append({"terms": {"visit_types": visit_types}}) | query_clauses.append({"terms": {"visit_types": visit_types}}) | ||||
sorting_params = [] | sorting_params: List[Dict[str, Any]] = [] | ||||
if sort_by: | if sort_by: | ||||
for field in sort_by: | for field in sort_by: | ||||
order = "asc" | order = "asc" | ||||
if field and field[0] == "-": | if field and field[0] == "-": | ||||
field = field[1:] | field = field[1:] | ||||
order = "desc" | order = "desc" | ||||
if field in SORT_BY_OPTIONS: | if field in ["date_created", "date_modified", "date_published"]: | ||||
sorting_params.append( | |||||
{ | |||||
get_expansion(field, "."): { | |||||
"nested_path": "intrinsic_metadata", | |||||
"order": order, | |||||
} | |||||
} | |||||
) | |||||
elif field in SORT_BY_OPTIONS: | |||||
sorting_params.append({field: order}) | sorting_params.append({field: order}) | ||||
sorting_params.extend( | sorting_params.extend( | ||||
[{"_score": "desc"}, {"sha1": "asc"},] | [{"_score": "desc"}, {"sha1": "asc"},] | ||||
) | ) | ||||
body = { | body = { | ||||
"query": { | "query": { | ||||
Show All 40 Lines |
Can you document this new behavior in a comment?