diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -18,12 +18,28 @@ def _sanitize_origin(origin):
     origin = origin.copy()
+
+    # Whitelist fields to be saved in Elasticsearch
     res = {"url": origin.pop("url")}
     for field_name in ("intrinsic_metadata", "has_visits"):
         if field_name in origin:
             res[field_name] = origin.pop(field_name)
+
+    # Run the JSON-LD expansion algorithm
+    #
+    # to normalize the Codemeta metadata.
+    # This is required as Elasticsearch needs each field to have a consistent
+    # type across documents to be searchable; and non-expanded JSON-LD documents
+    # can have various types in the same field. For example, all these are
+    # equivalent in JSON-LD:
+    # * {"author": "Jane Doe"}
+    # * {"author": ["Jane Doe"]}
+    # * {"author": {"@value": "Jane Doe"}}
+    # * {"author": [{"@value": "Jane Doe"}]}
+    # and JSON-LD expansion will convert them all to the last one.
     if "intrinsic_metadata" in res:
         res["intrinsic_metadata"] = codemeta.expand(res["intrinsic_metadata"])
+
     return res
@@ -61,12 +77,17 @@
             index="origin",
             body={
                 "properties": {
+                    # sha1 of the URL; used as the document id
                     "sha1": {"type": "keyword", "doc_values": True,},
+                    # Used both to search URLs, and as the result to return
+                    # as a response to queries
                     "url": {
                         "type": "text",
                         # To split URLs into token on any character
                         # that is not alphanumerical
                         "analyzer": "simple",
+                        # 2-gram and partial-3-gram search (i.e. with the end of the
+                        # third word potentially missing)
                         "fields": {
                             "as_you_type": {
                                 "type": "search_as_you_type",
@@ -74,12 +95,14 @@
                             }
                         },
                     },
+                    # used to filter out origins that were never visited
                     "has_visits": {"type": "boolean",},
                     "intrinsic_metadata": {
                         "type": "nested",
                         "properties": {
                             "@context": {
-                                # don't bother indexing tokens
+                                # don't bother indexing tokens in these URIs, as they
+                                # are used as namespaces
                                 "type": "keyword",
                             }
                         },
@@ -148,8 +171,16 @@
                         "query": {
                             "multi_match": {
                                 "query": metadata_pattern,
+                                # Makes it so that the "foo bar" query returns
+                                # documents which contain "foo" in a field and "bar"
+                                # in a different field
                                 "type": "cross_fields",
+                                # All keywords must be found in a document for it to
+                                # be considered a match.
+                                # TODO: allow missing keywords?
                                 "operator": "and",
+                                # Searches on all fields of the intrinsic_metadata dict,
+                                # recursively.
                                 "fields": ["intrinsic_metadata.*"],
                             }
                         },
@@ -162,8 +193,6 @@
                 "At least one of url_pattern and metadata_pattern must be provided."
             )
 
-        next_page_token: Optional[str] = None
-
         if with_visit:
             query_clauses.append({"term": {"has_visits": True,}})
@@ -183,7 +212,11 @@
         hits = res["hits"]["hits"]
 
+        next_page_token: Optional[str] = None
+
         if len(hits) == limit:
+            # There are more results after this page; return a pagination token
+            # to get them in a future query
            last_hit = hits[-1]
            next_page_token_content = {
                b"score": last_hit["_score"],
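
Note on the JSON-LD expansion comment in _sanitize_origin: the normalization it
describes can be reproduced with any implementation of the expansion algorithm.
A minimal sketch, assuming the pyld library and a one-term stand-in for the
Codemeta context (the actual code goes through codemeta.expand):

    from pyld import jsonld

    # Stand-in context mapping a single term; the real Codemeta context is larger.
    context = {"author": "http://schema.org/author"}

    docs = [
        {"@context": context, "author": "Jane Doe"},
        {"@context": context, "author": ["Jane Doe"]},
        {"@context": context, "author": {"@value": "Jane Doe"}},
        {"@context": context, "author": [{"@value": "Jane Doe"}]},
    ]

    for doc in docs:
        print(jsonld.expand(doc))
    # Every document prints the same expanded form:
    # [{'http://schema.org/author': [{'@value': 'Jane Doe'}]}]

All four input variants collapse to one shape, which is what gives Elasticsearch
a single, consistent field type to index.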
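
The "2-gram and partial-3-gram" comment on the url mapping refers to the
subfields Elasticsearch generates for a search_as_you_type field (._2gram,
._3gram, ._index_prefix). A hedged sketch of the kind of query body such a
mapping enables; the query string is illustrative and this is the generic
Elasticsearch pattern, not code from this module:

    # Prefix search over the as_you_type field and its generated shingle subfields.
    query_body = {
        "query": {
            "multi_match": {
                "query": "github.com/swh",        # partial URL typed so far
                "type": "bool_prefix",            # last token is matched as a prefix
                "fields": [
                    "url.as_you_type",
                    "url.as_you_type._2gram",     # shingles of 2 consecutive tokens
                    "url.as_you_type._3gram",     # shingles of 3 consecutive tokens
                ],
            }
        }
    }
    # query_body would then be passed to Elasticsearch.search(index=..., body=...)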
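
Since intrinsic_metadata is mapped with "type": "nested", the multi_match
clause commented in the fourth hunk is normally wrapped in a nested query that
names the path to descend into. A sketch of the complete clause under that
assumption (the wrapping lines are context this diff does not show):

    metadata_pattern = "jane doe"  # example user query

    metadata_clause = {
        "nested": {
            "path": "intrinsic_metadata",
            "query": {
                "multi_match": {
                    "query": metadata_pattern,
                    # cross_fields: "jane" and "doe" may each match in a different field
                    "type": "cross_fields",
                    # and: every keyword must be found somewhere in the document
                    "operator": "and",
                    "fields": ["intrinsic_metadata.*"],
                }
            },
        }
    }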
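
The last two hunks move the next_page_token initialization next to its use and
only build a token when the page came back full. A common way to implement that
kind of cursor in Elasticsearch is the search_after parameter, seeded with the
last hit's sort values; a rough sketch under that assumption (client, index
name, sort keys and page size are illustrative, and the token encoding used by
this module is not shown here):

    from elasticsearch import Elasticsearch

    client = Elasticsearch()  # illustrative client
    limit = 50

    body = {
        "query": {"term": {"has_visits": True}},
        "sort": [{"_score": "desc"}, {"sha1": "asc"}],
        "size": limit,
    }
    res = client.search(index="origin", body=body)
    hits = res["hits"]["hits"]

    next_page_token = None
    if len(hits) == limit:
        # The page is full, so more results may follow: remember where this page
        # stopped (score and sha1 of the last hit) to resume from there.
        last_hit = hits[-1]
        next_page_token = [last_hit["_score"], last_hit["_source"]["sha1"]]

    if next_page_token is not None:
        body["search_after"] = next_page_token
        res = client.search(index="origin", body=body)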