diff --git a/query_language/grammar.js b/query_language/grammar.js --- a/query_language/grammar.js +++ b/query_language/grammar.js @@ -14,7 +14,7 @@ name: 'swh_search_ql', rules: { - query: $ => seq($.filters, optional($.sortBy) ,optional($.limit)), + query: $ => seq($.filters, optional($.sortBy), optional($.limit)), filters: $ => choice( prec.left(PRECEDENCE.and, @@ -41,7 +41,7 @@ sortByField: $ => token('sort_by'), sortByOp: $ => $.equalOp, sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])), - sortByOptions: $ => seq(optional(token.immediate('-')) ,choice( + sortByOptions: $ => seq(optional(token.immediate('-')), choice( 'visits', 'last_visit', 'last_eventful_visit', @@ -106,15 +106,15 @@ sortByOptions: $ => seq( optional(token.immediate('-')), choice( - 'visits', - 'last_visit', - 'last_eventful_visit', - 'last_revision', - 'last_release', - 'created', - 'modified', - 'published' - )), + 'visits', + 'last_visit', + 'last_eventful_visit', + 'last_revision', + 'last_release', + 'created', + 'modified', + 'published' + )), unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal), listField: $ => token(choice('language', 'license', 'keyword')), @@ -142,7 +142,13 @@ equalOp: $ => token('='), choiceOp: $ => token(choice('in', 'not in')), - isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/, + isoDateTime: $ => { + const dateRegex = (/\d{4}[-]\d{2}[-]\d{2}/).source + const dateTimeSepRegex = (/(\s|T)*/).source + const timeRegex = (/(\d{2}:\d{2}(:\d{2}(\.\d{6})?)?)?/).source + const timezoneRegex = (/(\+\d{2}:\d{2}|Z)?/).source + return new RegExp(dateRegex + dateTimeSepRegex + timeRegex + timezoneRegex) + }, string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord), number: $ => /\d+/, @@ -153,7 +159,7 @@ and: $ => "and", stringContent: $ => repeat1(choice( - token.immediate(/[^\\"\n]+/), + token.immediate(/[^\\'"\n]+/), $.escape_sequence )), singleWord: $ => /[^\s"'\[\]\(\),]+/, diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -22,7 +22,8 @@ PagedResult, ) from swh.search.metrics import send_metric, timed -from swh.search.utils import get_expansion, is_date_parsable +from swh.search.translator import Translator +from swh.search.utils import get_expansion, is_date_parsable, to_raw logger = logging.getLogger(__name__) @@ -99,6 +100,7 @@ class ElasticSearch: def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}): self._backend = Elasticsearch(hosts=hosts) + self._translator = Translator() # Merge current configuration with default values origin_config = indexes.get("origin", {}) @@ -348,6 +350,7 @@ def origin_search( self, *, + query: str = "", url_pattern: Optional[str] = None, metadata_pattern: Optional[str] = None, with_visit: bool = False, @@ -369,185 +372,89 @@ ) -> PagedResult[MinimalOriginDict]: query_clauses: List[Dict[str, Any]] = [] + query_filters = [] if url_pattern: - query_clauses.append( - { - "multi_match": { - "query": url_pattern, - "type": "bool_prefix", - "operator": "and", - "fields": [ - "url.as_you_type", - "url.as_you_type._2gram", - "url.as_you_type._3gram", - ], - } - } - ) + query_filters.append(f"origin = {to_raw(url_pattern)}") if metadata_pattern: - query_clauses.append( - { - "nested": { - "path": "intrinsic_metadata", - "query": { - "multi_match": { - "query": metadata_pattern, - # Makes it so that the "foo bar" query returns - # documents which contain "foo" in a field and "bar" - # in a different field - "type": "cross_fields", - # All keywords must be found in a document for it to - # be considered a match. - # TODO: allow missing keywords? - "operator": "and", - # Searches on all fields of the intrinsic_metadata dict, - # recursively. - "fields": ["intrinsic_metadata.*"], - # date{Created,Modified,Published} are of type date - "lenient": True, - } - }, - } - } - ) + query_filters.append(f"metadata = {to_raw(metadata_pattern)}") - if not query_clauses: - raise ValueError( - "At least one of url_pattern and metadata_pattern must be provided." - ) + # if not query_clauses: + # raise ValueError( + # "At least one of url_pattern and metadata_pattern must be provided." + # ) if with_visit: - query_clauses.append({"term": {"has_visits": True,}}) + query_filters.append(f"visited = {'true' if with_visit else 'false'}") if min_nb_visits: - query_clauses.append({"range": {"nb_visits": {"gte": min_nb_visits,},}}) + query_filters.append(f"visits >= {min_nb_visits}") if min_last_visit_date: - query_clauses.append( - { - "range": { - "last_visit_date": { - "gte": min_last_visit_date.replace("Z", "+00:00"), - } - } - } + query_filters.append( + f"last_visit >= {min_last_visit_date.replace('Z', '+00:00')}" ) if min_last_eventful_visit_date: - query_clauses.append( - { - "range": { - "last_eventful_visit_date": { - "gte": min_last_eventful_visit_date.replace("Z", "+00:00"), - } - } - } + query_filters.append( + "last_eventful_visit >= " + f"{min_last_eventful_visit_date.replace('Z', '+00:00')}" ) if min_last_revision_date: - query_clauses.append( - { - "range": { - "last_revision_date": { - "gte": min_last_revision_date.replace("Z", "+00:00"), - } - } - } + query_filters.append( + f"last_revision >= {min_last_revision_date.replace('Z', '+00:00')}" ) if min_last_release_date: - query_clauses.append( - { - "range": { - "last_release_date": { - "gte": min_last_release_date.replace("Z", "+00:00"), - } - } - } + query_filters.append( + f"last_release >= {min_last_release_date.replace('Z', '+00:00')}" ) if keywords: - query_clauses.append( - { - "nested": { - "path": "intrinsic_metadata", - "query": { - "multi_match": { - "query": " ".join(keywords), - "fields": [ - get_expansion("keywords", ".") + "^2", - get_expansion("descriptions", "."), - # "^2" boosts an origin's score by 2x - # if it the queried keywords are - # found in its intrinsic_metadata.keywords - ], - } - }, - } - } - ) - - intrinsic_metadata_filters: List[Dict[str, Dict]] = [] - + query_filters.append(f"keyword in {to_raw(keywords)}") if licenses: - license_filters: List[Dict[str, Any]] = [] - for license in licenses: - license_filters.append( - {"match": {get_expansion("licenses", "."): license}} - ) - intrinsic_metadata_filters.append({"bool": {"should": license_filters}}) + query_filters.append(f"license in {to_raw(licenses)}") if programming_languages: - language_filters: List[Dict[str, Any]] = [] - for language in programming_languages: - language_filters.append( - {"match": {get_expansion("programming_languages", "."): language}} - ) - intrinsic_metadata_filters.append({"bool": {"should": language_filters}}) + query_filters.append(f"language in {to_raw(programming_languages)}") if min_date_created: - intrinsic_metadata_filters.append( - { - "range": { - get_expansion("date_created", "."): {"gte": min_date_created,} - } - } + query_filters.append( + f"created >= {min_date_created.replace('Z', '+00:00')}" ) if min_date_modified: - intrinsic_metadata_filters.append( - { - "range": { - get_expansion("date_modified", "."): {"gte": min_date_modified,} - } - } + query_filters.append( + f"modified >= {min_date_modified.replace('Z', '+00:00')}" ) if min_date_published: - intrinsic_metadata_filters.append( - { - "range": { - get_expansion("date_published", "."): { - "gte": min_date_published, - } - } - } - ) - - if intrinsic_metadata_filters: - query_clauses.append( - { - "nested": { - "path": "intrinsic_metadata", - "query": {"bool": {"must": intrinsic_metadata_filters,}}, - # "must" is equivalent to "AND" - # "should" is equivalent to "OR" - # Resulting origins must return true for the following: - # (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..) - # This is equivalent to {"must": [ - # {"should": [license_1,license_2] }, - # {"should": [lang_1,lang_2]}] } - # ]} - # Note: Usage of "bool" has been omitted for readability - } - } + query_filters.append( + f"published >= {min_date_published.replace('Z', '+00:00')}" ) if visit_types is not None: - query_clauses.append({"terms": {"visit_types": visit_types}}) + query_filters.append(f"visit_type = {to_raw(visit_types)}") + + combined_filters = f"({' and '.join(query_filters)})" + query = f"{combined_filters}{' and ' if query != '' else ' '}{query}" + parsed_query = self._translator.parse_query(query) + query_clauses.append(parsed_query["filters"]) + + field_map = { + "visits": "nb_visits", + "last_visit": "last_visit_date", + "last_eventful_visit": "last_eventful_visit_date", + "last_revision": "last_revision_date", + "last_release": "last_release_date", + "created": "date_created", + "modified": "date_modified", + "published": "date_published", + } + + if "sort_by" in parsed_query: + if sort_by is None: + sort_by = [] + for sort_by_option in parsed_query["sort_by"]: + if sort_by_option[0] == "-": + sort_by.append("-" + field_map[sort_by_option[1:]]) + else: + sort_by.append(field_map[sort_by_option]) + if parsed_query.get("limit", 0): + limit = parsed_query["limit"] sorting_params: List[Dict[str, Any]] = [] diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -268,6 +268,7 @@ def origin_search( self, *, + query: str = "", url_pattern: Optional[str] = None, metadata_pattern: Optional[str] = None, with_visit: bool = False, diff --git a/swh/search/interface.py b/swh/search/interface.py --- a/swh/search/interface.py +++ b/swh/search/interface.py @@ -65,6 +65,7 @@ def origin_search( self, *, + query: str = "", url_pattern: Optional[str] = None, metadata_pattern: Optional[str] = None, with_visit: bool = False, @@ -87,11 +88,12 @@ """Searches for origins matching the `url_pattern`. Args: + query: Find origins according the queries written as per the + swh-search query language syntax. url_pattern: Part of the URL to search for metadata_pattern: Keywords to look for (across all the fields of intrinsic_metadata) - with_visit: Whether origins with no visit are to be - filtered out + with_visit: Whether origins with no visits are to be filtered out visit_types: Only origins having any of the provided visit types (e.g. git, svn, pypi) will be returned min_nb_visits: Filter origins that have number of visits >= diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -1151,3 +1151,18 @@ result_page = self.search.origin_search(url_pattern="origin") assert result_page.next_page_token is None assert result_page.results == [] + + def test_filter_keyword_in_filter(self): + origin1 = { + "url": "foo language in ['foo baz'] bar", + } + self.search.origin_update([origin1]) + self.search.flush() + + result_page = self.search.origin_search(url_pattern="language in ['foo bar']") + assert result_page.next_page_token is None + assert result_page.results == [origin1] + + result_page = self.search.origin_search(url_pattern="baaz") + assert result_page.next_page_token is None + assert result_page.results == [] diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py --- a/swh/search/tests/test_translator.py +++ b/swh/search/tests/test_translator.py @@ -146,14 +146,14 @@ def test_keyword_filter(): - query = r"""keyword in [word1, "word2 \" ' word3"]""" + query = r"""keyword in [word1, "word2 \" \' word3"]""" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { - "query": r"""word1 word2 \" ' word3""", + "query": r"""word1 word2 \" \' word3""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), @@ -307,3 +307,46 @@ expected = {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}} _test_results(query, expected) + + +def test_keyword_no_escape_inside_filter(): + # any keyword (filter name/operator/value) inside a filter + # must be considered a string. + query = r'''origin = "language in [\'go lang\', python]"''' + expected = { + "filters": { + "multi_match": { + "query": r"""language in [\'go lang\', python]""", + "type": "bool_prefix", + "operator": "and", + "fields": [ + "url.as_you_type", + "url.as_you_type._2gram", + "url.as_you_type._3gram", + ], + } + } + } + _test_results(query, expected) + + +def test_escaped_punctutation_parsing(): + query = r"""keyword in ["foo \'\" bar"]""" + expected = { + "filters": { + "nested": { + "path": "intrinsic_metadata", + "query": { + "multi_match": { + "query": r"foo \'\" bar", + # FIXME: should be r"""foo '" bar""" + "fields": [ + get_expansion("keywords", ".") + "^2", + get_expansion("descriptions", "."), + ], + } + }, + } + } + } + _test_results(query, expected) diff --git a/swh/search/translator.py b/swh/search/translator.py --- a/swh/search/translator.py +++ b/swh/search/translator.py @@ -70,8 +70,10 @@ filters2 = self._traverse(node.children[2]) if conj_op == "and": + # "must" is equivalent to "AND" return {"bool": {"must": [filters1, filters2]}} if conj_op == "or": + # "should" is equivalent to "OR" return {"bool": {"should": [filters1, filters2]}} if node.type == "filter": @@ -104,13 +106,13 @@ value = self.query[start:end] if len(value) > 1 and ( - (value[0] == "'" and value[1] == "'") or (value[0] and value[-1] == '"') + (value[0] == "'" and value[-1] == "'") or (value[0] and value[-1] == '"') ): return value[1:-1] if node.type in ["number", "numberVal"]: return int(value) - + # TODO: Escape special characters (`'`, `"` and `\`) return value def _parse_filter(self, filter): @@ -145,9 +147,18 @@ "query": { "multi_match": { "query": value, + # Makes it so that the "foo bar" query returns + # documents which contain "foo" in a field and "bar" + # in a different field "type": "cross_fields", + # All keywords must be found in a document for it to + # be considered a match. + # TODO: allow missing keywords? "operator": "and", + # Searches on all fields of the intrinsic_metadata dict, + # recursively. "fields": ["intrinsic_metadata.*"], + # date{Created,Modified,Published} are of type date "lenient": True, } }, @@ -190,6 +201,9 @@ "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), + # "^2" boosts an origin's score by 2x + # if it the queried keywords are + # found in its intrinsic_metadata.keywords ], } }, diff --git a/swh/search/utils.py b/swh/search/utils.py --- a/swh/search/utils.py +++ b/swh/search/utils.py @@ -55,3 +55,31 @@ return True except Exception: return False + + +def to_raw(obj): + r"""Makes the object directly injectable into the + query language. + + For strings, appends \ before special characters like ', ", and \ + + For arrays, applies the same transformation on each element, joins the + elements and returns a string-like representation of the list. + + >>> to_raw("foo ' bar") + '"foo \\\' bar"' + + >>> to_raw([r"foo ' bar", r"bar \\\' baz", r'foo " baz']) + '["foo \\\' bar", "bar \\\\\\\' baz", "foo \\" baz"]' + """ + if type(obj) == list: + items = [to_raw(item) for item in obj] + return "[" + ", ".join(items) + "]" + elif type(obj) == str: + return ( + '"' + + obj.translate({ord("'"): r"\'", ord('"'): r"\"", ord("\\"): r"\\",}) + + '"' + ) + else: + raise Exception(f"Unexpected item type {type(obj)}")