diff --git a/query_language/grammar.js b/query_language/grammar.js
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -156,7 +156,7 @@
             token.immediate(/[^\\"\n]+/),
             $.escape_sequence
         )),
-        singleWord: $ => /[^\s"'\[\]\(\)]+/,
+        singleWord: $ => /[^\s"'\[\]\(\),]+/,
         escape_sequence: $ => token.immediate(seq(
             '\\',
             /(\"|\'|\\|\/|b|n|r|t|u)/
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
new file mode 100644
--- /dev/null
+++ b/swh/search/tests/test_translator.py
@@ -0,0 +1,309 @@
+import pytest
+
+from swh.search.translator import Translator
+from swh.search.utils import get_expansion
+
+
+def _test_results(query, expected):
+    output = Translator().parse_query(query)
+    assert output == expected
+
+
+def test_empty_query():
+    query = ""
+    with pytest.raises(Exception):
+        _test_results(query, {})
+
+
+def test_conjunction_operators():
+    query = "visited = true or visits > 2 and visits < 5"
+    expected = {
+        "filters": {
+            "bool": {
+                "should": [
+                    {"term": {"has_visits": True}},
+                    {
+                        "bool": {
+                            "must": [
+                                {"range": {"nb_visits": {"gt": 2}}},
+                                {"range": {"nb_visits": {"lt": 5}}},
+                            ]
+                        }
+                    },
+                ]
+            }
+        }
+    }
+    _test_results(query, expected)
+
+
+def test_conjunction_op_precedence_override():
+    query = "(visited = false or visits > 2) and visits < 5"
+    expected = {
+        "filters": {
+            "bool": {
+                "must": [
+                    {
+                        "bool": {
+                            "should": [
+                                {"term": {"has_visits": False}},
+                                {"range": {"nb_visits": {"gt": 2}}},
+                            ]
+                        }
+                    },
+                    {"range": {"nb_visits": {"lt": 5}}},
+                ]
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_limit_and_sortby():
+    query = "visited = true sort_by = [-visits,last_visit] limit = 15"
+    expected = {
+        "filters": {"term": {"has_visits": True}},
+        "sortBy": ["-visits", "last_visit"],
+        "limit": 15,
+    }
+
+    _test_results(query, expected)
+
+
+def test_deeply_nested_filters():
+    query = "(((visited = true and visits > 0)))"
+    expected = {
+        "filters": {
+            "bool": {
+                "must": [
+                    {"term": {"has_visits": True},},
+                    {"range": {"nb_visits": {"gt": 0}}},
+                ]
+            }
+        },
+    }
+
+    _test_results(query, expected)
+
+
+def test_origin_and_metadata_filters():
+    query = 'origin = django or metadata = "framework and web"'
+    expected = {
+        "filters": {
+            "bool": {
+                "should": [
+                    {
+                        "multi_match": {
+                            "query": "django",
+                            "type": "bool_prefix",
+                            "operator": "and",
+                            "fields": [
+                                "url.as_you_type",
+                                "url.as_you_type._2gram",
+                                "url.as_you_type._3gram",
+                            ],
+                        }
+                    },
+                    {
+                        "nested": {
+                            "path": "intrinsic_metadata",
+                            "query": {
+                                "multi_match": {
+                                    "query": "framework and web",
+                                    "type": "cross_fields",
+                                    "operator": "and",
+                                    "fields": ["intrinsic_metadata.*"],
+                                    "lenient": True,
+                                }
+                            },
+                        }
+                    },
+                ]
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_visits_not_equal_to_filter():
+    query = "visits != 5"
+    expected = {
+        "filters": {
+            "bool": {"must_not": [{"range": {"nb_visits": {"gte": 5, "lte": 5}}},]}
+        },
+    }
+
+    _test_results(query, expected)
+
+
+def test_visit_type_filter():
+    query = 'visit_type = [git,"pypi"]'
+    expected = {"filters": {"terms": {"visit_types": ["git", "pypi"]}}}
+
+    _test_results(query, expected)
+
+
+def test_keyword_filter():
+    query = 'keyword in [word1, "word2 \\" \' word3"]'
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "multi_match": {
+                        "query": "word1 word2 \\\" ' word3",
+                        "fields": [
+                            get_expansion("keywords", ".") + "^2",
+                            get_expansion("descriptions", "."),
+                        ],
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_language_filter():
+    query = 'language in [python, "go lang", cpp]'
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "bool": {
+                        "should": [
+                            {
+                                "match": {
+                                    get_expansion(
+                                        "programming_languages", "."
+                                    ): "python"
+                                }
+                            },
+                            {
+                                "match": {
+                                    get_expansion(
+                                        "programming_languages", "."
+                                    ): "go lang"
+                                }
+                            },
+                            {
+                                "match": {
+                                    get_expansion("programming_languages", "."): "cpp"
+                                }
+                            },
+                        ]
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_license_filter():
+    query = 'license in ["GPL 3", Apache, MIT]'
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "bool": {
+                        "should": [
+                            {"match": {get_expansion("licenses", "."): "GPL 3"}},
+                            {"match": {get_expansion("licenses", "."): "Apache"}},
+                            {"match": {get_expansion("licenses", "."): "MIT"}},
+                        ]
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_date_created_not_equal_to_filter():
+    query = "created != 2020-01-01"
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "bool": {
+                        "must_not": [
+                            {
+                                "range": {
+                                    get_expansion("date_created", "."): {
+                                        "gte": "2020-01-01",
+                                        "lte": "2020-01-01",
+                                    }
+                                }
+                            }
+                        ]
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_date_created_greater_than_filter():
+    query = "created >= 2020-01-01"
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "bool": {
+                        "must": [
+                            {
+                                "range": {
+                                    get_expansion("date_created", "."): {
+                                        "gte": "2020-01-01",
+                                    }
+                                }
+                            }
+                        ]
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_last_eventful_visit_not_equal_to_filter():
+    query = "last_visit != 2020-01-01"
+    expected = {
+        "filters": {
+            "bool": {
+                "must_not": [
+                    {
+                        "range": {
+                            "last_visit_date": {
+                                "gte": "2020-01-01",
+                                "lte": "2020-01-01",
+                            }
+                        }
+                    }
+                ]
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_last_eventful_visit_less_than_to_filter():
+    query = "last_visit < 2020-01-01"
+    expected = {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}}
+
+    _test_results(query, expected)
diff --git a/swh/search/translator.py b/swh/search/translator.py
new file mode 100644
--- /dev/null
+++ b/swh/search/translator.py
@@ -0,0 +1,326 @@
+import os
+
+from pkg_resources import resource_filename
+from tree_sitter import Language, Parser
+
+from swh.search.utils import get_expansion
+
+
+class Translator:
+    """Translate a search query string into an Elasticsearch-style query dict.
+
+    The query is parsed with the tree-sitter grammar loaded from ``swh_ql.so``;
+    the syntax tree is then walked to build the dict returned by
+    :meth:`parse_query`.
+    """
+
+    RANGE_OPERATOR_MAP = {
+        ">": "gt",
+        "<": "lt",
+        ">=": "gte",
+        "<=": "lte",
+    }
+
+    def __init__(self):
+        ql_rel_paths = [
+            "swh_ql.so",  # installed
+            "../../query_language/swh_ql.so",  # development
+        ]
+        for ql_rel_path in ql_rel_paths:
+            ql_path = resource_filename("swh.search", ql_rel_path)
+            if os.path.exists(ql_path):
+                break
+        else:
+            # Raise explicitly: an ``assert`` is stripped under ``python -O``,
+            # which would let a non-existent path reach ``Language()``.
+            raise FileNotFoundError(f"swh_ql.so not found in {ql_rel_paths}")
+
+        search_ql = Language(ql_path, "swh_search_ql")
+
+        self.parser = Parser()
+        self.parser.set_language(search_ql)
+        self.query = ""
+        self.query_bytes = b""
+
+    def parse_query(self, query):
+        """Parse ``query`` and return its translation as a dict.
+
+        Raises:
+            Exception: if the parsed tree contains a syntax error.
+        """
+        self.query = query
+        # Node byte offsets refer to this encoded buffer, so keep it around
+        # for ``_get_value``.
+        self.query_bytes = query.encode("utf8")
+        tree = self.parser.parse(self.query_bytes)
+        self.query_node = tree.root_node
+
+        if self.query_node.has_error:
+            raise Exception("Invalid query")
+
+        return self._traverse(self.query_node)
+
+    def _traverse(self, node):
+        """Recursively translate ``node`` and its descendants.
+
+        Raises:
+            Exception: if ``node`` matches none of the shapes handled below.
+        """
+        if len(node.children) == 3 and node.children[1].type == "filters":
+            # filters => ( filters )
+            return self._traverse(node.children[1])  # Go past the () brackets
+        if node.type == "query":
+            result = {}
+            for child in node.children:
+                # query => filters sort_by limit
+                result[child.type] = self._traverse(child)
+
+            return result
+
+        if node.type == "filters":
+            if len(node.children) == 1:
+                # query => filters
+                # filters => filters
+                # filters => filter
+                # Current node is just a wrapper, so go one level deep
+                return self._traverse(node.children[0])
+
+            if len(node.children) == 3:
+                # filters => filters conj_op filters
+                filters1 = self._traverse(node.children[0])
+                conj_op = self._get_value(node.children[1])
+                filters2 = self._traverse(node.children[2])
+
+                if conj_op == "and":
+                    return {"bool": {"must": [filters1, filters2]}}
+                if conj_op == "or":
+                    return {"bool": {"should": [filters1, filters2]}}
+
+        if node.type == "filter":
+            filter_category = node.children[0]
+            return self._parse_filter(filter_category)
+
+        if node.type == "sortBy":
+            return self._parse_filter(node)
+
+        if node.type == "limit":
+            return self._parse_filter(node)
+
+        # Raise rather than return: a returned exception object would be
+        # embedded in the translated query as if it were a valid result.
+        raise Exception(
+            f"Unknown node type ({node.type}) "
+            f"or unexpected number of children ({node.children})"
+        )
+
+    def _get_value(self, node):
+        """Return the Python value represented by ``node``.
+
+        Bracketed arrays become lists of their named children's values, quoted
+        strings lose their surrounding quotes, and ``number``/``numberVal``
+        nodes become ``int``; anything else is returned as raw source text.
+        """
+        if (
+            len(node.children) > 0
+            and node.children[0].type == "["
+            and node.children[-1].type == "]"
+        ):
+            # array
+            return [self._get_value(child) for child in node.children if child.is_named]
+
+        # Slice the parsed buffer by byte offsets. Columns (``start_point[1]``)
+        # restart on every line and count bytes, so indexing the ``str`` with
+        # them broke multi-line and non-ASCII queries.
+        value = self.query_bytes[node.start_byte : node.end_byte].decode("utf8")
+
+        if len(value) > 1 and value[0] == value[-1] and value[0] in ("'", '"'):
+            # Quoted string: both ends must be the same quote character
+            return value[1:-1]
+
+        if node.type in ["number", "numberVal"]:
+            return int(value)
+
+        return value
+
+    def _parse_filter(self, filter):
+        """Translate a three-child ``name op value`` node into a query clause.
+
+        ``sortBy`` and ``limit`` nodes simply yield their parsed value.
+
+        Raises:
+            Exception: if the filter category/name pair is not handled.
+        """
+        if filter.type == "boundedListFilter":
+            filter = filter.children[0]
+
+        children = filter.children
+        assert len(children) == 3
+
+        category = filter.type
+        name, op, value = [self._get_value(child) for child in children]
+
+        if category == "patternFilter":
+            if name == "origin":
+                return {
+                    "multi_match": {
+                        "query": value,
+                        "type": "bool_prefix",
+                        "operator": "and",
+                        "fields": [
+                            "url.as_you_type",
+                            "url.as_you_type._2gram",
+                            "url.as_you_type._3gram",
+                        ],
+                    }
+                }
+            elif name == "metadata":
+                return {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "multi_match": {
+                                "query": value,
+                                "type": "cross_fields",
+                                "operator": "and",
+                                "fields": ["intrinsic_metadata.*"],
+                                "lenient": True,
+                            }
+                        },
+                    }
+                }
+
+        if category == "booleanFilter":
+            if name == "visited":
+                return {"term": {"has_visits": value == "true"}}
+
+        if category == "numericFilter":
+            if name == "visits":
+                if op in ["=", "!="]:
+                    return {
+                        "bool": {
+                            ("must" if op == "=" else "must_not"): [
+                                {"range": {"nb_visits": {"gte": value, "lte": value}}}
+                            ]
+                        }
+                    }
+                else:
+                    return {
+                        "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}}
+                    }
+
+        if category == "visitTypeFilter":
+            if name == "visit_type":
+                return {"terms": {"visit_types": value}}
+
+        if category == "unboundedListFilter":
+            value_array = value
+
+            if name == "keyword":
+                return {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "multi_match": {
+                                "query": " ".join(value_array),
+                                "fields": [
+                                    get_expansion("keywords", ".") + "^2",
+                                    get_expansion("descriptions", "."),
+                                ],
+                            }
+                        },
+                    }
+                }
+            elif name in ["language", "license"]:
+                name_mapping = {
+                    "language": "programming_languages",
+                    "license": "licenses",
+                }
+                name = name_mapping[name]
+
+                return {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "bool": {
+                                "should": [
+                                    {"match": {get_expansion(name, "."): val}}
+                                    for val in value_array
+                                ],
+                            }
+                        },
+                    }
+                }
+
+        if category == "dateFilter":
+
+            if name in ["created", "modified", "published"]:
+                if op in ["=", "!="]:
+                    return {
+                        "nested": {
+                            "path": "intrinsic_metadata",
+                            "query": {
+                                "bool": {
+                                    ("must" if op == "=" else "must_not"): [
+                                        {
+                                            "range": {
+                                                get_expansion(f"date_{name}", "."): {
+                                                    "gte": value,
+                                                    "lte": value,
+                                                }
+                                            }
+                                        }
+                                    ],
+                                }
+                            },
+                        }
+                    }
+
+                return {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "bool": {
+                                "must": [
+                                    {
+                                        "range": {
+                                            get_expansion(f"date_{name}", "."): {
+                                                self.RANGE_OPERATOR_MAP[op]: value,
+                                            }
+                                        }
+                                    }
+                                ],
+                            }
+                        },
+                    }
+                }
+            else:
+                if op in ["=", "!="]:
+                    return {
+                        "bool": {
+                            ("must" if op == "=" else "must_not"): [
+                                {
+                                    "range": {
+                                        f"{name}_date": {"gte": value, "lte": value,}
+                                    }
+                                }
+                            ],
+                        }
+                    }
+                # NOTE(review): only this branch rewrites a trailing "Z" into
+                # "+00:00"; the =/!= branch above passes the value through
+                # unchanged -- confirm whether that asymmetry is intended.
+                return {
+                    "range": {
+                        f"{name}_date": {
+                            self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"),
+                        }
+                    }
+                }
+
+        if category == "sortBy":
+            return value
+
+        if category == "limit":
+            return value
+
+        raise Exception(f"Unknown filter {category}.{name}")