diff --git a/query_language/grammar.js b/query_language/grammar.js
index aa94745..1d397c6 100644
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -1,200 +1,200 @@
 // Copyright (C) 2019-2021 The Software Heritage developers
 // See the AUTHORS file at the top-level directory of this distribution
 // License: GNU General Public License version 3, or any later version
 // See top-level LICENSE file for more information
 
 
 const PRECEDENCE = {
     or: 2,
     and: 3,
     bracket: 4,
 }
 
 module.exports = grammar({
     name: 'swh_search_ql',
 
     rules: {
         query: $ => seq($.filters, optional($.sortBy) ,optional($.limit)),
 
         filters: $ => choice(
             prec.left(PRECEDENCE.and,
                 seq(
                     field('left', $.filters),
                     field('operator', $.and),
                     field('right', $.filters),
                 )
             ),
             prec.left(PRECEDENCE.or,
                 seq(
                     field('left', $.filters),
                     field('operator', $.or),
                     field('right', $.filters),
                 )
             ),
             prec.left(PRECEDENCE.bracket,
                 seq("(", $.filters, ")"),
             ),
             $.filter
         ),
 
         sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
         sortByField: $ => token('sort_by'),
         sortByOp: $ => $.equalOp,
         sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
         sortByOptions: $ => seq(optional(token.immediate('-')) ,choice(
             'visits',
             'last_visit',
             'last_eventful_visit',
             'last_revision',
             'last_release',
             'created',
             'modified',
             'published'
         )),
 
         limit: $ => seq('limit', $.equalOp, $.number),
 
         filter: $ => choice(
             $.patternFilter,
             $.booleanFilter,
             $.numericFilter,
             $.boundedListFilter,
             $.unboundedListFilter,
             $.dateFilter
         ),
 
         patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal),
         patternField: $ => token(choice('origin', 'metadata')),
         patternOp: $ => $.equalOp,
         patternVal: $ => $.string,
 
         booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal),
         booleanField: $ => token(choice('visited')),
         booleanOp: $ => $.equalOp,
         booleanVal: $ => choice($.booleanTrue, $.booleanFalse),
 
         numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal),
         numericField: $ => token(choice('visits')),
         numericOp: $ => $.rangeOp,
         numberVal: $ => $.number,
 
         boundedListFilter: $ => choice($.visitTypeFilter),
 
         visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal),
         visitTypeField: $ => token(choice('visit_type')),
         visitTypeOp: $ => $.equalOp,
         visitTypeVal: $ => createArray(optionalWrapWith($.visitTypeOptions, ["'", '"'])),
         visitTypeOptions: $ => choice(
             "any",
             "cran",
             "deb",
             "deposit",
             "ftp",
             "hg",
             "git",
             "nixguix",
             "npm",
             "pypi",
             "svn",
             "tar"
         ), // TODO: fetch this list dynamically from other swh services?
 
         sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
         sortByField: $ => token(choice('sort_by')),
         sortByOp: $ => $.equalOp,
         sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
         sortByOptions: $ => seq(
             optional(token.immediate('-')),
             choice(
                 'visits',
                 'last_visit',
                 'last_eventful_visit',
                 'last_revision',
                 'last_release',
                 'created',
                 'modified',
                 'published'
             )),
 
         unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
         listField: $ => token(choice('language', 'license', 'keyword')),
         listOp: $ => $.choiceOp,
         listVal: $ => createArray($.string),
 
 
         dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal),
         dateField: $ => token(choice(
             'last_visit',
             'last_eventful_visit',
             'last_revision',
             'last_release',
             'created',
             'modified',
             'published'
         )),
         dateOp: $ => $.rangeOp,
         dateVal: $ => $.isoDateTime,
 
         limit: $ => seq('limit', $.equalOp, $.number),
 
 
         rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')),
         equalOp: $ => token('='),
         choiceOp: $ => token(choice('in', 'not in')),
 
         isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/,
 
         string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord),
         number: $ => /\d+/,
         booleanTrue: $ => "true",
         booleanFalse: $ => "false",
 
         or: $ => "or",
         and: $ => "and",
 
         stringContent: $ => repeat1(choice(
             token.immediate(/[^\\"\n]+/),
             $.escape_sequence
         )),
 
-        singleWord: $ => /[^\s"'\[\]\(\)]+/,
+        singleWord: $ => /[^\s"'\[\]\(\),]+/,
         escape_sequence: $ => token.immediate(seq(
             '\\',
             /(\"|\'|\\|\/|b|n|r|t|u)/
         )),
 
     }
 });
 
 function joinBySep1(rule, sep) {
     // At least one repetition of the rule separated by `sep`
     return seq(rule, repeat(seq(sep, optional(rule))))
 }
 
 function joinBySep(rule, sep = ",") {
     // Any number of repetitions of the rule separated by `sep`
     return optional(joinBySep1(rule, sep))
 }
 
 function createArray(rule) {
     // An array having `rule` as its member
     return seq(
         "[",
         joinBySep(
             field('array_member', rule),
             ","
         ),
         "]"
     )
 }
 
 function wrapWith(rule, wrappers = ["'", '"']) {
     // The rule must be wrapped with one of the wrappers
     const wrappedRules = wrappers.map(wrapper => seq(wrapper, rule, wrapper))
     return choice(...wrappedRules)
 }
 
 function optionalWrapWith(rule, wrappers = ["'", '"']) {
     // The rule may or may not be wrapped with the wrappers
     return choice(wrapWith(rule, wrappers), rule)
 }
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
new file mode 100644
index 0000000..2c85d1c
--- /dev/null
+++ b/swh/search/tests/test_translator.py
@@ -0,0 +1,352 @@
+import pytest
+
+from swh.search.translator import Translator
+from swh.search.utils import get_expansion
+
+
+def _test_results(query, expected):
+    output = Translator().parse_query(query)
+    assert output == expected
+
+
+def test_empty_query():
+    query = ""
+    with pytest.raises(Exception):
+        _test_results(query, {})
+
+
+def test_conjunction_operators():
+    query = "visited = true or visits > 2 and visits < 5"
+    expected = {
+        "filters": {
+            "bool": {
+                "should": [
+                    {"term": {"has_visits": True}},
+                    {
+                        "bool": {
+                            "must": [
+                                {"range": {"nb_visits": {"gt": 2}}},
+                                {"range": {"nb_visits": {"lt": 5}}},
+                            ]
+                        }
+                    },
+                ]
+            }
+        }
+    }
+    _test_results(query, expected)
+
+
+def test_conjunction_op_precedence_override():
+    query = "(visited = false or visits > 2) and visits < 5"
+    expected = {
+        "filters": {
+            "bool": {
+                "must": [
+                    {
+                        "bool": {
+                            "should": [
+                                {"term": {"has_visits": False}},
+                                {"range": {"nb_visits": {"gt": 2}}},
+                            ]
+                        }
+                    },
+                    {"range": {"nb_visits": {"lt": 5}}},
+                ]
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_limit_and_sortby():
+    query = "visited = true sort_by = [-visits,last_visit] limit = 15"
+    expected = {
+        "filters": {"term": {"has_visits": True}},
+        "sortBy": ["-visits", "last_visit"],
+        "limit": 15,
+    }
+
+    _test_results(query, expected)
+
+
+def test_deeply_nested_filters():
+    query = "(((visited = true and visits > 0)))"
+    expected = {
+        "filters": {
+            "bool": {
+                "must": [
+                    {"term": {"has_visits": True},},
+                    {"range": {"nb_visits": {"gt": 0}}},
+                ]
+            }
+        },
+    }
+
+    _test_results(query, expected)
+
+
+def test_origin_and_metadata_filters():
+    query = 'origin = django or metadata = "framework and web"'
+    expected = {
+        "filters": {
+            "bool": {
+                "should": [
+                    {
+                        "multi_match": {
+                            "query": "django",
+                            "type": "bool_prefix",
+                            "operator": "and",
+                            "fields": [
+                                "url.as_you_type",
+                                "url.as_you_type._2gram",
+                                "url.as_you_type._3gram",
+                            ],
+                        }
+                    },
+                    {
+                        "nested": {
+                            "path": "intrinsic_metadata",
+                            "query": {
+                                "multi_match": {
+                                    "query": "framework and web",
+                                    "type": "cross_fields",
+                                    "operator": "and",
+                                    "fields": ["intrinsic_metadata.*"],
+                                    "lenient": True,
+                                }
+                            },
+                        }
+                    },
+                ]
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_visits_not_equal_to_filter():
+    query = "visits != 5"
+    expected = {
+        "filters": {
+            "bool": {"must_not": [{"range": {"nb_visits": {"gte": 5, "lte": 5}}},]}
+        },
+    }
+
+    _test_results(query, expected)
+
+
+def test_visit_type_filter():
+    query = 'visit_type = [git,"pypi"]'
+    expected = {"filters": {"terms": {"visit_types": ["git", "pypi"]}}}
+
+    _test_results(query, expected)
+
+
+def test_keyword_filter():
+    query = r"""keyword in [word1, "word2 \" ' word3"]"""
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "multi_match": {
+                        "query": r"""word1 word2 \" ' word3""",
+                        "fields": [
+                            get_expansion("keywords", ".") + "^2",
+                            get_expansion("descriptions", "."),
+                        ],
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_language_filter():
+    query = 'language in [python, "go lang", cpp]'
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "bool": {
+                        "should": [
+                            {
+                                "match": {
+                                    get_expansion(
+                                        "programming_languages", "."
+                                    ): "python"
+                                }
+                            },
+                            {
+                                "match": {
+                                    get_expansion(
+                                        "programming_languages", "."
+                                    ): "go lang"
+                                }
+                            },
+                            {
+                                "match": {
+                                    get_expansion("programming_languages", "."): "cpp"
+                                }
+                            },
+                        ]
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_license_filter():
+    query = 'license in ["GPL 3", Apache, MIT]'
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "bool": {
+                        "should": [
+                            {"match": {get_expansion("licenses", "."): "GPL 3"}},
+                            {"match": {get_expansion("licenses", "."): "Apache"}},
+                            {"match": {get_expansion("licenses", "."): "MIT"}},
+                        ]
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_date_created_not_equal_to_filter():
+    query = "created != 2020-01-01"
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "bool": {
+                        "must_not": [
+                            {
+                                "range": {
+                                    get_expansion("date_created", "."): {
+                                        "gte": "2020-01-01",
+                                        "lte": "2020-01-01",
+                                    }
+                                }
+                            }
+                        ]
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_date_created_greater_than_filter():
+    query = "created >= 2020-01-01"
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "bool": {
+                        "must": [
+                            {
+                                "range": {
+                                    get_expansion("date_created", "."): {
+                                        "gte": "2020-01-01",
+                                    }
+                                }
+                            }
+                        ]
+                    }
+                },
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_last_eventful_visit_not_equal_to_filter():
+    query = "last_visit != 2020-01-01"
+    expected = {
+        "filters": {
+            "bool": {
+                "must_not": [
+                    {
+                        "range": {
+                            "last_visit_date": {
+                                "gte": "2020-01-01",
+                                "lte": "2020-01-01",
+                            }
+                        }
+                    }
+                ]
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_last_eventful_visit_less_than_to_filter():
+    query = "last_visit < 2020-01-01"
+    expected = {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}}
+
+    _test_results(query, expected)
+
+
+def test_multiline_query():
+    query = "visited = true\nand visits > 2"
+    expected = {
+        "filters": {
+            "bool": {
+                "must": [
+                    {"term": {"has_visits": True}},
+                    {"range": {"nb_visits": {"gt": 2}}},
+                ]
+            }
+        }
+    }
+
+    _test_results(query, expected)
+
+
+def test_non_ascii_query():
+    query = 'origin = "café" and visited = true'
+    expected = {
+        "filters": {
+            "bool": {
+                "must": [
+                    {
+                        "multi_match": {
+                            "query": "café",
+                            "type": "bool_prefix",
+                            "operator": "and",
+                            "fields": [
+                                "url.as_you_type",
+                                "url.as_you_type._2gram",
+                                "url.as_you_type._3gram",
+                            ],
+                        }
+                    },
+                    {"term": {"has_visits": True}},
+                ]
+            }
+        }
+    }
+
+    _test_results(query, expected)
diff --git a/swh/search/translator.py b/swh/search/translator.py
new file mode 100644
index 0000000..5c2b9d7
--- /dev/null
+++ b/swh/search/translator.py
@@ -0,0 +1,323 @@
+import os
+
+from pkg_resources import resource_filename
+from tree_sitter import Language, Parser
+
+from swh.search.utils import get_expansion
+
+
+class Translator:
+    """Translate a search query string into a dict of query clauses.
+
+    The query is parsed with the ``swh_search_ql`` tree-sitter grammar and the
+    resulting syntax tree is converted node by node.
+    """
+
+    RANGE_OPERATOR_MAP = {
+        ">": "gt",
+        "<": "lt",
+        ">=": "gte",
+        "<=": "lte",
+    }
+
+    def __init__(self):
+        """Load the compiled grammar and set up the tree-sitter parser."""
+        ql_rel_paths = [
+            "swh_ql.so",  # installed
+            "../../query_language/swh_ql.so",  # development
+        ]
+        for ql_rel_path in ql_rel_paths:
+            ql_path = resource_filename("swh.search", ql_rel_path)
+            if os.path.exists(ql_path):
+                break
+        else:
+            # Raise explicitly: an ``assert`` statement is skipped under "python -O"
+            raise AssertionError("swh_ql.so was not found in any of the expected paths")
+
+        search_ql = Language(ql_path, "swh_search_ql")
+
+        self.parser = Parser()
+        self.parser.set_language(search_ql)
+        self.query = ""
+        self.query_bytes = b""
+
+    def parse_query(self, query):
+        """Parse ``query`` and return its translation as a dict.
+
+        The result maps the type of each top-level node of the query
+        ("filters", "sortBy", "limit") to its translated value.
+
+        Raises:
+            Exception: if the query does not match the grammar.
+        """
+        self.query = query
+        # Keep the encoded form: tree-sitter reports node positions in bytes
+        self.query_bytes = query.encode("utf8")
+        tree = self.parser.parse(self.query_bytes)
+        self.query_node = tree.root_node
+
+        if self.query_node.has_error:
+            raise Exception("Invalid query")
+
+        return self._traverse(self.query_node)
+
+    def _traverse(self, node):
+        """Recursively translate ``node`` and its children.
+
+        Raises:
+            Exception: if the node type or its number of children is unexpected.
+        """
+        if len(node.children) == 3 and node.children[1].type == "filters":
+            # filters => ( filters )
+            return self._traverse(node.children[1])  # Go past the () brackets
+        if node.type == "query":
+            result = {}
+            for child in node.children:
+                # query => filters sort_by limit
+                result[child.type] = self._traverse(child)
+
+            return result
+
+        if node.type == "filters":
+            if len(node.children) == 1:
+                # query => filters
+                # filters => filters
+                # filters => filter
+                # Current node is just a wrapper, so go one level deep
+                return self._traverse(node.children[0])
+
+            if len(node.children) == 3:
+                # filters => filters conj_op filters
+                filters1 = self._traverse(node.children[0])
+                conj_op = self._get_value(node.children[1])
+                filters2 = self._traverse(node.children[2])
+
+                if conj_op == "and":
+                    return {"bool": {"must": [filters1, filters2]}}
+                if conj_op == "or":
+                    return {"bool": {"should": [filters1, filters2]}}
+
+        if node.type == "filter":
+            filter_category = node.children[0]
+            return self._parse_filter(filter_category)
+
+        if node.type == "sortBy":
+            return self._parse_filter(node)
+
+        if node.type == "limit":
+            return self._parse_filter(node)
+
+        raise Exception(
+            f"Unknown node type ({node.type}) "
+            f"or unexpected number of children ({node.children})"
+        )
+
+    def _get_value(self, node):
+        """Return the value held by ``node``.
+
+        Arrays become lists, numbers become ints, anything else is the node's
+        source text with one pair of surrounding quotes removed.
+        """
+        if (
+            len(node.children) > 0
+            and node.children[0].type == "["
+            and node.children[-1].type == "]"
+        ):
+            # array
+            return [self._get_value(child) for child in node.children if child.is_named]
+
+        # Slice the encoded query by absolute byte offsets. Column numbers are
+        # relative to the node's row, so they break on multi-line queries, and
+        # they count bytes, so they break on non-ASCII text.
+        value = self.query_bytes[node.start_byte : node.end_byte].decode("utf8")
+
+        if len(value) > 1 and value[0] == value[-1] and value[0] in ("'", '"'):
+            # Wrapped in a matching pair of quotes
+            return value[1:-1]
+
+        if node.type in ["number", "numberVal"]:
+            return int(value)
+
+        return value
+
+    def _parse_filter(self, filter):
+        """Translate a ``<field> <operator> <value>`` node.
+
+        Handles the filter categories as well as the ``sortBy`` and ``limit``
+        nodes, which have the same three-children shape.
+
+        Raises:
+            Exception: if the category/field combination is not handled.
+        """
+
+        if filter.type == "boundedListFilter":
+            filter = filter.children[0]
+
+        children = filter.children
+        assert len(children) == 3
+
+        category = filter.type
+        name, op, value = [self._get_value(child) for child in children]
+
+        if category == "patternFilter":
+            if name == "origin":
+                return {
+                    "multi_match": {
+                        "query": value,
+                        "type": "bool_prefix",
+                        "operator": "and",
+                        "fields": [
+                            "url.as_you_type",
+                            "url.as_you_type._2gram",
+                            "url.as_you_type._3gram",
+                        ],
+                    }
+                }
+            elif name == "metadata":
+                return {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "multi_match": {
+                                "query": value,
+                                "type": "cross_fields",
+                                "operator": "and",
+                                "fields": ["intrinsic_metadata.*"],
+                                "lenient": True,
+                            }
+                        },
+                    }
+                }
+
+        if category == "booleanFilter":
+            if name == "visited":
+                return {"term": {"has_visits": value == "true"}}
+
+        if category == "numericFilter":
+            if name == "visits":
+                if op in ["=", "!="]:
+                    return {
+                        "bool": {
+                            ("must" if op == "=" else "must_not"): [
+                                {"range": {"nb_visits": {"gte": value, "lte": value}}}
+                            ]
+                        }
+                    }
+                else:
+                    return {
+                        "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}}
+                    }
+
+        if category == "visitTypeFilter":
+            if name == "visit_type":
+                return {"terms": {"visit_types": value}}
+
+        if category == "unboundedListFilter":
+            value_array = value
+
+            if name == "keyword":
+                return {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "multi_match": {
+                                "query": " ".join(value_array),
+                                "fields": [
+                                    get_expansion("keywords", ".") + "^2",
+                                    get_expansion("descriptions", "."),
+                                ],
+                            }
+                        },
+                    }
+                }
+            elif name in ["language", "license"]:
+                name_mapping = {
+                    "language": "programming_languages",
+                    "license": "licenses",
+                }
+                name = name_mapping[name]
+
+                return {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "bool": {
+                                "should": [
+                                    {"match": {get_expansion(name, "."): val}}
+                                    for val in value_array
+                                ],
+                            }
+                        },
+                    }
+                }
+
+        if category == "dateFilter":
+
+            if name in ["created", "modified", "published"]:
+                if op in ["=", "!="]:
+                    return {
+                        "nested": {
+                            "path": "intrinsic_metadata",
+                            "query": {
+                                "bool": {
+                                    ("must" if op == "=" else "must_not"): [
+                                        {
+                                            "range": {
+                                                get_expansion(f"date_{name}", "."): {
+                                                    "gte": value,
+                                                    "lte": value,
+                                                }
+                                            }
+                                        }
+                                    ],
+                                }
+                            },
+                        }
+                    }
+
+                return {
+                    "nested": {
+                        "path": "intrinsic_metadata",
+                        "query": {
+                            "bool": {
+                                "must": [
+                                    {
+                                        "range": {
+                                            get_expansion(f"date_{name}", "."): {
+                                                self.RANGE_OPERATOR_MAP[op]: value,
+                                            }
+                                        }
+                                    }
+                                ],
+                            }
+                        },
+                    }
+                }
+            else:
+                if op in ["=", "!="]:
+                    return {
+                        "bool": {
+                            ("must" if op == "=" else "must_not"): [
+                                {
+                                    "range": {
+                                        f"{name}_date": {"gte": value, "lte": value,}
+                                    }
+                                }
+                            ],
+                        }
+                    }
+                return {
+                    "range": {
+                        f"{name}_date": {
+                            self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"),
+                        }
+                    }
+                }
+
+        if category == "sortBy":
+            return value
+
+        if category == "limit":
+            return value
+
+        raise Exception(f"Unknown filter {category}.{name}")