diff --git a/swh/search/query_language/grammar.js b/swh/search/query_language/grammar.js index eb901f1..7dca22d 100644 --- a/swh/search/query_language/grammar.js +++ b/swh/search/query_language/grammar.js @@ -1,193 +1,193 @@ // Copyright (C) 2021 The Software Heritage developers // See the AUTHORS file at the top-level directory of this distribution // License: GNU General Public License version 3, or any later version // See top-level LICENSE file for more information const { visitTypeField, sortByField, limitField } = require("./tokens.js"); const { patternFields, booleanFields, numericFields, listFields, dateFields } = require("./tokens.js"); const { equalOp, containOp, rangeOp, choiceOp } = require("./tokens.js"); const { sortByOptions, visitTypeOptions } = require("./tokens.js"); const { OR, AND, TRUE, FALSE } = require("./tokens.js"); const PRECEDENCE = { or: 2, and: 3, bracket: 4, } module.exports = grammar({ name: 'swh_search_ql', rules: { query: $ => seq( $.filters, optional(seq( optional($.and), choice( seq($.sortBy, optional($.and), optional($.limit)), seq($.limit, optional($.and), optional($.sortBy)), ), )) ), filters: $ => choice( prec.left(PRECEDENCE.and, seq( field('left', $.filters), field('operator', $.and), field('right', $.filters), ) ), prec.left(PRECEDENCE.or, seq( field('left', $.filters), field('operator', $.or), field('right', $.filters), ) ), prec.left(PRECEDENCE.bracket, seq("(", $.filters, ")"), ), $.filter ), sortBy: $ => annotateFilter($.sortByField, $.sortByOp, $.sortByVal), sortByField: $ => token(sortByField), sortByOp: $ => $.equalOp, sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])), sortByOptions: $ => seq( optional('-'), choice(...sortByOptions) ), limit: $ => annotateFilter($.limitField, $.equalOp, $.number), limitField: $ => token(limitField), filter: $ => field('category', choice( $.patternFilter, $.booleanFilter, $.numericFilter, $.boundedListFilter, $.unboundedListFilter, $.dateFilter )), 
patternFilter: $ => annotateFilter($.patternField, $.patternOp, $.patternVal), patternField: $ => token(choice(...patternFields)), patternOp: $ => $.containOp, patternVal: $ => $.string, booleanFilter: $ => annotateFilter($.booleanField, $.booleanOp, $.booleanVal), booleanField: $ => token(choice(...booleanFields)), booleanOp: $ => $.equalOp, booleanVal: $ => choice($.booleanTrue, $.booleanFalse), numericFilter: $ => annotateFilter($.numericField, $.numericOp, $.numberVal), numericField: $ => token(choice(...numericFields)), numericOp: $ => $.rangeOp, numberVal: $ => $.number, // Array members must be from the given options boundedListFilter: $ => choice($.visitTypeFilter), visitTypeFilter: $ => annotateFilter($.visitTypeField, $.visitTypeOp, $.visitTypeVal), visitTypeField: $ => token(visitTypeField), visitTypeOp: $ => $.equalOp, visitTypeVal: $ => createArray(optionalWrapWith($.visitTypeOptions, ["'", '"'])), visitTypeOptions: $ => choice(...visitTypeOptions), // TODO: fetch visitTypeOptions choices dynamically from other swh services? 
// Array members can be any string unboundedListFilter: $ => annotateFilter($.listField, $.listOp, $.listVal), listField: $ => token(choice(...listFields)), listOp: $ => $.choiceOp, listVal: $ => createArray($.string), dateFilter: $ => annotateFilter($.dateField, $.dateOp, $.dateVal), dateField: $ => token(choice(...dateFields)), dateOp: $ => $.rangeOp, dateVal: $ => $.isoDateTime, rangeOp: $ => token(choice(...rangeOp)), equalOp: $ => token(choice(...equalOp)), containOp: $ => token(choice(...containOp)), choiceOp: $ => token(choice(...choiceOp)), isoDateTime: $ => { const dateRegex = (/\d{4}[-]\d{2}[-]\d{2}/).source const dateTimeSepRegex = (/(\s|T)*/).source - const timeRegex = (/(\d{2}:\d{2}(:\d{2}(\.\d{6})?)?)?/).source - const timezoneRegex = (/(\+\d{2}:\d{2}|Z)?/).source - return new RegExp(dateRegex + dateTimeSepRegex + timeRegex + timezoneRegex) + const timeRegex = (/\d{2}:\d{2}(:\d{2}(\.\d{6})?)?/).source + const timezoneRegex = (/[+-]\d{2}:\d{2}|Z/).source + return new RegExp(`${dateRegex}(${dateTimeSepRegex}${timeRegex}(${timezoneRegex})?)?`) }, string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord), number: $ => /\d+/, booleanTrue: $ => TRUE, booleanFalse: $ => FALSE, or: $ => OR, and: $ => AND, singleWord: $ => /[^\s"'\[\]\(\),]+/, // Based on tree-sitter-json grammar: stringContent: $ => repeat1(choice( token.immediate(/[^\\'"\n]+/), $.escape_sequence )), escape_sequence: $ => token.immediate(seq( '\\', /(\"|\'|\\|\/|b|n|r|t|u)/ )), } }); function joinBySep1(rule, sep) { // At least one repetition of the rule separated by `sep` return seq(rule, repeat(seq(sep, optional(rule)))) } function joinBySep(rule, sep = ",") { // Any number of repetitions of the rule separated by `sep` return optional(joinBySep1(rule, sep)) } function createArray(rule) { // An array having `rule` as its member return seq( "[", joinBySep( field('array_member', rule), "," ), "]" ) } function wrapWith(rule, wrappers = ["'", '"']) { // The rule must be wrapped with
one of the wrappers const wrappedRules = wrappers.map(wrapper => seq(wrapper, rule, wrapper)) return choice(...wrappedRules) } function optionalWrapWith(rule, wrappers = ["'", '"']) { // The rule may or may not be wrapped with the wrappers return choice(wrapWith(rule, wrappers), rule) } function annotateFilter(filterField, filterOp, filterVal) { return seq( field('field', filterField), field('op', filterOp), field('value', filterVal) ); } diff --git a/swh/search/tests/test_elasticsearch.py b/swh/search/tests/test_elasticsearch.py index 1af86e5..7e460d5 100644 --- a/swh/search/tests/test_elasticsearch.py +++ b/swh/search/tests/test_elasticsearch.py @@ -1,226 +1,278 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from textwrap import dedent import types import unittest from elasticsearch.helpers.errors import BulkIndexError import pytest from swh.search.exc import SearchQuerySyntaxError from swh.search.metrics import OPERATIONS_METRIC from .test_search import CommonSearchTest now = datetime.now(tz=timezone.utc).isoformat() -now_minus_5_hours = (datetime.now(tz=timezone.utc) - timedelta(hours=5)).isoformat() -now_plus_5_hours = (datetime.now(tz=timezone.utc) + timedelta(hours=5)).isoformat() +now_minus_5_days = (datetime.now(tz=timezone.utc) - timedelta(days=5)).isoformat() +now_plus_5_days = (datetime.now(tz=timezone.utc) + timedelta(days=5)).isoformat() ORIGINS = [ { "url": "http://foobar.1.com", "nb_visits": 1, - "last_visit_date": now_minus_5_hours, - "last_eventful_visit_date": now_minus_5_hours, + "last_visit_date": now_minus_5_days, + "last_eventful_visit_date": now_minus_5_days, }, { "url": "http://foobar.2.com", "nb_visits": 2, "last_visit_date": now, "last_eventful_visit_date": now, }, { "url": 
"http://foobar.3.com", "nb_visits": 3, - "last_visit_date": now_plus_5_hours, - "last_eventful_visit_date": now_minus_5_hours, + "last_visit_date": now_plus_5_days, + "last_eventful_visit_date": now_minus_5_days, }, { "url": "http://barbaz.4.com", "nb_visits": 3, - "last_visit_date": now_plus_5_hours, - "last_eventful_visit_date": now_minus_5_hours, + "last_visit_date": now_plus_5_days, + "last_eventful_visit_date": now_minus_5_days, }, ] class BaseElasticsearchTest(unittest.TestCase): @pytest.fixture(autouse=True) def _instantiate_search(self, swh_search, elasticsearch_host, mocker): self._elasticsearch_host = elasticsearch_host self.search = swh_search self.mocker = mocker # override self.search.origin_update to catch painless script errors # and pretty print them origin_update = self.search.origin_update def _origin_update(self, *args, **kwargs): script_error = False error_detail = "" try: origin_update(*args, **kwargs) except BulkIndexError as e: error = e.errors[0].get("update", {}).get("error", {}).get("caused_by") if error and "script_stack" in error: script_error = True error_detail = dedent( f""" Painless update script failed ({error.get('reason')}). 
error type: {error.get('caused_by', {}).get('type')} error reason: {error.get('caused_by', {}).get('reason')} script stack: """ ) error_detail += "\n".join(error["script_stack"]) else: raise e assert script_error is False, error_detail[1:] self.search.origin_update = types.MethodType(_origin_update, self.search) def reset(self): self.search.deinitialize() self.search.initialize() class TestElasticsearchSearch(CommonSearchTest, BaseElasticsearchTest): def test_metrics_update_duration(self): mock = self.mocker.patch("swh.search.metrics.statsd.timing") for url in ["http://foobar.bar", "http://foobar.baz"]: self.search.origin_update([{"url": url}]) assert mock.call_count == 2 def test_metrics_search_duration(self): mock = self.mocker.patch("swh.search.metrics.statsd.timing") for url_pattern in ["foobar", "foobaz"]: self.search.origin_search(url_pattern=url_pattern, with_visit=True) assert mock.call_count == 2 def test_metrics_indexation_counters(self): mock_es = self.mocker.patch("elasticsearch.helpers.bulk") mock_es.return_value = 2, ["error"] mock_metrics = self.mocker.patch("swh.search.metrics.statsd.increment") self.search.origin_update([{"url": "http://foobar.baz"}]) assert mock_metrics.call_count == 2 mock_metrics.assert_any_call( OPERATIONS_METRIC, 2, tags={ "endpoint": "origin_update", "object_type": "document", "operation": "index", }, ) mock_metrics.assert_any_call( OPERATIONS_METRIC, 1, tags={ "endpoint": "origin_update", "object_type": "document", "operation": "index_error", }, ) def test_write_alias_usage(self): mock = self.mocker.patch("elasticsearch.helpers.bulk") mock.return_value = 2, ["result"] self.search.origin_update([{"url": "http://foobar.baz"}]) assert mock.call_args[1]["index"] == "test-write" def test_read_alias_usage(self): mock = self.mocker.patch("elasticsearch.Elasticsearch.search") mock.return_value = {"hits": {"hits": []}} self.search.origin_search(url_pattern="foobar.baz") assert mock.call_args[1]["index"] == "test-read" def 
test_sort_by_and_limit_query(self): self.search.origin_update(ORIGINS) self.search.flush() def _check_results(query, origin_indices): page = self.search.origin_search(url_pattern="foobar", query=query) results = [r["url"] for r in page.results] assert results == [ORIGINS[index]["url"] for index in origin_indices] _check_results("sort_by = [-visits]", [2, 1, 0]) _check_results("sort_by = [last_visit]", [0, 1, 2]) _check_results("sort_by = [-last_eventful_visit, visits]", [1, 0, 2]) _check_results("sort_by = [last_eventful_visit,-last_visit]", [2, 0, 1]) _check_results("sort_by = [-visits] limit = 1", [2]) _check_results("sort_by = [last_visit] and limit = 2", [0, 1]) _check_results("sort_by = [-last_eventful_visit, visits] limit = 3", [1, 0, 2]) def test_search_ql_simple(self): self.search.origin_update(ORIGINS) self.search.flush() results = { r["url"] for r in self.search.origin_search(query='origin : "foobar"').results } assert results == { "http://foobar.1.com", "http://foobar.2.com", "http://foobar.3.com", } + def test_search_ql_datetimes(self): + self.search.origin_update(ORIGINS) + self.search.flush() + + now_minus_5_minutes = ( + datetime.now(tz=timezone.utc) - timedelta(minutes=5) + ).isoformat() + now_plus_5_minutes = ( + datetime.now(tz=timezone.utc) + timedelta(minutes=5) + ).isoformat() + + results = { + r["url"] + for r in self.search.origin_search( + query=( + f"last_visit < {now_minus_5_minutes} " + f"or last_visit > {now_plus_5_minutes}" + ) + ).results + } + assert results == { + "http://foobar.1.com", + "http://foobar.3.com", + "http://barbaz.4.com", + } + + def test_search_ql_dates(self): + self.search.origin_update(ORIGINS) + self.search.flush() + + now_minus_2_days = ( + (datetime.now(tz=timezone.utc) - timedelta(days=2)).date().isoformat() + ) + now_plus_2_days = ( + (datetime.now(tz=timezone.utc) + timedelta(days=2)).date().isoformat() + ) + + results = { + r["url"] + for r in self.search.origin_search( + query=( + f"last_visit < 
{now_minus_2_days} " + f"or last_visit > {now_plus_2_days}" + ) + ).results + } + assert results == { + "http://foobar.1.com", + "http://foobar.3.com", + "http://barbaz.4.com", + } + def test_search_ql_visited(self): self.search.origin_update( [ { "url": "http://foobar.1.com", "has_visits": True, "nb_visits": 1, - "last_visit_date": now_minus_5_hours, - "last_eventful_visit_date": now_minus_5_hours, + "last_visit_date": now_minus_5_days, + "last_eventful_visit_date": now_minus_5_days, }, {"url": "http://foobar.2.com",}, {"url": "http://foobar.3.com", "has_visits": False,}, ] ) self.search.flush() assert { r["url"] for r in self.search.origin_search(query="visited = true").results } == {"http://foobar.1.com"} assert { r["url"] for r in self.search.origin_search(query="visited = false").results } == {"http://foobar.2.com", "http://foobar.3.com"} assert ( self.search.origin_search( query="visited = true and visited = false" ).results == [] ) assert ( self.search.origin_search(query="visited = false", with_visit=True).results == [] ) def test_query_syntax_error(self): self.search.origin_update(ORIGINS) self.search.flush() with pytest.raises(SearchQuerySyntaxError): self.search.origin_search(query="foobar") diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py index a265e8c..9789c62 100644 --- a/swh/search/tests/test_translator.py +++ b/swh/search/tests/test_translator.py @@ -1,426 +1,442 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.search.translator import Translator from swh.search.utils import get_expansion def _test_results(query, expected): output = Translator().parse_query(query) assert output == expected def test_empty_query(): query = "" with pytest.raises(Exception): _test_results(query, {}) def 
test_conjunction_operators(): query = "visited = true or visits > 2 and visits < 5" expected = { "filters": { "bool": { "should": [ {"term": {"has_visits": True}}, { "bool": { "must": [ {"range": {"nb_visits": {"gt": 2}}}, {"range": {"nb_visits": {"lt": 5}}}, ] } }, ] } } } _test_results(query, expected) def test_visited(): query = "visited = true" expected = { "filters": {"term": {"has_visits": True}}, } _test_results(query, expected) query = "visited = false" expected = { "filters": { "bool": { "should": [ {"term": {"has_visits": False}}, {"bool": {"must_not": {"exists": {"field": "has_visits"}}}}, ] } } } _test_results(query, expected) def test_conjunction_op_precedence_override(): query = "(visited = true or visits > 2) and visits < 5" expected = { "filters": { "bool": { "must": [ { "bool": { "should": [ {"term": {"has_visits": True}}, {"range": {"nb_visits": {"gt": 2}}}, ] } }, {"range": {"nb_visits": {"lt": 5}}}, ] } } } _test_results(query, expected) def test_limit_and_sortby(): query = "visited = true sort_by = [-visits,last_visit] limit = 15" expected = { "filters": {"term": {"has_visits": True}}, "sortBy": ["-visits", "last_visit"], "limit": 15, } _test_results(query, expected) def test_deeply_nested_filters(): query = "(((visited = true and visits > 0)))" expected = { "filters": { "bool": { "must": [ {"term": {"has_visits": True},}, {"range": {"nb_visits": {"gt": 0}}}, ] } }, } _test_results(query, expected) def test_origin_and_metadata_filters(): query = 'origin : django or metadata : "framework and web"' expected = { "filters": { "bool": { "should": [ { "multi_match": { "query": "django", "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } }, { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": "framework and web", "type": "cross_fields", "operator": "and", "fields": ["intrinsic_metadata.*"], "lenient": True, } }, } }, ] } } } _test_results(query, 
expected) def test_visits_not_equal_to_filter(): query = "visits != 5" expected = { "filters": { "bool": {"must_not": [{"range": {"nb_visits": {"gte": 5, "lte": 5}}},]} }, } _test_results(query, expected) def test_visit_type_filter(): query = 'visit_type = [git,"pypi"]' expected = {"filters": {"terms": {"visit_types": ["git", "pypi"]}}} _test_results(query, expected) def test_keyword_filter(): query = r"""keyword in [word1, "word2 \" \' word3"]""" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": r"""word1 word2 " ' word3""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), ], } }, } } } _test_results(query, expected) def test_language_filter(): query = 'language in [python, "go lang", cpp]' expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "should": [ { "match": { get_expansion( "programming_languages", "." ): "python" } }, { "match": { get_expansion( "programming_languages", "." 
): "go lang" } }, { "match": { get_expansion("programming_languages", "."): "cpp" } }, ] } }, } } } _test_results(query, expected) def test_license_filter(): query = 'license in ["GPL 3", Apache, MIT]' expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "should": [ {"match": {get_expansion("licenses", "."): "GPL 3"}}, {"match": {get_expansion("licenses", "."): "Apache"}}, {"match": {get_expansion("licenses", "."): "MIT"}}, ] } }, } } } _test_results(query, expected) def test_date_created_not_equal_to_filter(): query = "created != 2020-01-01" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "must_not": [ { "range": { get_expansion("date_created", "."): { "gte": "2020-01-01", "lte": "2020-01-01", } } } ] } }, } } } _test_results(query, expected) def test_date_created_greater_than_filter(): query = "created >= 2020-01-01" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "must": [ { "range": { get_expansion("date_created", "."): { "gte": "2020-01-01", } } } ] } }, } } } _test_results(query, expected) +def test_visit_date_range(): + query = "last_visit >= 2020-01-01 and last_visit < 2021-01-01" + expected = { + "filters": { + "bool": { + "must": [ + {"range": {"last_visit_date": {"gte": "2020-01-01"}}}, + {"range": {"last_visit_date": {"lt": "2021-01-01"}}}, + ] + } + }, + } + + _test_results(query, expected) + + def test_last_eventful_visit_not_equal_to_filter(): query = "last_visit != 2020-01-01" expected = { "filters": { "bool": { "must_not": [ { "range": { "last_visit_date": { "gte": "2020-01-01", "lte": "2020-01-01", } } } ] } } } _test_results(query, expected) def test_last_eventful_visit_less_than_to_filter(): query = "last_visit < 2020-01-01" expected = {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}} _test_results(query, expected) def test_keyword_no_escape_inside_filter(): # any keyword (filter name/operator/value) inside a 
filter # must be considered a string. query = r'''origin : "language in [\'go lang\', python]"''' expected = { "filters": { "multi_match": { "query": r"""language in ['go lang', python]""", "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } } } _test_results(query, expected) def test_escaped_punctuation_parsing(): query = r"""keyword in ["foo \'\" bar"]""" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": r"""foo '" bar""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), ], } }, } } } _test_results(query, expected) def test_nonascii(): query = r"""keyword in ["café"]""" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": r"""café""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), ], } }, } } } _test_results(query, expected) def test_nonascii_before_operator(): query = r"""keyword in ["🐍"] and visited = true""" expected = { "filters": { "bool": { "must": [ { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": r"""🐍""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), ], } }, }, }, {"term": {"has_visits": True,},}, ], } } } _test_results(query, expected)