diff --git a/query_language/grammar.js b/query_language/grammar.js
index 4a02fe3..aa94745 100644
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -1,184 +1,189 @@
 // Copyright (C) 2019-2021 The Software Heritage developers
 // See the AUTHORS file at the top-level directory of this distribution
 // License: GNU General Public License version 3, or any later version
 // See top-level LICENSE file for more information
 
 
 const PRECEDENCE = {
     or: 2,
     and: 3,
     bracket: 4,
 }
 
 module.exports = grammar({
     name: 'swh_search_ql',
 
     rules: {
-        query: $ => $.filters,
+        // A query is a filter expression, optionally followed by a `sort_by`
+        // clause and then a `limit` clause, in that order.
+        query: $ => seq($.filters, optional($.sortBy), optional($.limit)),
 
         filters: $ => choice(
             prec.left(PRECEDENCE.and,
                 seq(
                     field('left', $.filters),
                     field('operator', $.and),
                     field('right', $.filters),
                 )
             ),
             prec.left(PRECEDENCE.or,
                 seq(
                     field('left', $.filters),
                     field('operator', $.or),
                     field('right', $.filters),
                 )
             ),
             prec.left(PRECEDENCE.bracket,
                 seq("(", $.filters, ")"),
             ),
             $.filter
         ),
 
+        // `sort_by = [...]`: a bracketed list of field names, each optionally
+        // quoted and optionally prefixed with `-`.
+        sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
+        sortByField: $ => token('sort_by'),
+        sortByOp: $ => $.equalOp,
+        sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
+        sortByOptions: $ => seq(
+            optional(token.immediate('-')),
+            choice(
+                'visits',
+                'last_visit',
+                'last_eventful_visit',
+                'last_revision',
+                'last_release',
+                'created',
+                'modified',
+                'published'
+            )
+        ),
+
+        // `limit = <number>`.
+        limit: $ => seq('limit', $.equalOp, $.number),
+
         filter: $ => choice(
             $.patternFilter,
             $.booleanFilter,
             $.numericFilter,
             $.boundedListFilter,
             $.unboundedListFilter,
-            $.dateFilter,
-            $.limitFilter
+            $.dateFilter
         ),
 
         patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal),
         patternField: $ => token(choice('origin', 'metadata')),
         patternOp: $ => $.equalOp,
         patternVal: $ => $.string,
 
         booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal),
         booleanField: $ => token(choice('visited')),
         booleanOp: $ => $.equalOp,
         booleanVal: $ => choice($.booleanTrue, $.booleanFalse),
 
         numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal),
         numericField: $ => token(choice('visits')),
         numericOp: $ => $.rangeOp,
         numberVal: $ => $.number,
 
-        boundedListFilter: $ => choice($.visitTypeFilter, $.sortByFilter),
+        boundedListFilter: $ => choice($.visitTypeFilter),
 
         visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal),
         visitTypeField: $ => token(choice('visit_type')),
         visitTypeOp: $ => $.equalOp,
         visitTypeVal: $ => createArray(optionalWrapWith($.visitTypeOptions, ["'", '"'])),
         visitTypeOptions: $ => choice(
             "any",
             "cran",
             "deb",
             "deposit",
             "ftp",
             "hg",
             "git",
             "nixguix",
             "npm",
             "pypi",
             "svn",
             "tar"
         ), // TODO: fetch this list dynamically from other swh services?
 
-        sortByFilter: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
-        sortByField: $ => token(choice('sort_by')),
-        sortByOp: $ => $.equalOp,
-        sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
-        sortByOptions: $ => seq(
-            optional(token.immediate('-')),
-            choice(
-                'visits',
-                'last_visit',
-                'last_eventful_visit',
-                'last_revision',
-                'last_release',
-                'created',
-                'modified',
-                'published'
-            )),
-
         unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
         listField: $ => token(choice('language', 'license', 'keyword')),
         listOp: $ => $.choiceOp,
         listVal: $ => createArray($.string),
 
         dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal),
         dateField: $ => token(choice(
             'last_visit',
             'last_eventful_visit',
             'last_revision',
             'last_release',
             'created',
             'modified',
             'published'
         )),
         dateOp: $ => $.rangeOp,
         dateVal: $ => $.isoDateTime,
 
-        limitFilter: $ => seq('limit', $.equalOp, $.number),
-
         rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')),
         equalOp: $ => token('='),
         choiceOp: $ => token(choice('in', 'not in')),
 
         isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/,
 
         string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord),
         number: $ => /\d+/,
 
         booleanTrue: $ => "true",
         booleanFalse: $ => "false",
 
         or: $ => "or",
         and: $ => "and",
 
         stringContent: $ => repeat1(choice(
             token.immediate(/[^\\"\n]+/),
             $.escape_sequence
         )),
 
         singleWord: $ => /[^\s"'\[\]\(\)]+/,
 
         escape_sequence: $ => token.immediate(seq(
             '\\',
             /(\"|\'|\\|\/|b|n|r|t|u)/
         )),
 
     }
 });
 
 function joinBySep1(rule, sep) {
     // At least one repetition of the rule separated by `sep`
     return seq(rule, repeat(seq(sep, optional(rule))))
 }
 
 function joinBySep(rule, sep = ",") {
     // Any number of repetitions of the rule separated by `sep`
     return optional(joinBySep1(rule, sep))
 }
 
 function createArray(rule) {
     // An array having `rule` as its member
     return seq(
         "[",
         joinBySep(
             field('array_member', rule),
             ","
         ),
         "]"
     )
 }
 
 function wrapWith(rule, wrappers = ["'", '"']) {
     // The rule must be wrapped with one of the wrappers
     const wrappedRules = wrappers.map(wrapper => seq(wrapper, rule, wrapper))
     return choice(...wrappedRules)
 }
 
 function optionalWrapWith(rule, wrappers = ["'", '"']) {
     // The rule may or may not be wrapped with the wrappers
     return choice(wrapWith(rule, wrappers), rule)
 }
diff --git a/query_language/test/corpus/combinations.txt b/query_language/test/corpus/combinations.txt
index cf66e84..07802ea 100644
--- a/query_language/test/corpus/combinations.txt
+++ b/query_language/test/corpus/combinations.txt
@@ -1,79 +1,78 @@
 ==============================
 Empty query (should throw error)
 ==============================
 
 ---
 (ERROR)
 
 ==================
 Origins with django as keyword, python language, and more than 5 visits
 ==================
 
 origin = django and language in ["python"] and visits >= 5
 
 ---
 (query (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent))))))) (and) (filters (filter (numericFilter (numericField) (numericOp (rangeOp)) (numberVal (number)))))))
 
 ==================
 10 origins with latest revision after 2020-01-01
 ==================
 
-last_revision > 2020-01-01 and limit = 10
+last_revision > 2020-01-01 limit = 10
 
 ---
-
-(query (filters (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (and) (filters (filter (limitFilter (equalOp) (number))))))
+(query (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (limit (equalOp) (number)))
 
 ==================
-Origins with last visit date not in 2020-2021
+Origins with last visit date not in 2020-2021 (sorted by number of visits)
 ==================
 
-last_visit > 2021-01-01 or last_visit < 2020-01-01
+last_visit > 2021-01-01 or last_visit < 2020-01-01 sort_by = ["visits"]
 
 ---
-(query (filters (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (or) (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime)))))))
+(query (filters (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (or) (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime)))))) (sortBy (sortByField) (sortByOp (equalOp)) (sortByVal (sortByOptions))))
 
 ==================
 Unvisited origins with kubernetes in metadata or minikube in url
 ==================
 
 visited = false and metadata = "kubernetes" or origin = "minikube"
 
 ---
 (query (filters (filters (filters (filter (booleanFilter (booleanField) (booleanOp (equalOp)) (booleanVal (booleanFalse))))) (and) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent))))))) (or) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent))))))))
 
 ==================
 Origins with "orchestration" or "kubectl" as keywords and language as "go" or "rust"
 ==================
 
 keyword in ["orchestration", "kubectl"] and language in ["go", "rust"]
 
 ---
 (query (filters (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent)))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent))))))))
 
 ==================
 Origins with a GPL-3 license that have "debian" in their url or have visit type as "deb"
 ==================
 
 (origin = debian or visit_type = ["deb"]) and license in ["GPL-3"]
 
 ---
 (query (filters (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (or) (filters (filter (boundedListFilter (visitTypeFilter (visitTypeField) (visitTypeOp (equalOp)) (visitTypeVal (visitTypeOptions)))))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent))))))))
 
 ==================
-Origins with 'and' and 'or' inside filter values
+Origins with `and` and `or` inside filter values
 ==================
 
 (origin = "foo and bar or baz")
 
 ---
 (query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent))))))))
 
 ==================
 Origins with `'` and `"` inside filter values
 ==================
 
 (origin = "foo \\ \'bar\' \"baz\" ")
 
 ---
 (query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence)))))))))