diff --git a/query_language/grammar.js b/query_language/grammar.js index ab5320b..594a934 100644 --- a/query_language/grammar.js +++ b/query_language/grammar.js @@ -1,216 +1,192 @@ -// Copyright (C) 2019-2021 The Software Heritage developers +// Copyright (C) 2021 The Software Heritage developers // See the AUTHORS file at the top-level directory of this distribution // License: GNU General Public License version 3, or any later version // See top-level LICENSE file for more information +const { visitTypeField, sortByField, limitField } = require("./tokens.js"); +const { + patternFields, + booleanFields, + numericFields, + listFields, + dateFields +} = require("./tokens.js"); +const { equalOp, rangeOp, choiceOp } = require("./tokens.js"); +const { sortByOptions, visitTypeOptions } = require("./tokens.js"); +const { OR, AND, TRUE, FALSE } = require("./tokens.js"); const PRECEDENCE = { or: 2, and: 3, bracket: 4, } module.exports = grammar({ name: 'swh_search_ql', rules: { query: $ => seq( $.filters, - optional($.and), - choice( - seq(optional($.sortBy), optional($.and), optional($.limit)), - seq(optional($.limit), optional($.and), optional($.sortBy)), - ), + optional(seq( + optional($.and), + choice( + seq($.sortBy, optional($.and), optional($.limit)), + seq($.limit, optional($.and), optional($.sortBy)), + ), + )) ), - - filters: $ => choice( prec.left(PRECEDENCE.and, seq( field('left', $.filters), field('operator', $.and), field('right', $.filters), ) ), prec.left(PRECEDENCE.or, seq( field('left', $.filters), field('operator', $.or), field('right', $.filters), ) ), prec.left(PRECEDENCE.bracket, seq("(", $.filters, ")"), ), $.filter ), - sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal), - sortByField: $ => token('sort_by'), + sortBy: $ => annotateFilter($.sortByField, $.sortByOp, $.sortByVal), + sortByField: $ => token(sortByField), sortByOp: $ => $.equalOp, sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])), - sortByOptions: $ => seq(optional(token.immediate('-')), choice( - 'visits', - 'last_visit', - 'last_eventful_visit', - 'last_revision', - 'last_release', - 'created', - 'modified', - 'published' - )), + sortByOptions: $ => seq( + optional('-'), + choice(...sortByOptions) + ), - limit: $ => seq('limit', $.equalOp, $.number), + limit: $ => annotateFilter($.limitField, $.equalOp, $.number), + limitField: $ => token(limitField), - filter: $ => choice( + filter: $ => field('category', choice( $.patternFilter, $.booleanFilter, $.numericFilter, $.boundedListFilter, $.unboundedListFilter, $.dateFilter - ), + )), - patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal), - patternField: $ => token(choice('origin', 'metadata')), + patternFilter: $ => annotateFilter($.patternField, $.patternOp, $.patternVal), + patternField: $ => token(choice(...patternFields)), patternOp: $ => $.equalOp, patternVal: $ => $.string, - booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal), - booleanField: $ => token(choice('visited')), + booleanFilter: $ => annotateFilter($.booleanField, $.booleanOp, $.booleanVal), + booleanField: $ => token(choice(...booleanFields)), booleanOp: $ => $.equalOp, booleanVal: $ => choice($.booleanTrue, $.booleanFalse), - numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal), - numericField: $ => token(choice('visits')), + numericFilter: $ => annotateFilter($.numericField, $.numericOp, $.numberVal), + numericField: $ => token(choice(...numericFields)), numericOp: $ => $.rangeOp, numberVal: $ => $.number, + // Array members must be from the given options boundedListFilter: $ => choice($.visitTypeFilter), - visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal), - visitTypeField: $ => token(choice('visit_type')), + visitTypeFilter: $ => annotateFilter($.visitTypeField, $.visitTypeOp, $.visitTypeVal), + visitTypeField: $ => token(visitTypeField), visitTypeOp: $ => $.equalOp, visitTypeVal: $ => createArray(optionalWrapWith($.visitTypeOptions, ["'", '"'])), - visitTypeOptions: $ => choice( - "any", - "cran", - "deb", - "deposit", - "ftp", - "hg", - "git", - "nixguix", - "npm", - "pypi", - "svn", - "tar" - ), // TODO: fetch this list dynamically from other swh services? - - sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal), - sortByField: $ => token(choice('sort_by')), - sortByOp: $ => $.equalOp, - sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])), - sortByOptions: $ => seq( - optional('-'), - choice( - 'visits', - 'last_visit', - 'last_eventful_visit', - 'last_revision', - 'last_release', - 'created', - 'modified', - 'published' - ) - ), + visitTypeOptions: $ => choice(...visitTypeOptions), + // TODO: fetch visitTypeOptions choices dynamically from other swh services? - unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal), - listField: $ => token(choice('language', 'license', 'keyword')), + // Array members can be any string + unboundedListFilter: $ => annotateFilter($.listField, $.listOp, $.listVal), + listField: $ => token(choice(...listFields)), listOp: $ => $.choiceOp, listVal: $ => createArray($.string), - - dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal), - dateField: $ => token(choice( - 'last_visit', - 'last_eventful_visit', - 'last_revision', - 'last_release', - 'created', - 'modified', - 'published' - )), + dateFilter: $ => annotateFilter($.dateField, $.dateOp, $.dateVal), + dateField: $ => token(choice(...dateFields)), dateOp: $ => $.rangeOp, dateVal: $ => $.isoDateTime, - limit: $ => seq('limit', $.equalOp, $.number), - - - rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')), - equalOp: $ => token('='), - choiceOp: $ => token(choice('in', 'not in')), + rangeOp: $ => token(choice(...rangeOp)), + equalOp: $ => token(choice(...equalOp)), + choiceOp: $ => token(choice(...choiceOp)), isoDateTime: $ => { const dateRegex = (/\d{4}[-]\d{2}[-]\d{2}/).source const dateTimeSepRegex = (/(\s|T)*/).source const timeRegex = (/(\d{2}:\d{2}(:\d{2}(\.\d{6})?)?)?/).source const timezoneRegex = (/(\+\d{2}:\d{2}|Z)?/).source return new RegExp(dateRegex + dateTimeSepRegex + timeRegex + timezoneRegex) }, string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord), number: $ => /\d+/, - booleanTrue: $ => "true", - booleanFalse: $ => "false", + booleanTrue: $ => TRUE, + booleanFalse: $ => FALSE, - or: $ => "or", - and: $ => "and", + or: $ => OR, + and: $ => AND, + singleWord: $ => /[^\s"'\[\]\(\),]+/, + + // Based on tree-sitter-json grammar: stringContent: $ => repeat1(choice( token.immediate(/[^\\'"\n]+/), $.escape_sequence )), - singleWord: $ => /[^\s"'\[\]\(\),]+/, escape_sequence: $ => token.immediate(seq( '\\', /(\"|\'|\\|\/|b|n|r|t|u)/ )), } }); function joinBySep1(rule, sep) { // At least one repetition of the rule separated by `sep` return seq(rule, repeat(seq(sep, optional(rule)))) } function joinBySep(rule, sep = ",") { // Any number of repetitions of the rule separated by `sep` return optional(joinBySep1(rule, sep)) } function createArray(rule) { // An array having `rule` as its member return seq( "[", joinBySep( field('array_member', rule), "," ), "]" ) } function wrapWith(rule, wrappers = ["'", '"']) { // The rule must be wrapped with one of the wrappers const wrappedRules = wrappers.map(wrapper => seq(wrapper, rule, wrapper)) return choice(...wrappedRules) } function optionalWrapWith(rule, wrappers = ["'", '"']) { // The rule may or may not be wrapped with the wrappers return choice(wrapWith(rule, wrappers), rule) } + +function annotateFilter(filterField, filterOp, filterVal) { + return seq( + field('field', filterField), + field('op', filterOp), + field('value', filterVal) + ); +} diff --git a/query_language/test/corpus/combinations.txt b/query_language/test/corpus/combinations.txt index 07802ea..999ed76 100644 --- a/query_language/test/corpus/combinations.txt +++ b/query_language/test/corpus/combinations.txt @@ -1,75 +1,82 @@ ============================== Empty query (should throw error) ============================== --- (ERROR) ================== Origins with django as keyword, python language, and more than 5 visits ================== origin = django and language in ["python"] and visits >= 5 --- (query (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent))))))) (and) (filters (filter (numericFilter (numericField) (numericOp (rangeOp)) (numberVal (number))))))) ================== 10 origins with latest revision after 2020-01-01 ================== last_revision > 2020-01-01 limit = 10 --- -(query (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (limit (equalOp) (number))) +(query (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (limit (limitField) (equalOp) (number))) ================== Origins with last visit date not in 2020-2021 (sorted by number of visits) ================== last_visit > 2021-01-01 or last_visit < 2020-01-01 sort_by = ["visits"] --- (query (filters (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (or) (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime)))))) (sortBy (sortByField) (sortByOp (equalOp)) (sortByVal (sortByOptions)))) ================== Unvisited origins with kubernetes in metadata or minikube in url ================== visited = false and metadata = "kubernetes" or origin = "minikube" --- (query (filters (filters (filters (filter (booleanFilter (booleanField) (booleanOp (equalOp)) (booleanVal (booleanFalse))))) (and) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent))))))) (or) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent)))))))) ================== Origins with "orchestration" or "kubectl" as keywords and language as "go" or "rust" ================== keyword in ["orchestration", "kubectl"] and language in ["go", "rust"] --- (query (filters (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent)))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent)))))))) ================== Origins with a GPL-3 license that have "debian" in their url or have visit type as "deb" ================== (origin = debian or visit_type = ["deb"]) and license in ["GPL-3"] --- (query (filters (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (or) (filters (filter (boundedListFilter (visitTypeFilter (visitTypeField) (visitTypeOp (equalOp)) (visitTypeVal (visitTypeOptions)))))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)))))))) ================== Origins with `and` and `or` inside filter values ================== (origin = "foo and bar or baz") --- (query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent)))))))) ================== Origins with `'` and `"` inside filter values ================== (origin = "foo \\ \'bar\' \"baz\" ") --- (query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence))))))))) + +================== +Incomplete conjunction operators should throw error +================== +visits > 5 and +--- +(query (filters (filter (numericFilter (numericField) (numericOp (rangeOp)) (numberVal (number))))) (ERROR (and))) diff --git a/query_language/tokens.js b/query_language/tokens.js new file mode 100644 index 0000000..47c9968 --- /dev/null +++ b/query_language/tokens.js @@ -0,0 +1,105 @@ +// Copyright (C) 2021 The Software Heritage developers +// See the AUTHORS file at the top-level directory of this distribution +// License: GNU General Public License version 3, or any later version +// See top-level LICENSE file for more information + +// Field tokens +const visitTypeField = 'visit_type'; +const sortByField = 'sort_by'; +const limitField = 'limit'; + +// Field categories +const patternFields = ['origin', 'metadata']; +const booleanFields = ['visited']; +const numericFields = ['visits']; +const boundedListFields = [visitTypeField]; +const listFields = ['language', 'license', 'keyword']; +const dateFields = [ + 'last_visit', + 'last_eventful_visit', + 'last_revision', + 'last_release', + 'created', + 'modified', + 'published' +]; + +const fields = [].concat( + patternFields, + booleanFields, + numericFields, + boundedListFields, + listFields, + dateFields +); + +// Operators +const equalOp = ['=']; +const rangeOp = ['<', '<=', '=', '!=', '>=', '>']; +const choiceOp = ['in', 'not in']; + + +// Values +const sortByOptions = [ + 'visits', + 'last_visit', + 'last_eventful_visit', + 'last_revision', + 'last_release', + 'created', + 'modified', + 'published' +]; + +const visitTypeOptions = [ + "any", + "cran", + "deb", + "deposit", + "ftp", + "hg", + "git", + "nixguix", + "npm", + "pypi", + "svn", + "tar" +]; + +// Extra tokens +const OR = "or"; +const AND = "and"; + +const TRUE = "true"; +const FALSE = "false"; + +module.exports = { + // Field tokens + visitTypeField, + sortByField, + limitField, + + // Field categories + patternFields, + booleanFields, + numericFields, + boundedListFields, + listFields, + dateFields, + fields, + + // Operators + equalOp, + rangeOp, + choiceOp, + + // Values + sortByOptions, + visitTypeOptions, + + // Extra tokens + OR, + AND, + TRUE, + FALSE +}