diff --git a/swh/search/parser/.gitignore b/swh/search/parser/.gitignore
new file mode 100644
--- /dev/null
+++ b/swh/search/parser/.gitignore
@@ -0,0 +1,5 @@
+bindings
+node_modules
+src
+binding.gyp
+Cargo.toml
diff --git a/swh/search/parser/grammar.js b/swh/search/parser/grammar.js
new file mode 100644
--- /dev/null
+++ b/swh/search/parser/grammar.js
@@ -0,0 +1,147 @@
+// Tree-sitter grammar for the Software Heritage search query language.
+// A query is a sequence of filters, each of the form <field> <operator> <value>.
+module.exports = grammar({
+  name: 'swh_search_query_language',
+
+  rules: {
+    query: $ => repeat(
+      choice(
+        $.patternFilter,
+        $.booleanFilter,
+        $.numericFilter,
+        $.unboundedListFilter,
+        $.boundedListFilter,
+        $.dateFilter,
+      )
+    ),
+
+    patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal),
+    patternField: $ => token(choice('url', 'metadata')),
+    patternOp: $ => choice(' : ', ' = '),
+    patternVal: $ => $.string, // should it be RegEx instead ?
+
+    booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal),
+    booleanField: $ => choice('with_visit'),
+    booleanOp: $ => choice(' = ', ' : '),
+    booleanVal: $ => choice($.True, $.False),
+
+    numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal),
+    numericField: $ => choice('nb_visits', 'limit'),
+    numericOp: $ => $.rangeOp,
+    numberVal: $ => $.number,
+
+    boundedListFilter: $ => choice($.visitTypeFilter, $.sortByFilter),
+
+    visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal),
+    visitTypeField: $ => 'visit_types',
+    visitTypeOp: $ => choice(':'),
+    visitTypeVal: $ => seq('[', commaSepStr($.visitTypeOptions), ']'),
+    visitTypeOptions: $ => choice(
+      "any",
+      "cran",
+      "deb",
+      "deposit",
+      "ftp",
+      "hg",
+      "git",
+      "nixguix",
+      "npm",
+      "pypi",
+      "svn",
+      "tar"
+    ),
+
+    sortByFilter: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
+    sortByField: $ => 'sort_by',
+    sortByOp: $ => choice(':'),
+    sortByVal: $ => seq('[', commaSepStr($.sortByOptions), ']'),
+    sortByOptions: $ => choice(
+      'nb_visits',
+      'last_visit_date',
+      'last_eventful_visit_date',
+      'last_revision_date',
+      'last_release_date',
+      'date_created',
+      'date_modified',
+      'date_published'
+    ),
+
+    unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
+    listField: $ => choice('programming_languages', 'licenses', 'keywords'),
+    listOp: $ => token(choice('in', 'not in')),
+    listVal: $ => $.array,
+
+
+    dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal),
+    dateField: $ => choice(
+      'last_visit_date',
+      'last_eventful_visit_date',
+      'last_revision_date',
+      'last_release_date',
+      'date_created',
+      'date_modified',
+      'date_published'
+    ),
+    dateOp: $ => $.rangeOp,
+    dateVal: $ => $.dateWithOptionalTime,
+
+
+    rangeOp: $ => choice('<', '<=', '=', '!=', '>=', '>'),
+
+    array: $ => seq(
+      "[", commaSepStr($.string_content), "]"
+    ),
+
+    dateWithOptionalTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s*\d{2}:\d{2}(:\d{2})?)?/,
+    // Date in YYYY-MM-DD form, optionally followed by a single HH:MM or
+    // HH:MM:SS time. The optional parts use `?` rather than `*` so that a
+    // repeated time (e.g. 15:54:12:30) is not accepted as one value.
+    // Adapted from (the referenced pattern targets dd/mm/yyyy) :
+    // https://stackoverflow.com/questions/23786905/regex-for-validating-dd-mm-yyyy-with-optional-time
+    // Matches :
+    // 2014-05-21
+    // 2014-05-21 15:54
+    // 2014-05-21 15:54:12
+
+    string: $ => wrapWithInvertedComma($.string_content),
+    number: $ => /\d+/,
+    True: $ => "true",
+    False: $ => "false",
+
+    string_content: $ => repeat1(choice(
+      token.immediate(/[^\\"\n]+/),
+      $.escape_sequence
+    )),
+    escape_sequence: $ => token.immediate(seq(
+      '\\',
+      /(\"|\\|\/|b|n|r|t|u)/
+    )),
+
+  }
+});
+
+// Builds a rule matching one or more occurrences of `rule` separated by ",".
+function commaSep1(rule) {
+  return seq(rule, repeat(seq(",", rule)))
+}
+
+// Builds a rule matching zero or more comma-separated occurrences of `rule`.
+function commaSep(rule) {
+  return optional(commaSep1(rule))
+}
+
+// Builds a possibly empty comma-separated list whose items are `rule`,
+// each one either bare or wrapped in single or double quotes.
+function commaSepStr(rule) {
+  return commaSep(wrapWithInvertedComma(rule))
+}
+
+// Builds a rule accepting `rule` wrapped in single quotes, wrapped in
+// double quotes, or with no quotes at all.
+function wrapWithInvertedComma(rule) {
+  return choice(
+    seq("'", rule, "'"),
+    seq('"', rule, '"'),
+    rule,
+  )
+}
diff --git a/swh/search/parser/package.json b/swh/search/parser/package.json
new file mode 100644
--- /dev/null
+++ b/swh/search/parser/package.json
@@ -0,0 +1,32 @@
+{
+  "name": "swh-search-query-language-parser",
+  "version": "1.0.0",
+  "description": "Parser for Software Heritage 
archive search query language", + "main": "grammar.js", + "scripts": { + "dev": "tree-sitter generate && tree-sitter parse sample_query", + "test": "tree-sitter generate && tree-sitter test", + "repl": "tree-sitter build-wasm && tree-sitter playground" + }, + "repository": { + "type": "git", + "url": "https://forge.softwareheritage.org/source/swh-search.git" + }, + "keywords": [ + "swh", + "Software Heritage", + "treesitter", + "parser", + "custom", + "query", + "language" + ], + "author": "Software Heritage", + "license": "GPL-2.0-only", + "dependencies": { + "nan": "^2.14.2" + }, + "devDependencies": { + "tree-sitter-cli": "^0.20.0" + } +} \ No newline at end of file diff --git a/swh/search/parser/sample_query b/swh/search/parser/sample_query new file mode 100644 --- /dev/null +++ b/swh/search/parser/sample_query @@ -0,0 +1,6 @@ +url : "github.com/django/Django" metadata : something in metadata +with_visit : true with_visit : false +nb_visits >= 0 nb_visits = 10 nb_visits != 256 nb_visits < 1000 +sort_by : ["nb_visits", "last_revision_date", last_release_date] +last_release_date < 2001-02-13 15:54:21 +licenses in ["MIT","BSD X","Apache"] diff --git a/swh/search/parser/test/corpus/statements.txt b/swh/search/parser/test/corpus/statements.txt new file mode 100644 --- /dev/null +++ b/swh/search/parser/test/corpus/statements.txt @@ -0,0 +1,82 @@ +================== +Combinations +================== + +url : "github.com/django/Django" metadata : something qewq +with_visit : true with_visit : false +nb_visits >= 0 nb_visits = 10 nb_visits != 256 nb_visits < 1000 +sort_by : ["nb_visits", "last_revision_date", last_release_date] +last_release_date < 2001-02-13 15:54:21 +licenses in ["MIT","BSD X","Apache"] + +--- + +(query [0, 0] - [5, 36] + (patternFilter [0, 0] - [0, 32] + (patternField [0, 0] - [0, 3]) + (patternOp [0, 3] - [0, 6]) + (patternVal [0, 6] - [0, 32] + (string [0, 6] - [0, 32] + (string_content [0, 7] - [0, 31])))) + (patternFilter [0, 33] - [0, 58] + 
(patternField [0, 33] - [0, 41]) + (patternOp [0, 41] - [0, 44]) + (patternVal [0, 44] - [0, 58] + (string [0, 44] - [0, 58] + (string_content [0, 44] - [0, 58])))) + (booleanFilter [1, 0] - [1, 17] + (booleanField [1, 0] - [1, 10]) + (booleanOp [1, 10] - [1, 13]) + (booleanVal [1, 13] - [1, 17] + (True [1, 13] - [1, 17]))) + (booleanFilter [1, 18] - [1, 36] + (booleanField [1, 18] - [1, 28]) + (booleanOp [1, 28] - [1, 31]) + (booleanVal [1, 31] - [1, 36] + (False [1, 31] - [1, 36]))) + (numericFilter [2, 0] - [2, 14] + (numericField [2, 0] - [2, 9]) + (numericOp [2, 10] - [2, 12] + (rangeOp [2, 10] - [2, 12])) + (numberVal [2, 13] - [2, 14] + (number [2, 13] - [2, 14]))) + (numericFilter [2, 16] - [2, 30] + (numericField [2, 16] - [2, 25]) + (numericOp [2, 26] - [2, 27] + (rangeOp [2, 26] - [2, 27])) + (numberVal [2, 28] - [2, 30] + (number [2, 28] - [2, 30]))) + (numericFilter [2, 31] - [2, 47] + (numericField [2, 31] - [2, 40]) + (numericOp [2, 41] - [2, 43] + (rangeOp [2, 41] - [2, 43])) + (numberVal [2, 44] - [2, 47] + (number [2, 44] - [2, 47]))) + (numericFilter [2, 48] - [2, 64] + (numericField [2, 48] - [2, 57]) + (numericOp [2, 58] - [2, 59] + (rangeOp [2, 58] - [2, 59])) + (numberVal [2, 60] - [2, 64] + (number [2, 60] - [2, 64]))) + (boundedListFilter [3, 0] - [3, 64] + (sortByFilter [3, 0] - [3, 64] + (sortByField [3, 0] - [3, 7]) + (sortByOp [3, 8] - [3, 9]) + (sortByVal [3, 10] - [3, 64] + (sortByOptions [3, 12] - [3, 21]) + (sortByOptions [3, 25] - [3, 43]) + (sortByOptions [3, 46] - [3, 63])))) + (dateFilter [4, 0] - [4, 39] + (dateField [4, 0] - [4, 17]) + (dateOp [4, 18] - [4, 19] + (rangeOp [4, 18] - [4, 19])) + (dateVal [4, 20] - [4, 39] + (dateWithOptionalTime [4, 20] - [4, 39]))) + (unboundedListFilter [5, 0] - [5, 36] + (listField [5, 0] - [5, 8]) + (listOp [5, 9] - [5, 11]) + (listVal [5, 12] - [5, 36] + (array [5, 12] - [5, 36] + (string_content [5, 14] - [5, 17]) + (string_content [5, 20] - [5, 25]) + (string_content [5, 28] - [5, 
34]))))) diff --git a/swh/search/parser/yarn.lock b/swh/search/parser/yarn.lock new file mode 100644 --- /dev/null +++ b/swh/search/parser/yarn.lock @@ -0,0 +1,13 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. +# yarn lockfile v1 + + +nan@^2.14.2: + version "2.14.2" + resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.2.tgz#f5376400695168f4cc694ac9393d0c9585eeea19" + integrity sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ== + +tree-sitter-cli@^0.20.0: + version "0.20.0" + resolved "https://registry.yarnpkg.com/tree-sitter-cli/-/tree-sitter-cli-0.20.0.tgz#feaaa11c7ecf44a6e236aa1e2963b85d045d33cc" + integrity sha512-4D1qapWbJXZ5rrSUGM5rcw5Vuq/smzn9KbiFRhlON6KeuuXjra+KAtDYVrDgAoLIG4ku+jbEEGrJxCptUGi3dg==