diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,7 @@
 .mypy_cache/
 .hypothesis/
 .vscode/
+node_modules/
+static/
+*.wasm
+*.so
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -0,0 +1 @@
+Kumar Shivendu
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1,38 @@
+YARN ?= yarn
+PYTHON ?= python3
+
+# Every ts-* target is a command, not a file: declare them phony so that a
+# stray file with the same name can never make them look up to date.
+.PHONY: ts-install ts-generate ts-dev ts-test ts-build-so ts-build-wasm ts-build ts-repl
+
+# Install the JavaScript dependencies declared in package.json.
+ts-install: package.json
+	$(YARN) install
+
+# Generate the parser sources from the grammar (package.json "generate" script).
+ts-generate: ts-install query_language/grammar.js
+	$(YARN) generate
+
+# Parse query_language/sample_query (package.json "dev" script).
+ts-dev: ts-install
+	$(YARN) dev
+
+# Run the tree-sitter corpus tests (package.json "test" script).
+ts-test: ts-install
+	$(YARN) test
+
+# query_language/src/ is generated output and is expected to be created by
+# ts-generate, so depending on ts-generate is enough. Listing the directory as a
+# prerequisite gave make a target with no rule, which can fail under `make -j`.
+ts-build-so: ts-generate
+	$(YARN) build-so
+
+ts-build-wasm: ts-generate
+	$(YARN) build-wasm
+
+ts-build: ts-build-so ts-build-wasm
+	@echo 'Build completed'
+
+# Build the wasm parser and start the tree-sitter playground.
+ts-repl: ts-generate
+	$(YARN) repl
diff --git a/file.sh b/file.sh
new file mode 100755
--- /dev/null
+++ b/file.sh
@@ -0,0 +1 @@
+echo 'Hello world'
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -19,3 +19,6 @@
 
 [mypy-pytest.*]
 ignore_missing_imports = True
+
+[mypy-tree_sitter.*]
+ignore_missing_imports = True
diff --git a/package.json b/package.json
new file mode 100644
--- /dev/null
+++ b/package.json
@@ -0,0 +1,37 @@
+{
+  "name": "swh-search-query-language-parser",
+  "version": "1.0.0",
+  "description": "Parser for Software Heritage archive search query language",
+  "scripts": {
+    "generate": "cd query_language && tree-sitter generate --no-bindings && echo 'Generated parser files '",
+    "dev": "yarn generate && cd query_language && tree-sitter parse sample_query",
+    "test": "yarn generate && cd query_language && tree-sitter test",
+    "build-so": "yarn generate && cd query_language && python3 build.py",
+    "build-wasm": "yarn generate && cd query_language && tree-sitter build-wasm . 
&& mv tree-sitter-swh_search_ql.wasm swh_ql.wasm",
+    "build": "yarn build-so && yarn build-wasm",
+    "repl": "yarn generate && cd query_language && tree-sitter build-wasm && tree-sitter playground"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://forge.softwareheritage.org/source/swh-search.git"
+  },
+  "keywords": [
+    "swh",
+    "Software Heritage",
+    "treesitter",
+    "parser",
+    "custom",
+    "search",
+    "query",
+    "language"
+  ],
+  "author": "The Software Heritage developers",
+  "license": "GPL-3.0-only",
+  "dependencies": {
+    "nan": "^2.14.2"
+  },
+  "devDependencies": {
+    "tree-sitter-cli": "^0.20.0",
+    "tree-sitter-javascript": "^0.19.0"
+  }
+}
diff --git a/pyproject.toml b/pyproject.toml
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,3 +9,6 @@
 ensure_newline_before_comments = true
 line_length = 88
 force_sort_within_sections = true
+
+[build-system]
+requires = ["setuptools", "wheel", "tree_sitter"]
diff --git a/query_language/.gitignore b/query_language/.gitignore
new file mode 100644
--- /dev/null
+++ b/query_language/.gitignore
@@ -0,0 +1,6 @@
+src
+build
+bindings
+binding.gyp
+Cargo.toml
+package.json
diff --git a/query_language/build-wasm.sh b/query_language/build-wasm.sh
new file mode 100755
--- /dev/null
+++ b/query_language/build-wasm.sh
@@ -0,0 +1,132 @@
+#!/bin/sh
+#
+# Build the tree-sitter web binding (tree-sitter.wasm) with emscripten.
+#
+# emcc is taken from PATH when available; otherwise, or when --docker is
+# given, it is run from the emscripten/emsdk docker image. All paths are
+# relative to the current directory, which must contain grammar.js and the
+# generated src/ directory.
+
+set -e
+
+# Print the command-line help; used by --help and on unrecognized arguments.
+usage() {
+  cat <<EOF
+Usage: $0 [--debug] [--docker] [--help]
+
+  --debug   build with -O0, ASSERTIONS and SAFE_HEAP instead of -O3
+  --docker  run emcc from the docker image even if emcc is on PATH
+  --help    show this help and exit
+EOF
+}
+
+if [ ! -d "tree-sitter" ]; then
+  git clone https://github.com/tree-sitter/tree-sitter
+fi
+
+web_dir=tree-sitter/lib/binding_web
+lib_src=tree-sitter/lib/src
+lib_include=tree-sitter/lib/include
+emscripten_flags="-O3"
+minify_js=1
+force_docker=0
+emscripten_version=2.0.24
+
+# Plain [ ] tests: the script runs under /bin/sh, which need not know [[ ]].
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --debug)
+      minify_js=0
+      emscripten_flags="-s ASSERTIONS=1 -s SAFE_HEAP=1 -O0"
+      ;;
+
+    --help)
+      usage
+      exit 0
+      ;;
+
+    --docker)
+      force_docker=1
+      ;;
+
+    *)
+      usage
+      echo "Unrecognized argument '$1'"
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+emcc=
+if command -v emcc > /dev/null && [ "$force_docker" = "0" ]; then
+  echo 'Using emcc sdk directly'
+  emcc=emcc
+elif command -v docker > /dev/null; then
+  echo 'Using docker'
+  emcc="docker run \
+    --rm \
+    -v $(pwd):/src:Z \
+    -u $(id -u) \
+    emscripten/emsdk:$emscripten_version \
+    emcc"
+else
+  echo 'You must have either `docker` or `emcc` on your PATH to run this script'
+  exit 1
+fi
+
+mkdir -p target/scratch
+
+echo 'Copying files to target directory'
+cp -r grammar.js src/ tree-sitter/lib/
+
+echo 'Executing emcc command'
+
+# Use emscripten to generate `tree-sitter.js` and `tree-sitter.wasm`
+# in the `target/scratch` directory. $emcc is left unquoted on purpose: in
+# docker mode it holds the whole "docker run ... emcc" command line.
+$emcc \
+  -s WASM=1 \
+  -s TOTAL_MEMORY=33554432 \
+  -s ALLOW_MEMORY_GROWTH=1 \
+  -s MAIN_MODULE=2 \
+  -s NO_FILESYSTEM=1 \
+  -s NODEJS_CATCH_EXIT=0 \
+  -s NODEJS_CATCH_REJECTION=0 \
+  -s EXPORTED_FUNCTIONS=@${web_dir}/exports.json \
+  $emscripten_flags \
+  -std=c99 \
+  -D 'fprintf(...)=' \
+  -D NDEBUG= \
+  -I ${lib_src} \
+  -I ${lib_src}/unicode \
+  -I ${lib_include} \
+  -I ${lib_include}/tree_sitter \
+  --js-library ${web_dir}/imports.js \
+  --pre-js ${web_dir}/prefix.js \
+  --post-js ${web_dir}/binding.js \
+  --post-js ${web_dir}/suffix.js \
+  ${lib_src}/lib.c \
+  ${web_dir}/binding.c \
+  -o target/scratch/tree-sitter.js
+
+# # Use terser to write a minified version of `tree-sitter.js` into
+# # the `lib/binding_web` directory.
+# if [[ "$minify_js" == "1" ]]; then
+#   if [ ! -d ${web_dir}/node_modules/terser ]; then
+#     (
+#       cd ${web_dir}
+#       npm install
+#     )
+#   fi
+#   ${web_dir}/node_modules/.bin/terser \
+#     --compress \
+#     --mangle \
+#     --keep-classnames \
+#     -- target/scratch/tree-sitter.js \
+#     > $web_dir/tree-sitter.js
+# else
+#   cp target/scratch/tree-sitter.js $web_dir/tree-sitter.js
+# fi
+
+mv target/scratch/tree-sitter.wasm $web_dir/tree-sitter.wasm
diff --git a/query_language/build.py b/query_language/build.py
new file mode 100644
--- /dev/null
+++ b/query_language/build.py
@@ -0,0 +1,3 @@
+from tree_sitter import Language
+
+Language.build_library("swh_ql.so", ["."])
diff --git a/query_language/grammar.js b/query_language/grammar.js
new file mode 100644
--- /dev/null
+++ b/query_language/grammar.js
@@ -0,0 +1,132 @@
+// Copyright (C) 2019-2021 The Software Heritage developers
+// See the AUTHORS file at the top-level directory of this distribution
+// License: GNU General Public License version 3, or any later version
+// See top-level LICENSE file for more information
+
+module.exports = grammar({
+    name: 'swh_search_ql',
+
+    rules: {
+        query: $ => repeat(
+            choice(
+                $.patternFilter,
+                $.booleanFilter,
+                $.numericFilter,
+                $.unboundedListFilter,
+                $.boundedListFilter,
+                $.dateFilter,
+                $.limitFilter
+            )
+        ),
+
+        patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal),
+        patternField: $ => token(choice('url', 'metadata')),
+        patternOp: $ => $.colonOp,
+        patternVal: $ => $.string,
+
+        booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal),
+        booleanField: $ => token(choice('with_visit')),
+        booleanOp: $ => $.colonOp,
+        booleanVal: $ => choice($.booleanTrue, $.booleanFalse),
+
+        numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal),
+        numericField: $ => token(choice('nb_visits')),
+        numericOp: $ => $.rangeOp,
+        numberVal: $ => $.number,
+
+        boundedListFilter: $ => choice($.visitTypeFilter, $.sortByFilter),
+
+        visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal),
+        visitTypeField: $ => token(choice('visit_types')),
+        visitTypeOp: $ => $.colonOp,
+        visitTypeVal: $ => createArray($.visitTypeOptions),
+        visitTypeOptions: $ => choice(
+            "any",
+            "cran",
+            "deb",
+            "deposit",
+            "ftp",
+            "hg",
+            "git",
+            "nixguix",
+            "npm",
+            "pypi",
+            "svn",
+            "tar"
+        ), // TODO: fetch this list dynamically from other swh services?
+
+        sortByFilter: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
+        sortByField: $ => token(choice('sort_by')),
+        sortByOp: $ => $.colonOp,
+        sortByVal: $ => createArray($.sortByOptions),
+        sortByOptions: $ => choice(
+            'nb_visits',
+            'last_visit_date',
+            'last_eventful_visit_date',
+            'last_revision_date',
+            'last_release_date',
+            'date_created',
+            'date_modified',
+            'date_published'
+        ),
+
+        unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
+        listField: $ => token(choice('programming_languages', 'licenses', 'keywords')),
+        listOp: $ => $.choiceOp,
+        listVal: $ => createArray($.words), // Needs to be fixed !!
+        // Fails on: licenses in ["MIT", BSD,] -- an unquoted `word` may contain ',' and ']', so `BSD,]` is read as one word
+
+        // <date field> <rangeOp> <date>, e.g. last_release_date < 2001-02-13 15:54:21
+        dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal),
+        dateField: $ => token(choice(
+            'last_visit_date',
+            'last_eventful_visit_date',
+            'last_revision_date',
+            'last_release_date',
+            'date_created',
+            'date_modified',
+            'date_published'
+        )),
+        dateOp: $ => $.rangeOp,
+        dateVal: $ => $.dateWithOptionalTime,
+
+        limitFilter: $ => seq('limit', $.colonOp, $.number),
+
+        // Operators shared by the filters above
+        rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')),
+        colonOp: $ => token(':'),
+        choiceOp: $ => token(choice('in', 'not in')),
+        // YYYY-MM-DD [HH:MM[:SS]]; `(\s|T)*` also matches any whitespace (newlines included) after the date
+        dateWithOptionalTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?/,
+
+        string: $ => choice(wrapWithInvertedComma($.words), $.word),
+        number: $ => /\d+/,
+        booleanTrue: $ => "true",
+        booleanFalse: $ => "false",
+        // A word is any run of characters other than whitespace and quote characters
+        words: $ => repeat1(seq($.word)),
+        word: $ => /[^\s"']+/,
+
+    }
+});
+// `rule`, then any number of ("," optionally followed by `rule`): allows trailing and repeated commas
+function commaSep1(rule) {
+    return seq(rule, repeat(seq(",", optional(rule))))
+}
+// Same as commaSep1, but the whole list may be empty
+function commaSep(rule) {
+    return optional(commaSep1(rule))
+}
+// "[" members "]": each member is `rule`, bare or quoted, and is tagged with the field name array_member
+function createArray(rule) {
+    return seq('[', commaSep(
+        field('array_member', (choice(wrapWithInvertedComma(rule), rule)))
+    ), ']')
+}
+// `rule` enclosed in a matching pair of single or double quotes
+function wrapWithInvertedComma(rule) {
+    return choice(
+        seq("'", rule, "'"),
+        seq('"', rule, '"')
+    )
+}
diff --git a/query_language/sample_query b/query_language/sample_query
new file mode 100644
--- /dev/null
+++ b/query_language/sample_query
@@ -0,0 +1,6 @@
+url : "github.com/django/Django" metadata : "Repo description"
+with_visit : true with_visit : false
+nb_visits >= 0 nb_visits = 10 nb_visits != 256 nb_visits < 1000
+sort_by : ["nb_visits", "last_revision_date nb_visits", last_release_date,]
+last_release_date < 2001-02-13 15:54:21
+licenses in ["MIT", "BSD X", "Apache XZY ABC", ]
diff --git a/query_language/test/corpus/booleanFilters.txt b/query_language/test/corpus/booleanFilters.txt
new file mode 100644
--- /dev/null
+++ b/query_language/test/corpus/booleanFilters.txt
@@ -0,0 +1,42 @@
+==================
+boolean filter
+==================
+
+with_visit: true
+with_visit: false
+with_visit :"true"
+with_visit : 'false'
+
+---
+
+
+(query [0, 0] - [4, 0]
+  (booleanFilter [0, 0] - [0, 16]
+    (booleanField [0, 0] - [0, 10])
+    (booleanOp [0, 10] - [0, 11]
+      (colonOp [0, 10] - [0, 11]))
+    (booleanVal [0, 12] - [0, 16]
+      (booleanTrue [0, 12] - [0, 16])))
+  (booleanFilter [1, 0] - [1, 17]
+    (booleanField [1, 0] - [1, 10])
+    (booleanOp [1, 10] - [1, 11]
+      (colonOp [1, 10] - [1, 11]))
+    (booleanVal [1, 12] - [1, 17]
+      (booleanFalse [1, 12] - [1, 17])))
+  (booleanFilter [2, 0] - [2, 17]
+    (booleanField [2, 0] - [2, 10])
+    (booleanOp [2, 11] - [2, 12]
+      (colonOp [2, 11] - [2, 12]))
+    (ERROR [2, 12] - [2, 13])
+    (booleanVal [2, 13] - [2, 17]
+      (booleanTrue [2, 13] - [2, 17])))
+  (ERROR [2, 17] - [2, 18])
+  (booleanFilter [3, 0] - [3, 19]
+    (booleanField [3, 0] - [3, 10])
+    (booleanOp [3, 11] - [3, 12]
+      (colonOp [3, 11] - [3, 12]))
+    (ERROR [3, 13] - [3, 
14]) + (booleanVal [3, 14] - [3, 19] + (booleanFalse [3, 14] - [3, 19]))) + (ERROR [3, 19] - [3, 20])) +sample_query 0 ms (ERROR [2, 12] - [2, 13]) diff --git a/query_language/test/corpus/boundedListFilters.txt b/query_language/test/corpus/boundedListFilters.txt new file mode 100644 --- /dev/null +++ b/query_language/test/corpus/boundedListFilters.txt @@ -0,0 +1,51 @@ +================== +sort_by filter +================== + +sort_by : ["nb_visits", "last_revision_date", last_release_date] +sort_by :[some_invalid_field, "last_eventful_visit_date", "date_created"] +sort_by: ["date_published", 'date_modified'] +sort_by:['date_published', "last_visit_date"] + +--- + + +(query [0, 0] - [4, 0] + (boundedListFilter [0, 0] - [0, 64] + (sortByFilter [0, 0] - [0, 64] + (sortByField [0, 0] - [0, 7]) + (sortByOp [0, 8] - [0, 9] + (colonOp [0, 8] - [0, 9])) + (sortByVal [0, 10] - [0, 64] + array_member: (sortByOptions [0, 12] - [0, 21]) + array_member: (sortByOptions [0, 25] - [0, 43]) + array_member: (sortByOptions [0, 46] - [0, 63])))) + (boundedListFilter [1, 0] - [1, 73] + (sortByFilter [1, 0] - [1, 73] + (sortByField [1, 0] - [1, 7]) + (sortByOp [1, 8] - [1, 9] + (colonOp [1, 8] - [1, 9])) + (sortByVal [1, 9] - [1, 73] + (ERROR [1, 10] - [1, 29] + (ERROR [1, 10] - [1, 15]) + (choiceOp [1, 15] - [1, 17]) + (ERROR [1, 17] - [1, 28])) + array_member: (sortByOptions [1, 31] - [1, 55]) + array_member: (sortByOptions [1, 59] - [1, 71])))) + (boundedListFilter [2, 0] - [2, 44] + (sortByFilter [2, 0] - [2, 44] + (sortByField [2, 0] - [2, 7]) + (sortByOp [2, 7] - [2, 8] + (colonOp [2, 7] - [2, 8])) + (sortByVal [2, 9] - [2, 44] + array_member: (sortByOptions [2, 11] - [2, 25]) + array_member: (sortByOptions [2, 29] - [2, 42])))) + (boundedListFilter [3, 0] - [3, 45] + (sortByFilter [3, 0] - [3, 45] + (sortByField [3, 0] - [3, 7]) + (sortByOp [3, 7] - [3, 8] + (colonOp [3, 7] - [3, 8])) + (sortByVal [3, 8] - [3, 45] + array_member: (sortByOptions [3, 10] - [3, 24]) + array_member: 
(sortByOptions [3, 28] - [3, 43]))))) +sample_query 0 ms (ERROR [1, 10] - [1, 29]) diff --git a/query_language/test/corpus/combinations.txt b/query_language/test/corpus/combinations.txt new file mode 100644 --- /dev/null +++ b/query_language/test/corpus/combinations.txt @@ -0,0 +1,82 @@ +================== +Combinations +================== + +url = "github.com/django/Django" metadata = something in metadata +with_visit = true with_visit = false +nb_visits >= 0 nb_visits = 10 nb_visits != 256 nb_visits < 1000 +sort_by = ["nb_visits", "last_revision_date", last_release_date, ] +last_release_date < 2001-02-13 15:54:21 +licenses in ["MIT","BSD X","Apache", ] + +--- + +(query [0, 0] - [6, 0] + (patternFilter [0, 0] - [0, 32] + (patternField [0, 0] - [0, 3]) + (patternOp [0, 3] - [0, 6]) + (patternVal [0, 6] - [0, 32] + (string [0, 6] - [0, 32] + (string_content [0, 7] - [0, 31])))) + (patternFilter [0, 33] - [0, 65] + (patternField [0, 33] - [0, 41]) + (patternOp [0, 41] - [0, 44]) + (patternVal [0, 44] - [0, 65] + (string [0, 44] - [0, 65] + (string_content [0, 44] - [0, 65])))) + (booleanFilter [1, 0] - [1, 17] + (booleanField [1, 0] - [1, 10]) + (booleanOp [1, 10] - [1, 13]) + (booleanVal [1, 13] - [1, 17] + (booleanTrue [1, 13] - [1, 17]))) + (booleanFilter [1, 18] - [1, 36] + (booleanField [1, 18] - [1, 28]) + (booleanOp [1, 28] - [1, 31]) + (booleanVal [1, 31] - [1, 36] + (booleanFalse [1, 31] - [1, 36]))) + (numericFilter [2, 0] - [2, 14] + (numericField [2, 0] - [2, 9]) + (numericOp [2, 10] - [2, 12] + (rangeOp [2, 10] - [2, 12])) + (numberVal [2, 13] - [2, 14] + (number [2, 13] - [2, 14]))) + (numericFilter [2, 16] - [2, 30] + (numericField [2, 16] - [2, 25]) + (numericOp [2, 26] - [2, 27] + (rangeOp [2, 26] - [2, 27])) + (numberVal [2, 28] - [2, 30] + (number [2, 28] - [2, 30]))) + (numericFilter [2, 31] - [2, 47] + (numericField [2, 31] - [2, 40]) + (numericOp [2, 41] - [2, 43] + (rangeOp [2, 41] - [2, 43])) + (numberVal [2, 44] - [2, 47] + (number [2, 44] - 
[2, 47]))) + (numericFilter [2, 48] - [2, 64] + (numericField [2, 48] - [2, 57]) + (numericOp [2, 58] - [2, 59] + (rangeOp [2, 58] - [2, 59])) + (numberVal [2, 60] - [2, 64] + (number [2, 60] - [2, 64]))) + (boundedListFilter [3, 0] - [3, 66] + (sortByFilter [3, 0] - [3, 66] + (sortByField [3, 0] - [3, 7]) + (sortByOp [3, 7] - [3, 10]) + (sortByVal [3, 10] - [3, 66] + array_member: (sortByOptions [3, 12] - [3, 21]) + array_member: (sortByOptions [3, 25] - [3, 43]) + array_member: (sortByOptions [3, 46] - [3, 63])))) + (dateFilter [4, 0] - [4, 39] + (dateField [4, 0] - [4, 17]) + (dateOp [4, 18] - [4, 19] + (rangeOp [4, 18] - [4, 19])) + (dateVal [4, 20] - [4, 39] + (dateWithOptionalTime [4, 20] - [4, 39]))) + (unboundedListFilter [5, 0] - [5, 38] + (listField [5, 0] - [5, 8]) + (listOp [5, 9] - [5, 11]) + (listVal [5, 12] - [5, 38] + array_member: (string_content [5, 14] - [5, 17]) + array_member: (string_content [5, 20] - [5, 25]) + array_member: (string_content [5, 28] - [5, 34])))) + diff --git a/query_language/test/corpus/dateFilters.txt b/query_language/test/corpus/dateFilters.txt new file mode 100644 --- /dev/null +++ b/query_language/test/corpus/dateFilters.txt @@ -0,0 +1,58 @@ +================== +date filters +================== + + +last_release_date< 2001-02-13 15:54:21 +last_revision_date <=2001-02-13 15:54 +date_created=2001-02-13 +last_eventful_visit_date!=2001-02-13 +date_modified>2010-02-13 15:54:21 +date_published>=2010-02-13 +some_non_existent_field>=2010-02-13 + + +--- + +(query [0, 0] - [7, 0] + (dateFilter [0, 0] - [0, 38] + (dateField [0, 0] - [0, 17]) + (dateOp [0, 17] - [0, 18] + (rangeOp [0, 17] - [0, 18])) + (dateVal [0, 19] - [0, 38] + (dateWithOptionalTime [0, 19] - [0, 38]))) + (dateFilter [1, 0] - [1, 40] + (dateField [1, 0] - [1, 18]) + (dateOp [1, 19] - [1, 21] + (rangeOp [1, 19] - [1, 21])) + (dateVal [1, 21] - [1, 40] + (dateWithOptionalTime [1, 21] - [1, 40]))) + (dateFilter [2, 0] - [3, 0] + (dateField [2, 0] - [2, 12]) + (dateOp 
[2, 12] - [2, 13] + (rangeOp [2, 12] - [2, 13])) + (dateVal [2, 13] - [3, 0] + (dateWithOptionalTime [2, 13] - [3, 0]))) + (dateFilter [3, 0] - [4, 0] + (dateField [3, 0] - [3, 24]) + (dateOp [3, 24] - [3, 26] + (rangeOp [3, 24] - [3, 26])) + (dateVal [3, 26] - [4, 0] + (dateWithOptionalTime [3, 26] - [4, 0]))) + (dateFilter [4, 0] - [5, 0] + (dateField [4, 0] - [4, 13]) + (dateOp [4, 13] - [4, 14] + (rangeOp [4, 13] - [4, 14])) + (dateVal [4, 14] - [5, 0] + (dateWithOptionalTime [4, 14] - [5, 0]))) + (dateFilter [5, 0] - [6, 0] + (dateField [5, 0] - [5, 14]) + (dateOp [5, 14] - [5, 16] + (rangeOp [5, 14] - [5, 16])) + (dateVal [5, 16] - [6, 0] + (dateWithOptionalTime [5, 16] - [6, 0]))) + (ERROR [6, 0] - [7, 0] + (ERROR [6, 0] - [6, 23]) + (rangeOp [6, 23] - [6, 25]) + (dateWithOptionalTime [6, 25] - [7, 0]))) +sample_query 0 ms (ERROR [6, 0] - [7, 0]) diff --git a/query_language/test/corpus/numericFilters.txt b/query_language/test/corpus/numericFilters.txt new file mode 100644 --- /dev/null +++ b/query_language/test/corpus/numericFilters.txt @@ -0,0 +1,51 @@ +================== +numeric filter +================== + +nb_visits< 1000 +nb_visits <= 0 +nb_visits =10 +nb_visits != 256 +nb_visits> 1000 +nb_visits>=1000 + +--- + + +(query [0, 0] - [5, 15] + (numericFilter [0, 0] - [0, 15] + (numericField [0, 0] - [0, 9]) + (numericOp [0, 9] - [0, 10] + (rangeOp [0, 9] - [0, 10])) + (numberVal [0, 11] - [0, 15] + (number [0, 11] - [0, 15]))) + (numericFilter [1, 0] - [1, 14] + (numericField [1, 0] - [1, 9]) + (numericOp [1, 10] - [1, 12] + (rangeOp [1, 10] - [1, 12])) + (numberVal [1, 13] - [1, 14] + (number [1, 13] - [1, 14]))) + (numericFilter [2, 0] - [2, 13] + (numericField [2, 0] - [2, 9]) + (numericOp [2, 10] - [2, 11] + (rangeOp [2, 10] - [2, 11])) + (numberVal [2, 11] - [2, 13] + (number [2, 11] - [2, 13]))) + (numericFilter [3, 0] - [3, 16] + (numericField [3, 0] - [3, 9]) + (numericOp [3, 10] - [3, 12] + (rangeOp [3, 10] - [3, 12])) + (numberVal [3, 13] - [3, 
16] + (number [3, 13] - [3, 16]))) + (numericFilter [4, 0] - [4, 15] + (numericField [4, 0] - [4, 9]) + (numericOp [4, 9] - [4, 10] + (rangeOp [4, 9] - [4, 10])) + (numberVal [4, 11] - [4, 15] + (number [4, 11] - [4, 15]))) + (numericFilter [5, 0] - [5, 15] + (numericField [5, 0] - [5, 9]) + (numericOp [5, 9] - [5, 11] + (rangeOp [5, 9] - [5, 11])) + (numberVal [5, 11] - [5, 15] + (number [5, 11] - [5, 15])))) diff --git a/query_language/test/corpus/patternFilters.txt b/query_language/test/corpus/patternFilters.txt new file mode 100644 --- /dev/null +++ b/query_language/test/corpus/patternFilters.txt @@ -0,0 +1,94 @@ +================== +url filter +================== + +url: github.com/django/Django +url:github.com/\django/Django +url :"github.com/\django/nDjango" +url : 'github com \/ django Django' + +--- + +(query [1, 0] - [5, 0] + (patternFilter [1, 0] - [1, 29] + (patternField [1, 0] - [1, 3]) + (patternOp [1, 3] - [1, 4] + (colonOp [1, 3] - [1, 4])) + (patternVal [1, 5] - [1, 29] + (string [1, 5] - [1, 29] + (word [1, 5] - [1, 29])))) + (patternFilter [2, 0] - [2, 29] + (patternField [2, 0] - [2, 3]) + (patternOp [2, 3] - [2, 4] + (colonOp [2, 3] - [2, 4])) + (patternVal [2, 4] - [2, 29] + (string [2, 4] - [2, 29] + (word [2, 4] - [2, 29])))) + (patternFilter [3, 0] - [3, 33] + (patternField [3, 0] - [3, 3]) + (patternOp [3, 4] - [3, 5] + (colonOp [3, 4] - [3, 5])) + (patternVal [3, 5] - [3, 33] + (string [3, 5] - [3, 33] + (words [3, 6] - [3, 32] + (word [3, 6] - [3, 32]))))) + (patternFilter [4, 0] - [4, 35] + (patternField [4, 0] - [4, 3]) + (patternOp [4, 4] - [4, 5] + (colonOp [4, 4] - [4, 5])) + (patternVal [4, 6] - [4, 35] + (string [4, 6] - [4, 35] + (words [4, 7] - [4, 34] + (word [4, 7] - [4, 13]) + (word [4, 14] - [4, 17]) + (word [4, 18] - [4, 20]) + (word [4, 21] - [4, 27]) + (word [4, 28] - [4, 34])))))) + + +================== +metadata filter +================== + +metadata: https://github.com/python/mypy +metadata: static,typing +metadata 
:"static typing" +metadata : 'python join us issues' + +--- + +(query [1, 0] - [5, 0] + (patternFilter [1, 0] - [1, 40] + (patternField [1, 0] - [1, 8]) + (patternOp [1, 8] - [1, 9] + (colonOp [1, 8] - [1, 9])) + (patternVal [1, 10] - [1, 40] + (string [1, 10] - [1, 40] + (word [1, 10] - [1, 40])))) + (patternFilter [2, 0] - [2, 23] + (patternField [2, 0] - [2, 8]) + (patternOp [2, 8] - [2, 9] + (colonOp [2, 8] - [2, 9])) + (patternVal [2, 10] - [2, 23] + (string [2, 10] - [2, 23] + (word [2, 10] - [2, 23])))) + (patternFilter [3, 0] - [3, 25] + (patternField [3, 0] - [3, 8]) + (patternOp [3, 9] - [3, 10] + (colonOp [3, 9] - [3, 10])) + (patternVal [3, 10] - [3, 25] + (string [3, 10] - [3, 25] + (words [3, 11] - [3, 24] + (word [3, 11] - [3, 17]) + (word [3, 18] - [3, 24]))))) + (patternFilter [4, 0] - [4, 34] + (patternField [4, 0] - [4, 8]) + (patternOp [4, 9] - [4, 10] + (colonOp [4, 9] - [4, 10])) + (patternVal [4, 11] - [4, 34] + (string [4, 11] - [4, 34] + (words [4, 12] - [4, 33] + (word [4, 12] - [4, 18]) + (word [4, 19] - [4, 23]) + (word [4, 24] - [4, 26]) + (word [4, 27] - [4, 33])))))) diff --git a/query_language/test/corpus/unboundedListFilter.txt b/query_language/test/corpus/unboundedListFilter.txt new file mode 100644 --- /dev/null +++ b/query_language/test/corpus/unboundedListFilter.txt @@ -0,0 +1,40 @@ +================== +sort_by filter +================== + + +licenses in ["MIT", "BSD X","Apache version 1.0"] +licenses in ["MIT", BSD,] +licenses in [] + +--- + + +(query [0, 0] - [2, 14] + (unboundedListFilter [0, 0] - [0, 50] + (listField [0, 0] - [0, 8]) + (listOp [0, 9] - [0, 11] + (choiceOp [0, 9] - [0, 11])) + (listVal [0, 12] - [0, 50] + array_member: (words [0, 14] - [0, 17] + (word [0, 14] - [0, 17])) + array_member: (words [0, 21] - [0, 26] + (word [0, 21] - [0, 24]) + (word [0, 25] - [0, 26])) + array_member: (words [0, 29] - [0, 48] + (word [0, 29] - [0, 35]) + (word [0, 36] - [0, 40]) + (word [0, 41] - [0, 48])))) + 
(unboundedListFilter [1, 0] - [2, 14]
+    (listField [1, 0] - [1, 8])
+    (listOp [1, 9] - [1, 11]
+      (choiceOp [1, 9] - [1, 11]))
+    (listVal [1, 12] - [2, 14]
+      array_member: (words [1, 14] - [1, 17]
+        (word [1, 14] - [1, 17]))
+      array_member: (words [1, 20] - [2, 14]
+        (word [1, 20] - [1, 25])
+        (word [2, 0] - [2, 8])
+        (word [2, 9] - [2, 11])
+        (word [2, 12] - [2, 14])))))
+sample_query 0 ms (MISSING "]" [2, 14] - [2, 14])
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -5,3 +5,4 @@
 types-pytz
 types-pyyaml
 types-requests
+types-setuptools
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@
 click
 elasticsearch>=7.0.0,<8.0.0
 typing-extensions
+tree_sitter
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -4,8 +4,12 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from io import open
-from os import path
+from os import environ, getcwd, getuid, makedirs, path
+import shlex
+import shutil
+import subprocess
 
 from setuptools import find_packages, setup
+from setuptools.command.build_py import build_py
 
@@ -35,6 +39,77 @@
     return requirements
 
 
+class custom_build(build_py):
+    """``build_py`` that first builds the query language parser artifacts.
+
+    Unless this is a dry run, it generates the tree-sitter parser with yarn,
+    compiles it to ``swh_ql.wasm`` with emcc from the emscripten/emsdk docker
+    image, builds ``swh_ql.so`` with ``yarn build-so`` and copies both files to
+    ``static/``, where ``data_files`` picks them up.
+
+    Raises:
+        subprocess.CalledProcessError: when any build step exits non-zero
+        OSError: when yarn or docker cannot be started, or a copy fails
+    """
+
+    EMSDK_VERSION = "2.0.24"
+
+    def run(self):
+        if not self.dry_run:
+            # YARN may hold extra arguments, as it could when run through a shell
+            yarn = shlex.split(environ.get("YARN", "yarn"))
+            # check_call stops the build at the first failing step; os.system
+            # ignored failures and went on to package whatever was left over
+            subprocess.check_call(yarn + ["install"])
+            subprocess.check_call(yarn + ["generate"])
+
+            emsdk_image = f"emscripten/emsdk:{self.EMSDK_VERSION}"
+            subprocess.check_call(["docker", "pull", emsdk_image])
+
+            subprocess.check_call(
+                [
+                    "docker",
+                    "run",
+                    "--rm",
+                    "--workdir",
+                    "/src/query_language",
+                    "--volume",
+                    f"{getcwd()}:/src:Z",
+                    # run as the invoking user so that the output file can be
+                    # written to the mounted checkout (was hard-coded to 1000)
+                    "--user",
+                    str(getuid()),
+                    emsdk_image,
+                    "emcc",
+                    "-o",
+                    "swh_ql.wasm",
+                    "-Os",
+                    "-s",
+                    "WASM=1",
+                    "-s",
+                    "SIDE_MODULE=1",
+                    "-s",
+                    "TOTAL_MEMORY=33554432",
+                    "-s",
+                    "NODEJS_CATCH_EXIT=0",
+                    "-s",
+                    'EXPORTED_FUNCTIONS=["_tree_sitter_swh_search_ql"]',
+                    "-fno-exceptions",
+                    "-I",
+                    "src",
+                    "src/parser.c",
+                ]
+            )
+
+            subprocess.check_call(yarn + ["build-so"])
+
+            makedirs("static", exist_ok=True)
+            shutil.copy("query_language/swh_ql.so", "static/swh_ql.so")
+            shutil.copy("query_language/swh_ql.wasm", "static/swh_ql.wasm")
+
+        build_py.run(self)
+
+
 setup(
     name="swh.search",
     description="Software Heritage search service",
@@ -68,4 +143,6 @@
         "Source": "https://forge.softwareheritage.org/source/swh-search",
         "Documentation": "https://docs.softwareheritage.org/devel/swh-search/",
     },
+    cmdclass={"build_py": custom_build},
+    data_files=[("share/swh/search", ["static/swh_ql.so", "static/swh_ql.wasm"])],
 )
diff --git a/yarn.lock b/yarn.lock
new file mode 100644
--- /dev/null
+++ b/yarn.lock
@@ -0,0 +1,13 @@
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1
+
+
+nan@^2.14.2:
+  version "2.14.2"
+  resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.2.tgz#f5376400695168f4cc694ac9393d0c9585eeea19"
+  integrity sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ==
+
+tree-sitter-cli@^0.20.0:
+  version "0.20.0"
+  resolved "https://registry.yarnpkg.com/tree-sitter-cli/-/tree-sitter-cli-0.20.0.tgz#feaaa11c7ecf44a6e236aa1e2963b85d045d33cc"
+  integrity sha512-4D1qapWbJXZ5rrSUGM5rcw5Vuq/smzn9KbiFRhlON6KeuuXjra+KAtDYVrDgAoLIG4ku+jbEEGrJxCptUGi3dg==