diff --git a/.gitignore b/.gitignore --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ .mypy_cache/ .hypothesis/ .vscode/ +node_modules/ +static/ +*.wasm +*.so diff --git a/CONTRIBUTORS b/CONTRIBUTORS --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -0,0 +1 @@ +Kumar Shivendu
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1,48 @@
+# Developer targets for the tree-sitter parser of the search query language.
+# Each target wraps a custom setup.py command or a yarn script from
+# package.json.
+
+YARN ?= yarn
+PYTHON ?= python3
+
+# These targets are commands, not files: declare them phony so that a stray
+# file with the same name can never make them look up to date.
+.PHONY: ts-install ts-generate ts-dev ts-test ts-repl ts-build-so ts-build-wasm ts-build
+
+# Install the node dependencies (`setup.py ts_install`).
+ts-install: package.json
+	$(PYTHON) setup.py ts_install
+
+# Generate the parser files from the grammar (`setup.py ts_generate`).
+ts-generate: ts-install query_language/grammar.js
+	$(PYTHON) setup.py ts_generate
+
+# Run `yarn dev`. With `sanitize=1`, sed strips the `[N, N]` position pairs and
+# the ` - ` separators from line 5 of the output onwards.
+ts-dev: ts-install
+ifdef sanitize
+	$(YARN) dev | sed '5,$$s/[[0-9]\+, [0-9]\+]/ /g' | sed '5,$$s/ *- *//g'
+else
+	$(YARN) dev
+endif
+
+# Run the tree-sitter tests (`yarn test`).
+ts-test: ts-install
+	$(YARN) test
+
+# Run `yarn repl` (tree-sitter playground).
+ts-repl: ts-generate
+	$(YARN) repl
+
+# query_language/src/ is generated (it is git-ignored) and no rule builds it, so
+# listing it as a prerequisite fails whenever make looks at it before
+# ts-generate has finished, e.g. under `make -j`. Depending on ts-generate alone
+# is sufficient.
+ts-build-so: ts-generate
+	$(PYTHON) setup.py ts_build_so
+
+ts-build-wasm: ts-generate
+	$(PYTHON) setup.py ts_build_wasm
+
+ts-build: ts-build-so ts-build-wasm
+	@echo 'Build completed'
diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -11,20 +11,55 @@ Currently uses ElasticSearch, and provides only origin search (by URL and metadata) -# Dependencies +## Dependencies -Python tests for this module include tests that cannot be run without a local +- Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). -## Debian-like host + - Debian-like host -The elasticsearch package is required. As it's not part of debian-stable, -[another debian repository is required to be -configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) + The elasticsearch package is required. 
As it's not part of debian-stable, + [another debian repository is required to be + configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) -## Non Debian-like host + - Non Debian-like host + + The tests expect: + - `/usr/share/elasticsearch/jdk/bin/java` to exist. + - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. +- Emscripten is required for generating the tree-sitter WASM module. The following commands need to be executed for the setup: + ```bash + cd /opt && git clone https://github.com/emscripten-core/emsdk.git && cd emsdk && \ + ./emsdk install latest && ./emsdk activate latest + PATH="${PATH}:/opt/emsdk/upstream/emscripten" + ``` + + **Note:** If emsdk isn't found in the PATH, the tree-sitter cli automatically pulls the `emscripten/emsdk` image from Docker Hub when `make ts-build-wasm` or `make ts-build` is used. + + +## Make targets + +Below is the list of available make targets that can be executed from the root directory of swh-search in order to build and/or execute the swh-search under various configurations: + +* **ts-install**: Install the node_modules required for TreeSitter (the emscripten SDK is not installed by this target; see Dependencies above) + +* **ts-generate**: Generate parser files (C and JSON) from the grammar + +* **ts-repl**: Starts a web-based playground for the TreeSitter grammar. It's the recommended way for developing TreeSitter grammar. + +* **ts-dev**: Parse the `query_language/sample_query` and print the corresponding syntax expression +along with the start and end positions of all the nodes. + +* **ts-dev sanitize=1**: Same as **ts-dev** but without start and end position of the nodes. +This format is expected by TreeSitter's native test command. `sanitize=1` cleans the output +of **ts-dev** using `sed` to achieve the desired format. 
+ +* **ts-test**: Executes TreeSitter's native tests + +* **ts-build-so**: Generates `swh_ql.so` file from the previously generated parser using py-tree-sitter + +* **ts-build-wasm**: Generates `swh_ql.wasm` file from the previously generated parser using emscripten + +* **ts-build**: Executes both **ts-build-so** and **ts-build-wasm** -The tests expect: -- `/usr/share/elasticsearch/jdk/bin/java` to exist. -- `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -19,3 +19,6 @@ [mypy-pytest.*] ignore_missing_imports = True + +[mypy-tree_sitter.*] +ignore_missing_imports = True diff --git a/package.json b/package.json new file mode 100644 --- /dev/null +++ b/package.json @@ -0,0 +1,36 @@ +{ + "name": "swh-search-query-language-parser", + "version": "1.0.0", + "description": "Parser for Software Heritage archive search query language", + "scripts": { + "generate": "cd query_language && tree-sitter generate --no-bindings && echo 'Generated parser files '", + "dev": "yarn generate && cd query_language && tree-sitter parse sample_query", + "test": "yarn generate && cd query_language && tree-sitter test", + "build-so": "yarn generate && cd query_language && python3 build.py", + "build-wasm": "yarn generate && cd query_language && tree-sitter build-wasm . 
&& mv tree-sitter-swh_search_ql.wasm swh_ql.wasm", + "build": "yarn build-so && yarn build-wasm", + "repl": "yarn generate && cd query_language && tree-sitter build-wasm && tree-sitter playground" + }, + "repository": { + "type": "git", + "url": "https://forge.softwareheritage.org/source/swh-search.git" + }, + "keywords": [ + "swh", + "Software Heritage", + "treesitter", + "parser", + "custom", + "search", + "query", + "language" + ], + "author": "The Software Heritage developers", + "license": "GPL-3.0-only", + "dependencies": { + "nan": "^2.14.2" + }, + "devDependencies": { + "tree-sitter-cli": "^0.20.0" + } +} diff --git a/pyproject.toml b/pyproject.toml --- a/pyproject.toml +++ b/pyproject.toml @@ -9,3 +9,6 @@ ensure_newline_before_comments = true line_length = 88 force_sort_within_sections = true + +[build-system] +requires = ["setuptools", "wheel", "tree_sitter"] diff --git a/query_language/.gitignore b/query_language/.gitignore new file mode 100644 --- /dev/null +++ b/query_language/.gitignore @@ -0,0 +1,7 @@ +src +build +bindings +binding.gyp +Cargo.toml +package.json +log.html diff --git a/query_language/build.py b/query_language/build.py new file mode 100644 --- /dev/null +++ b/query_language/build.py @@ -0,0 +1,3 @@ +from tree_sitter import Language + +Language.build_library("swh_ql.so", ["."])
diff --git a/query_language/grammar.js b/query_language/grammar.js
new file mode 100644
--- /dev/null
+++ b/query_language/grammar.js
@@ -0,0 +1,207 @@
+// Copyright (C) 2019-2021 The Software Heritage developers
+// See the AUTHORS file at the top-level directory of this distribution
+// License: GNU General Public License version 3, or any later version
+// See top-level LICENSE file for more information
+
+
+// Precedence of the constructs that combine filters: the higher the number,
+// the tighter the binding, so `and` groups before `or`.
+const PRECEDENCE = {
+    or: 2,
+    and: 3,
+    bracket: 4,
+}
+
+module.exports = grammar({
+    name: 'swh_search_ql',
+
+    rules: {
+        // Entry point: a query is a combination of filters.
+        query: $ => $.filters,
+
+        // Filters joined by left-associative `and` / `or`, optionally grouped
+        // with parentheses, down to a single filter.
+        filters: $ => choice(
+            prec.left(PRECEDENCE.and,
+                seq(
+                    field('left', $.filters),
+                    field('operator', $.and),
+                    field('right', $.filters),
+                )
+            ),
+            prec.left(PRECEDENCE.or,
+                seq(
+                    field('left', $.filters),
+                    field('operator', $.or),
+                    field('right', $.filters),
+                )
+            ),
+            prec.left(PRECEDENCE.bracket,
+                seq("(", $.filters, ")"),
+            ),
+            $.filter
+        ),
+
+        // One `<field> <operator> <value>` condition.
+        filter: $ => choice(
+            $.patternFilter,
+            $.booleanFilter,
+            $.numericFilter,
+            $.boundedListFilter,
+            $.unboundedListFilter,
+            $.dateFilter,
+            $.limitFilter
+        ),
+
+        // `origin` / `metadata` compared to a string.
+        patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal),
+        patternField: $ => token(choice('origin', 'metadata')),
+        patternOp: $ => $.equalOp,
+        patternVal: $ => $.string,
+
+        // `visited` compared to `true` / `false`.
+        booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal),
+        booleanField: $ => token(choice('visited')),
+        booleanOp: $ => $.equalOp,
+        booleanVal: $ => choice($.booleanTrue, $.booleanFalse),
+
+        // `visits` compared to a number with a range operator.
+        numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal),
+        numericField: $ => token(choice('visits')),
+        numericOp: $ => $.rangeOp,
+        numberVal: $ => $.number,
+
+        // Filters whose list members are restricted to a fixed set of options.
+        boundedListFilter: $ => choice($.visitTypeFilter, $.sortByFilter),
+
+        visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal),
+        visitTypeField: $ => token(choice('visit_type')),
+        visitTypeOp: $ => $.equalOp,
+        visitTypeVal: $ => createArray(optionalWrapWith($.visitTypeOptions, ["'", '"'])),
+        visitTypeOptions: $ => choice(
+            "any",
+            "cran",
+            "deb",
+            "deposit",
+            "ftp",
+            "hg",
+            "git",
+            "nixguix",
+            "npm",
+            "pypi",
+            "svn",
+            "tar"
+        ), // TODO: fetch this list dynamically from other swh services?
+
+        // Each `sort_by` option may carry an immediate `-` prefix
+        // (presumably meaning descending order -- confirm with the consumer).
+        sortByFilter: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
+        sortByField: $ => token(choice('sort_by')),
+        sortByOp: $ => $.equalOp,
+        sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
+        sortByOptions: $ => seq(
+            optional(token.immediate('-')),
+            choice(
+                'visits',
+                'last_visit',
+                'last_eventful_visit',
+                'last_revision',
+                'last_release',
+                'created',
+                'modified',
+                'published'
+            )),
+
+        // Filters taking a list of arbitrary strings.
+        unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
+        listField: $ => token(choice('language', 'license', 'keyword')),
+        listOp: $ => $.choiceOp,
+        listVal: $ => createArray($.string),
+
+
+        // Date fields compared to an ISO-like date with a range operator.
+        dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal),
+        dateField: $ => token(choice(
+            'last_visit',
+            'last_eventful_visit',
+            'last_revision',
+            'last_release',
+            'created',
+            'modified',
+            'published'
+        )),
+        dateOp: $ => $.rangeOp,
+        dateVal: $ => $.isoDateTime,
+
+        limitFilter: $ => seq('limit', $.equalOp, $.number),
+
+
+        rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')),
+        equalOp: $ => token('='),
+        choiceOp: $ => token(choice('in', 'not in')),
+
+        isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/,
+
+        // A string is either quoted (single or double quotes) or a bare word.
+        string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord),
+        number: $ => /\d+/,
+        booleanTrue: $ => "true",
+        booleanFalse: $ => "false",
+
+        or: $ => "or",
+        and: $ => "and",
+
+        // Raw text of a quoted string. Both quote characters are excluded, not
+        // just `"`: otherwise the text of a single-quoted string swallows its
+        // closing `'` and the string can never be terminated. A literal quote
+        // must be written as an escape sequence (`\'` or `\"`).
+        stringContent: $ => repeat1(choice(
+            token.immediate(/[^\\'"\n]+/),
+            $.escape_sequence
+        )),
+        singleWord: $ => /[^\s"'\[\]\(\)]+/,
+        escape_sequence: $ => token.immediate(seq(
+            '\\',
+            /(\"|\'|\\|\/|b|n|r|t|u)/
+        )),
+
+    }
+});
+
+
+function joinBySep1(rule, sep) {
+    // One `rule` followed by any number of `sep`-introduced members. The member
+    // after a separator is optional, so empty slots and a trailing `sep` are
+    // accepted.
+    return seq(rule, repeat(seq(sep, optional(rule))))
+}
+
+function joinBySep(rule, sep = ",") {
+    // Any number of repetitions of the rule separated by `sep`, including none
+    return optional(joinBySep1(rule, sep))
+}
+
+function createArray(rule) {
+    // A `[`...`]` array whose comma-separated members match `rule`; each member
+    // is exposed through the `array_member` field
+    return seq(
+        "[",
+        joinBySep(
+            field('array_member', rule),
+            ","
+        ),
+        "]"
+    )
+}
+
+function wrapWith(rule, wrappers = ["'", '"']) {
+    // The rule must be wrapped with one of the wrappers, the same one on both
+    // sides
+    const wrappedRules = wrappers.map(wrapper => seq(wrapper, rule, wrapper))
+    return choice(...wrappedRules)
+}
+
+function optionalWrapWith(rule, wrappers = ["'", '"']) {
+    // The rule may or may not be wrapped with the wrappers
+    return choice(wrapWith(rule, wrappers), rule)
+}
diff --git a/query_language/sample_query b/query_language/sample_query new file mode 100644 --- /dev/null +++ b/query_language/sample_query @@ -0,0 +1,6 @@ +(origin = django/django and language in ["python"] or visits >= 5) or +(last_revision > 2020-01-01 and limit = 10) or +(last_visit > 2021-01-01 or last_visit < 2020-01-01) or +(visited = false and metadata = "gitlab") or +(keyword in ["orchestration", "kubectl"] and language in ["go", "rust"]) or +(visit_type = [deb] and license in ["GPL-3"]) diff --git a/query_language/test/corpus/combinations.txt b/query_language/test/corpus/combinations.txt new file mode 100644 --- /dev/null +++ b/query_language/test/corpus/combinations.txt @@ -0,0 +1,76 @@ +============================== +Empty query (should throw error) +============================== + +--- + +(ERROR) + + +================== +Origins with django as keyword, python language, and more than 5 visits +================== + +origin = django and language in ["python"] and visits >= 5 + +--- +(query (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent))))))) (and) (filters (filter (numericFilter (numericField) (numericOp (rangeOp)) (numberVal (number))))))) + +================== +10 origins with latest revision after 2020-01-01 +================== +last_revision > 2020-01-01 and limit = 10 +--- + +(query (filters 
(filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (and) (filters (filter (limitFilter (equalOp) (number)))))) + +================== +Origins with last visit date not in 2020-2021 +================== + +last_visit > 2021-01-01 or last_visit < 2020-01-01 +--- +(query (filters (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (or) (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))))) + +================== +Unvisited origins with kubernetes in metadata or minikube in url +================== + +visited = false and metadata = "kubernetes" or origin = "minikube" + +--- +(query (filters (filters (filters (filter (booleanFilter (booleanField) (booleanOp (equalOp)) (booleanVal (booleanFalse))))) (and) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent))))))) (or) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent)))))))) + +================== +Origins with "orchestration" or "kubectl" as keywords and language as "go" or "rust" +================== + +keyword in ["orchestration", "kubectl"] and language in ["go", "rust"] + +--- +(query (filters (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent)))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent)))))))) + +================== +Origins with a GPL-3 license that have "debian" in their url or have visit type as "deb" +================== +(origin = debian or visit_type = ["deb"]) and license in ["GPL-3"] +--- + +(query (filters (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (or) (filters (filter (boundedListFilter (visitTypeFilter (visitTypeField) (visitTypeOp (equalOp)) (visitTypeVal 
(visitTypeOptions)))))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)))))))) + +================== +Origins with 'and' and 'or' inside filter values +================== +(origin = "foo and bar or baz") +--- + +(query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent)))))))) + + +================== +Origins with `'` and `"` inside filter values +================== +(origin = "foo \\ \'bar\' \"baz\" ") +--- + +(query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence))))))))) diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,3 +5,4 @@ types-pytz types-pyyaml types-requests +types-setuptools diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ click elasticsearch>=7.0.0,<8.0.0 typing-extensions +tree_sitter
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -4,8 +4,12 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from distutils.cmd import Command
+from distutils.command.build import build
+from distutils.errors import DistutilsExecError
 from io import open
-from os import path
+from os import environ, makedirs, path, system
+from shutil import copy
 
 from setuptools import find_packages, setup
 
@@ -35,6 +39,93 @@
     return requirements
 
 
+yarn = environ.get("YARN", "yarn")
+
+
+def run_shell(command):
+    """Run ``command`` through the shell and abort the build if it fails.
+
+    ``os.system`` reports failures only through its return value; checking it
+    here keeps a failed yarn step from being mistaken for a successful one.
+    """
+    status = system(command)
+    if status != 0:
+        raise DistutilsExecError(f"command failed (status {status}): {command}")
+
+
+class TSCommand(Command):
+    """Base class of the tree-sitter commands, none of which take options."""
+
+    user_options = []
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+
+class TSInstallCommand(TSCommand):
+    description = "Installs node_modules related to query language"
+
+    def run(self):
+        run_shell(f"{yarn} install")
+
+
+class TSGenerateCommand(TSCommand):
+    description = "Generates parser related files from grammar.js"
+
+    def run(self):
+        run_shell(f"{yarn} generate")
+
+
+class TSBuildSoCommand(TSCommand):
+    description = "Builds swh_ql.so"
+
+    def run(self):
+        run_shell(f"{yarn} build-so && echo 'swh_ql.so file generated'")
+
+
+class TSBuildWasmCommand(TSCommand):
+    description = "Builds swh_ql.wasm"
+
+    def run(self):
+        run_shell(f"{yarn} build-wasm && echo 'swh_ql.wasm file generated'")
+
+
+class TSBuildCommand(TSCommand):
+    description = "Builds swh_ql.so and swh_ql.wasm"
+
+    def run(self):
+        self.run_command("ts_build_so")
+        self.run_command("ts_build_wasm")
+
+
+class TSBuildExportCommand(TSCommand):
+    description = "Builds swh_ql.so and swh_ql.wasm and exports them to static/"
+
+    def run(self):
+        self.run_command("ts_install")
+        self.run_command("ts_build")
+
+        print("static files generated. copying them to static/ dir")
+        # static/ is left over from a previous build, so it may already exist
+        makedirs("static", exist_ok=True)
+        copy("query_language/swh_ql.so", "static/swh_ql.so")
+        copy("query_language/swh_ql.wasm", "static/swh_ql.wasm")
+
+
+class custom_build(build):
+    """``build`` command that first generates the query language parser."""
+
+    def run(self):
+        # Skip the parser generation on a dry run
+        if not self.dry_run:
+            self.run_command("ts_build_export")
+
+        super().run()
+
+
 setup(
     name="swh.search",
     description="Software Heritage search service",
@@ -68,4 +159,14 @@
         "Source": "https://forge.softwareheritage.org/source/swh-search",
         "Documentation": "https://docs.softwareheritage.org/devel/swh-search/",
     },
+    cmdclass={
+        "build": custom_build,
+        "ts_install": TSInstallCommand,
+        "ts_generate": TSGenerateCommand,
+        "ts_build_so": TSBuildSoCommand,
+        "ts_build_wasm": TSBuildWasmCommand,
+        "ts_build": TSBuildCommand,
+        "ts_build_export": TSBuildExportCommand,
+    },
+    data_files=[("share/swh/search", ["static/swh_ql.so", "static/swh_ql.wasm"])],
 )
diff --git a/yarn.lock b/yarn.lock new file mode 100644 --- /dev/null +++ b/yarn.lock @@ -0,0 +1,13 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
+# yarn lockfile v1 + + +nan@^2.14.2: + version "2.14.2" + resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.2.tgz#f5376400695168f4cc694ac9393d0c9585eeea19" + integrity sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ== + +tree-sitter-cli@^0.20.0: + version "0.20.0" + resolved "https://registry.yarnpkg.com/tree-sitter-cli/-/tree-sitter-cli-0.20.0.tgz#feaaa11c7ecf44a6e236aa1e2963b85d045d33cc" + integrity sha512-4D1qapWbJXZ5rrSUGM5rcw5Vuq/smzn9KbiFRhlON6KeuuXjra+KAtDYVrDgAoLIG4ku+jbEEGrJxCptUGi3dg==