diff --git a/.gitignore b/.gitignore --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ .mypy_cache/ .hypothesis/ .vscode/ +node_modules/ +static/ +*.wasm +*.so diff --git a/CONTRIBUTORS b/CONTRIBUTORS --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -0,0 +1 @@ +Kumar Shivendu diff --git a/Makefile.local b/Makefile.local new file mode 100644 --- /dev/null +++ b/Makefile.local @@ -0,0 +1,38 @@ +# Targets for building and testing the tree-sitter query language parser. +YARN ?= yarn +PYTHON ?= python3 + +# These targets are commands, not files: always run them when requested. +.PHONY: ts-install ts-generate ts-dev ts-test ts-repl ts-build-so ts-build-wasm ts-build + +ts-install: package.json + $(PYTHON) setup.py ts_install + +ts-generate: ts-install query_language/grammar.js + $(PYTHON) setup.py ts_generate + +# With a non-empty `sanitize`, sed rewrites lines 5 onward of the `yarn dev` output: +# bracketed number pairs become a space, then dashes and the spaces around them go. +ts-dev: ts-install +ifdef sanitize + $(YARN) dev | sed '5,$$s/[[0-9]\+, [0-9]\+]/ /g' | sed '5,$$s/ *- *//g'; +else + $(YARN) dev; +endif + +ts-test: ts-install + $(YARN) test + +ts-repl: ts-generate + $(YARN) repl + +# query_language/src/ is deliberately not a prerequisite: no rule here makes it, and a +# directory as a normal prerequisite is checked independently of ts-generate under -j. +ts-build-so: ts-generate + $(PYTHON) setup.py ts_build_so + +ts-build-wasm: ts-generate + $(PYTHON) setup.py ts_build_wasm + +ts-build: ts-build-so ts-build-wasm + @echo 'Build completed' diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -11,20 +11,55 @@ Currently uses ElasticSearch, and provides only origin search (by URL and metadata) -# Dependencies +## Dependencies -Python tests for this module include tests that cannot be run without a local +- Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). -## Debian-like host + - Debian-like host -The elasticsearch package is required. As it's not part of debian-stable, -[another debian repository is required to be -configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) + The elasticsearch package is required. 
As it's not part of debian-stable, + [another debian repository is required to be + configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) -## Non Debian-like host + - Non Debian-like host + + The tests expect: + - `/usr/share/elasticsearch/jdk/bin/java` to exist. + - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. +- Emscripten is required to generate the tree-sitter WASM module. The following commands need to be executed for the setup: + ```bash + cd /opt && git clone https://github.com/emscripten-core/emsdk.git && cd emsdk && \ + ./emsdk install latest && ./emsdk activate latest + PATH="${PATH}:/opt/emsdk/upstream/emscripten" + ``` + + **Note:** If emsdk isn't found in the PATH, the tree-sitter CLI automatically pulls the `emscripten/emsdk` image from Docker Hub when `make ts-build-wasm` or `make ts-build` is used. + + +## Make targets + +Below is the list of available make targets that can be executed from the root directory of swh-search in order to build and/or execute swh-search under various configurations: + +* **ts-install**: Install the node_modules required for TreeSitter (the emscripten SDK is set up separately, see Dependencies above) + +* **ts-generate**: Generate parser files (C and JSON) from the grammar + +* **ts-repl**: Starts a web-based playground for the TreeSitter grammar. It's the recommended way to develop the TreeSitter grammar. + +* **ts-dev**: Parse the `query_language/sample_query` and print the corresponding syntax expression +along with the start and end positions of all the nodes. + +* **ts-dev sanitize=1**: Same as **ts-dev** but without the start and end positions of the nodes. +This format is expected by TreeSitter's native test command. `sanitize=1` cleans the output +of **ts-dev** using `sed` to achieve the desired format. 
+ +* **ts-test**: Executes TreeSitter's native tests + +* **ts-build-so**: Generates the `swh_ql.so` file from the previously generated parser using py-tree-sitter + +* **ts-build-wasm**: Generates the `swh_ql.wasm` file from the previously generated parser using emscripten + +* **ts-build**: Executes both **ts-build-so** and **ts-build-wasm** -The tests expect: -- `/usr/share/elasticsearch/jdk/bin/java` to exist. -- `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -19,3 +19,6 @@ [mypy-pytest.*] ignore_missing_imports = True + +[mypy-tree_sitter.*] +ignore_missing_imports = True diff --git a/package.json b/package.json new file mode 100644 --- /dev/null +++ b/package.json @@ -0,0 +1,36 @@ +{ + "name": "swh-search-query-language-parser", + "version": "1.0.0", + "description": "Parser for Software Heritage archive search query language", + "scripts": { + "generate": "cd query_language && tree-sitter generate --no-bindings && echo 'Generated parser files '", + "dev": "yarn generate && cd query_language && tree-sitter parse sample_query", + "test": "yarn generate && cd query_language && tree-sitter test", + "build-so": "yarn generate && cd query_language && python3 build.py", + "build-wasm": "yarn generate && cd query_language && tree-sitter build-wasm . 
&& mv tree-sitter-swh_search_ql.wasm swh_ql.wasm", + "build": "yarn build-so && yarn build-wasm", + "repl": "yarn generate && cd query_language && tree-sitter build-wasm && tree-sitter playground" + }, + "repository": { + "type": "git", + "url": "https://forge.softwareheritage.org/source/swh-search.git" + }, + "keywords": [ + "swh", + "Software Heritage", + "treesitter", + "parser", + "custom", + "search", + "query", + "language" + ], + "author": "The Software Heritage developers", + "license": "GPL-3.0-only", + "dependencies": { + "nan": "^2.14.2" + }, + "devDependencies": { + "tree-sitter-cli": "^0.20.0" + } +} diff --git a/pyproject.toml b/pyproject.toml --- a/pyproject.toml +++ b/pyproject.toml @@ -9,3 +9,6 @@ ensure_newline_before_comments = true line_length = 88 force_sort_within_sections = true + +[build-system] +requires = ["setuptools", "wheel", "tree_sitter"] diff --git a/query_language/.gitignore b/query_language/.gitignore new file mode 100644 --- /dev/null +++ b/query_language/.gitignore @@ -0,0 +1,7 @@ +src +build +bindings +binding.gyp +Cargo.toml +package.json +log.html diff --git a/query_language/build.py b/query_language/build.py new file mode 100644 --- /dev/null +++ b/query_language/build.py @@ -0,0 +1,3 @@ +from tree_sitter import Language + +Language.build_library("swh_ql.so", ["."]) diff --git a/query_language/grammar.js b/query_language/grammar.js new file mode 100644 --- /dev/null +++ b/query_language/grammar.js @@ -0,0 +1,170 @@ +// Copyright (C) 2019-2021 The Software Heritage developers +// See the AUTHORS file at the top-level directory of this distribution +// License: GNU General Public License version 3, or any later version +// See top-level LICENSE file for more information + + +const PRECEDENCE = { // levels passed to prec.left() in the filters rule + or: 2, + and: 3, // higher than or, so "a or b and c" groups as "a or (b and c)" + bracket: 4, +} + +module.exports = grammar({ + name: 'swh_search_ql', + + rules: { + query: $ => $.filters, + + filters: $ => choice( + prec.left(PRECEDENCE.and, + seq( + field('left', $.filters), 
field('operator', $.and), + field('right', $.filters), + ) + ), + prec.left(PRECEDENCE.or, + seq( + field('left', $.filters), + field('operator', $.or), + field('right', $.filters), + ) + ), + prec.left(PRECEDENCE.bracket, // parenthesised sub-expression + seq("(", $.filters, ")"), + ), + $.filter + ), + + filter: $ => choice( + $.patternFilter, + $.booleanFilter, + $.numericFilter, + $.boundedListFilter, + $.unboundedListFilter, + $.dateFilter, + $.limitFilter + ), + + patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal), + patternField: $ => token(choice('origin', 'metadata')), + patternOp: $ => $.equalOp, + patternVal: $ => $.string, + + booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal), + booleanField: $ => token(choice('visited')), + booleanOp: $ => $.equalOp, + booleanVal: $ => choice($.booleanTrue, $.booleanFalse), + + numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal), + numericField: $ => token(choice('visits')), + numericOp: $ => $.rangeOp, + numberVal: $ => $.number, + + boundedListFilter: $ => choice($.visitTypeFilter, $.sortByFilter), // list values restricted to a fixed set of options + + visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal), + visitTypeField: $ => token(choice('visit_type')), + visitTypeOp: $ => $.equalOp, + visitTypeVal: $ => createArray($.visitTypeOptions), + visitTypeOptions: $ => choice( + "any", + "cran", + "deb", + "deposit", + "ftp", + "hg", + "git", + "nixguix", + "npm", + "pypi", + "svn", + "tar" + ), // TODO: fetch this list dynamically from other swh services? 
+ + sortByFilter: $ => seq($.sortByField, $.sortByOp, $.sortByVal), + sortByField: $ => token(choice('sort_by')), + sortByOp: $ => $.equalOp, + sortByVal: $ => createArray($.sortByOptions), + sortByOptions: $ => choice( + 'visits', + 'last_visit', + 'last_eventful_visit', + 'last_revision', + 'last_release', + 'created', + 'modified', + 'published' + ), + + unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal), // list members are free-form words + listField: $ => token(choice('language', 'license', 'keyword')), + listOp: $ => $.choiceOp, + listVal: $ => createArray($.words), + + + dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal), + dateField: $ => token(choice( + 'last_visit', + 'last_eventful_visit', + 'last_revision', + 'last_release', + 'created', + 'modified', + 'published' + )), + dateOp: $ => $.rangeOp, + dateVal: $ => $.isoDateTime, + + limitFilter: $ => seq('limit', $.equalOp, $.number), + + + rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')), + equalOp: $ => token('='), + choiceOp: $ => token(choice('in', 'not in')), + + isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/, // YYYY-MM-DD, then optionally any run of whitespace/"T", HH:MM[:SS] and "Z" + + string: $ => choice(wrapWith($.words, ["'", '"']), $.word), + number: $ => /\d+/, + booleanTrue: $ => "true", + booleanFalse: $ => "false", + + or: $ => "or", + and: $ => "and", + + words: $ => joinBySep1($.word, /\w/), // NOTE(review): the separator /\w/ matches one word character, not whitespace; confirm this is intended + word: $ => /[^\s"']+/, + + } +}); + + +function joinBySep1(rule, sep) { // rule followed by any number of (sep, optional rule) pairs + return seq(rule, repeat(seq(sep, optional(rule)))) // also accepts extra separators: "rule, rule, rule," or "rule,, rule" +} + +function joinBySep(rule, sep = ",") { // like joinBySep1, but the whole sequence may be absent + return optional(joinBySep1(rule, sep)) +} + +function createArray(rule) { // "[" ... "]" around comma-separated members, each optionally quoted and tagged as field array_member + return seq( + "[", + joinBySep( + field('array_member', (optionalWrapWith(rule, ["'", '"']))), + "," + ), + "]" + ) +} + +function wrapWith(rule, wrappers = ["'", '"']) { // rule enclosed in the same wrapper string on both sides, one alternative per wrapper + const wrappedRules = wrappers.map(wrapper => seq(wrapper, rule, wrapper)) + return choice(...wrappedRules) +} + +function optionalWrapWith(rule, wrappers = ["'", '"']) { // rule either wrapped (see wrapWith) or bare + return choice(wrapWith(rule, 
wrappers), rule) +} \ No newline at end of file diff --git a/query_language/sample_query b/query_language/sample_query new file mode 100644 --- /dev/null +++ b/query_language/sample_query @@ -0,0 +1,6 @@ +(origin = django/django and language in ["python"] or visits >= 5) or +(last_revision > 2020-01-01 and limit = 10) or +(last_visit > 2021-01-01 or last_visit < 2020-01-01) or +(visited = false and metadata = "gitlab") or +(keyword in ["orchestration", "kubectl"] and language in ["go", "rust"]) or +(visit_type = [deb] and license in ["GPL-3"]) diff --git a/query_language/test/corpus/combinations.txt b/query_language/test/corpus/combinations.txt new file mode 100644 --- /dev/null +++ b/query_language/test/corpus/combinations.txt @@ -0,0 +1,233 @@ +================== +Empty query +================== + + +--- + +(query) + + +================== +Huge query +================== +(origin = django/django and language in ["python"] or visits >= 5) or +(last_revision > 2020-01-01 and limit = 10) or +(last_visit > 2021-01-01 or last_visit < 2020-01-01) or +(visited = false and metadata = "gitlab") or +(keyword in ["orchestration", "kubectl"] and language in ["go", "rust"]) or +(visit_type = [deb] and license in ["GPL-3"]) + +--- + +(query) + + +================== +Combinations +================== + +( + origin = "github.com/django/Django" or metadata = "ceph distributed" + and + (metadata = "ceph distributed" or origin = "something") +) +and +(origin = "dj" or metadata = "eqweqw" and visits > 5) and +visited = true and visited = false +or visits >= 0 or visits = 10 or visits != 256 or visits < 1000 and +sort_by = ["visits", "last_revision", last_release,] and +last_release < 2001-02-13 15:54:21 and +license in ["MIT", "BSD X", "Apache XZY ABC", ] + +--- + +(query + (filters + left: (filters + left: (filters + left: (filters + left: (filters + left: (filters + left: (filters + left: (filters + (filters + left: (filters + (filter + (patternFilter + (patternField) + (patternOp + 
(equalOp)) + (patternVal + (string + (words + (word))))))) + operator: (or) + right: (filters + left: (filters + (filter + (patternFilter + (patternField) + (patternOp + (equalOp)) + (patternVal + (string + (words + (word) + (word))))))) + operator: (and) + right: (filters + (filters + left: (filters + (filter + (patternFilter + (patternField) + (patternOp + (equalOp)) + (patternVal + (string + (words + (word) + (word))))))) + operator: (or) + right: (filters + (filter + (patternFilter + (patternField) + (patternOp + (equalOp)) + (patternVal + (string + (words + (word)))))))))))) + operator: (and) + right: (filters + (filters + left: (filters + (filter + (patternFilter + (patternField) + (patternOp + (equalOp)) + (patternVal + (string + (words + (word))))))) + operator: (or) + right: (filters + left: (filters + (filter + (patternFilter + (patternField) + (patternOp + (equalOp)) + (patternVal + (string + (words + (word))))))) + operator: (and) + right: (filters + (filter + (numericFilter + (numericField) + (numericOp + (rangeOp)) + (numberVal + (number))))))))) + operator: (and) + right: (filters + (filter + (booleanFilter + (booleanField) + (booleanOp + (equalOp)) + (booleanVal + (booleanTrue)))))) + operator: (and) + right: (filters + (filter + (booleanFilter + (booleanField) + (booleanOp + (equalOp)) + (booleanVal + (booleanFalse)))))) + operator: (or) + right: (filters + (filter + (numericFilter + (numericField) + (numericOp + (rangeOp)) + (numberVal + (number)))))) + operator: (or) + right: (filters + (filter + (numericFilter + (numericField) + (numericOp + (rangeOp)) + (numberVal + (number)))))) + operator: (or) + right: (filters + (filter + (numericFilter + (numericField) + (numericOp + (rangeOp)) + (numberVal + (number)))))) + operator: (or) + right: (filters + left: (filters + left: (filters + left: (filters + (filter + (numericFilter + (numericField) + (numericOp + (rangeOp)) + (numberVal + (number))))) + operator: (and) + right: (filters + (filter + 
(boundedListFilter + (sortByFilter + (sortByField) + (sortByOp + (equalOp)) + (sortByVal + array_member: (sortByOptions) + array_member: (sortByOptions) + array_member: (sortByOptions))))))) + operator: (and) + right: (filters + (filter + (dateFilter + (dateField) + (dateOp + (rangeOp)) + (dateVal + (isoDateTime)))))) + operator: (and) + right: (filters + (filter + (unboundedListFilter + (listField) + (listOp + (choiceOp)) + (listVal + array_member: (words + (word)) + array_member: (words + (word) + (word)) + array_member: (words + (word) + (word) + (word))))))))) + + diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,3 +5,4 @@ types-pytz types-pyyaml types-requests +types-setuptools diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ click elasticsearch>=7.0.0,<8.0.0 typing-extensions +tree_sitter diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -4,8 +4,10 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from distutils.cmd import Command +from distutils.command.build import build from io import open -from os import path +from os import environ, path from setuptools import find_packages, setup @@ -35,6 +37,81 @@ return requirements +yarn = environ.get("YARN", "yarn") + + +class TSCommand(Command): + """Base class for the option-less tree-sitter helper commands.""" + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run_shell(self, cmdline): + """Run cmdline via sh and abort the build if it fails.""" + self.spawn(["sh", "-c", cmdline]) + + +class TSInstallCommand(TSCommand): + description = "Installs node_modules related to query language" + + def run(self): + self.run_shell(f"{yarn} install") + + +class TSGenerateCommand(TSCommand): + description = "Generates parser related files from grammar.js" + + def run(self): + self.run_shell(f"{yarn} generate") + + +class TSBuildSoCommand(TSCommand): + description = "Builds swh_ql.so" + + def run(self): + self.run_shell(f"{yarn} build-so && echo 'swh_ql.so file 
generated'") + + +class TSBuildWasmCommand(TSCommand): + description = "Builds swh_ql.wasm" + + def run(self): + self.run_shell(f"{yarn} build-wasm && echo 'swh_ql.wasm file generated'") + + +class TSBuildCommand(TSCommand): + description = "Builds swh_ql.so and swh_ql.wasm" + + def run(self): + self.run_command("ts_build_so") + self.run_command("ts_build_wasm") + + +class TSBuildExportCommand(TSCommand): + description = "Builds swh_ql.so and swh_ql.wasm and exports them to static/" + + def run(self): + self.run_command("ts_install") + self.run_command("ts_build") + + self.run_shell("echo 'static files generated. copying them to static/ dir'") + self.run_shell("mkdir -p static")  # -p: static/ may exist from an earlier build + self.run_shell("cp query_language/swh_ql.so static/swh_ql.so") + self.run_shell("cp query_language/swh_ql.wasm static/swh_ql.wasm") + + +class custom_build(build): + def run(self): + if not self.dry_run: + self.run_command("ts_build_export") + + build.run(self) + + setup( name="swh.search", description="Software Heritage search service", @@ -68,4 +145,14 @@ "Source": "https://forge.softwareheritage.org/source/swh-search", "Documentation": "https://docs.softwareheritage.org/devel/swh-search/", }, + cmdclass={ + "build": custom_build, + "ts_install": TSInstallCommand, + "ts_generate": TSGenerateCommand, + "ts_build_so": TSBuildSoCommand, + "ts_build_wasm": TSBuildWasmCommand, + "ts_build": TSBuildCommand, + "ts_build_export": TSBuildExportCommand, + }, + data_files=[("share/swh/search", ["static/swh_ql.so", "static/swh_ql.wasm"])], ) diff --git a/yarn.lock b/yarn.lock new file mode 100644 --- /dev/null +++ b/yarn.lock @@ -0,0 +1,13 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
+# yarn lockfile v1 + + +nan@^2.14.2: + version "2.14.2" + resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.2.tgz#f5376400695168f4cc694ac9393d0c9585eeea19" + integrity sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ== + +tree-sitter-cli@^0.20.0: + version "0.20.0" + resolved "https://registry.yarnpkg.com/tree-sitter-cli/-/tree-sitter-cli-0.20.0.tgz#feaaa11c7ecf44a6e236aa1e2963b85d045d33cc" + integrity sha512-4D1qapWbJXZ5rrSUGM5rcw5Vuq/smzn9KbiFRhlON6KeuuXjra+KAtDYVrDgAoLIG4ku+jbEEGrJxCptUGi3dg==