diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,7 @@
 .mypy_cache/
 .hypothesis/
 .vscode/
+node_modules/
+static/
+*.wasm
+*.so
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -0,0 +1 @@
+Kumar Shivendu
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1,48 @@
+YARN ?= yarn
+PYTHON ?= python3
+# Checkout location of the emscripten SDK; override with `make EMSDK_DIR=...`.
+EMSDK_DIR ?= /opt/emsdk
+
+# Each recipe line runs in its own shell, so a PATH assignment inside a recipe
+# is lost as soon as that line finishes. Extend PATH at make level and export
+# it so the yarn scripts can find the emscripten tools.
+export PATH := $(PATH):$(EMSDK_DIR)/upstream/emscripten
+
+# None of these targets creates a file named after itself.
+.PHONY: ts-install ts-generate ts-dev ts-test ts-build-so ts-build-wasm ts-build ts-repl
+
+# Install node_modules and the emscripten SDK required for generating the
+# tree-sitter WASM module in swh-search. The clone is skipped when the checkout
+# already exists, so this target can be re-run.
+ts-install: package.json
+	$(YARN) install
+	test -d "$(EMSDK_DIR)" || git clone https://github.com/emscripten-core/emsdk.git "$(EMSDK_DIR)"
+	cd "$(EMSDK_DIR)" && ./emsdk install latest && ./emsdk activate latest
+
+ts-generate: ts-install query_language/grammar.js
+	$(YARN) generate
+
+# With sanitize=1 the node positions are stripped from the output using sed.
+ts-dev: ts-install
+ifdef sanitize
+	$(YARN) dev | sed '5,$$s/[[0-9]\+, [0-9]\+]/ /g' | sed '5,$$s/ *- *//g';
+else
+	$(YARN) dev;
+endif
+
+ts-test: ts-install
+	$(YARN) test
+
+# query_language/src/ is not listed as a prerequisite: no rule builds it, and a
+# directory's mtime changes whenever its contents do.
+ts-build-so: ts-generate
+	$(YARN) build-so
+
+ts-build-wasm: ts-generate
+	$(YARN) build-wasm
+
+ts-build: ts-build-so ts-build-wasm
+	@echo 'Build completed'
+
+ts-repl: ts-generate
+	$(YARN) repl
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -11,20 +11,45 @@
 Currently uses ElasticSearch, and provides only origin search (by URL and
 metadata)
 
-# Dependencies
+## Dependencies
 
 Python tests for this module include tests that cannot be run without a local
 ElasticSearch instance, so you need the ElasticSearch server executable on your
 machine (no need to have a running ElasticSearch server).
 
-## Debian-like host
+### Debian-like host
 
 The elasticsearch package is required.
As it's not part of debian-stable, [another debian repository is required to be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) -## Non Debian-like host +### Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. + +## Make targets + +Below is the list of available make targets that can be executed from the root directory of swh-search in order to build and/or execute swh-search under various configurations: + +* **ts-install**: Install node_modules and the emscripten SDK required for TreeSitter + +* **ts-generate**: Generate parser files (C and JSON) from the grammar + +* **ts-dev**: Parse the `query_language/sample_query` and print the corresponding syntax expression +along with the start and end positions of all the nodes. + +* **ts-dev sanitize=1**: Same as **ts-dev** but without the start and end positions of the nodes. +This format is expected by TreeSitter's native test command. `sanitize=1` cleans the output +of **ts-dev** using `sed` to achieve the desired format.
+ +* **ts-test**: Executes TreeSitter's native tests + +* **ts-build-so**: Generates `swh_ql.so` file from the previously generated parser using py-tree-sitter + +* **ts-build-wasm**: Generates `swh_ql.wasm` file from the previously generated parser using emscripten + +* **ts-build**: Executes both **ts-build-so** and **ts-build-wasm** + +* **ts-repl**: Starts a web-based playground for the TreeSitter grammar diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -19,3 +19,6 @@ [mypy-pytest.*] ignore_missing_imports = True + +[mypy-tree_sitter.*] +ignore_missing_imports = True diff --git a/package.json b/package.json new file mode 100644 --- /dev/null +++ b/package.json @@ -0,0 +1,36 @@ +{ + "name": "swh-search-query-language-parser", + "version": "1.0.0", + "description": "Parser for Software Heritage archive search query language", + "scripts": { + "generate": "cd query_language && tree-sitter generate --no-bindings && echo 'Generated parser files '", + "dev": "yarn generate && cd query_language && tree-sitter parse sample_query", + "test": "yarn generate && cd query_language && tree-sitter test", + "build-so": "yarn generate && cd query_language && python3 build.py", + "build-wasm": "yarn generate && cd query_language && tree-sitter build-wasm . 
&& mv tree-sitter-swh_search_ql.wasm swh_ql.wasm",
+    "build": "yarn build-so && yarn build-wasm",
+    "repl": "yarn generate && cd query_language && tree-sitter build-wasm && tree-sitter playground"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://forge.softwareheritage.org/source/swh-search.git"
+  },
+  "keywords": [
+    "swh",
+    "Software Heritage",
+    "treesitter",
+    "parser",
+    "custom",
+    "search",
+    "query",
+    "language"
+  ],
+  "author": "The Software Heritage developers",
+  "license": "GPL-3.0-only",
+  "dependencies": {
+    "nan": "^2.14.2"
+  },
+  "devDependencies": {
+    "tree-sitter-cli": "^0.20.0"
+  }
+}
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,3 +9,6 @@
 ensure_newline_before_comments = true
 line_length = 88
 force_sort_within_sections = true
+
+[build-system]
+requires = ["setuptools", "wheel", "tree_sitter"]
diff --git a/query_language/.gitignore b/query_language/.gitignore
new file mode 100644
--- /dev/null
+++ b/query_language/.gitignore
@@ -0,0 +1,7 @@
+src
+build
+bindings
+binding.gyp
+Cargo.toml
+package.json
+log.html
diff --git a/query_language/build.py b/query_language/build.py
new file mode 100644
--- /dev/null
+++ b/query_language/build.py
@@ -0,0 +1,3 @@
+from tree_sitter import Language
+
+Language.build_library("swh_ql.so", ["."])
diff --git a/query_language/grammar.js b/query_language/grammar.js
new file mode 100644
--- /dev/null
+++ b/query_language/grammar.js
@@ -0,0 +1,140 @@
+// Copyright (C) 2019-2021 The Software Heritage developers
+// See the AUTHORS file at the top-level directory of this distribution
+// License: GNU General Public License version 3, or any later version
+// See top-level LICENSE file for more information
+
+// Tree-sitter grammar of the swh-search query language. A query is a
+// (possibly empty) sequence of `field operator value` filters.
+module.exports = grammar({
+  name: 'swh_search_ql',
+
+  rules: {
+    query: $ => repeat(
+      choice(
+        $.patternFilter,
+        $.booleanFilter,
+        $.numericFilter,
+        $.unboundedListFilter,
+        $.boundedListFilter,
+        $.dateFilter,
+        $.limitFilter
+      )
+    ),
+
+    patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal),
+    patternField: $ => token(choice('url', 'metadata')),
+    patternOp: $ => $.colonOp,
+    patternVal: $ => $.string,
+
+    booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal),
+    booleanField: $ => token(choice('with_visit')),
+    booleanOp: $ => $.colonOp,
+    booleanVal: $ => choice($.booleanTrue, $.booleanFalse),
+
+    numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal),
+    numericField: $ => token(choice('nb_visits')),
+    numericOp: $ => $.rangeOp,
+    numberVal: $ => $.number,
+
+    boundedListFilter: $ => choice($.visitTypeFilter, $.sortByFilter),
+
+    visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal),
+    visitTypeField: $ => token(choice('visit_types')),
+    visitTypeOp: $ => $.colonOp,
+    visitTypeVal: $ => createArray($.visitTypeOptions),
+    visitTypeOptions: $ => choice(
+      "any",
+      "cran",
+      "deb",
+      "deposit",
+      "ftp",
+      "hg",
+      "git",
+      "nixguix",
+      "npm",
+      "pypi",
+      "svn",
+      "tar"
+    ), // TODO: fetch this list dynamically from other swh services?
+
+    sortByFilter: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
+    sortByField: $ => token(choice('sort_by')),
+    sortByOp: $ => $.colonOp,
+    sortByVal: $ => createArray($.sortByOptions),
+    sortByOptions: $ => choice(
+      'nb_visits',
+      'last_visit_date',
+      'last_eventful_visit_date',
+      'last_revision_date',
+      'last_release_date',
+      'date_created',
+      'date_modified',
+      'date_published'
+    ),
+
+    unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
+    listField: $ => token(choice('programming_languages', 'licenses', 'keywords')),
+    listOp: $ => $.choiceOp,
+    listVal: $ => createArray($.words), // Needs to be fixed !!
+    // currently doesn't accept: licenses in ["MIT", BSD,] because of BSD (no inverted comma)
+
+
+    dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal),
+    dateField: $ => token(choice(
+      'last_visit_date',
+      'last_eventful_visit_date',
+      'last_revision_date',
+      'last_release_date',
+      'date_created',
+      'date_modified',
+      'date_published'
+    )),
+    dateOp: $ => $.rangeOp,
+    dateVal: $ => $.dateWithOptionalTime,
+
+    limitFilter: $ => seq('limit', $.colonOp, $.number),
+
+
+    rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')),
+    colonOp: $ => token(':'),
+    choiceOp: $ => token(choice('in', 'not in')),
+
+    dateWithOptionalTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?/,
+
+    string: $ => choice(wrapWithInvertedComma($.words), $.word),
+    number: $ => /\d+/,
+    booleanTrue: $ => "true",
+    booleanFalse: $ => "false",
+
+    words: $ => repeat1(seq($.word)),
+    word: $ => /[^\s"']+/,
+
+  }
+});
+
+// Non-empty comma-separated list of `rule`, optionally followed by a single
+// trailing comma. Empty members (as in `a,,b`) are rejected.
+function commaSep1(rule) {
+  return seq(rule, repeat(seq(",", rule)), optional(","))
+}
+
+// Possibly empty comma-separated list of `rule`.
+function commaSep(rule) {
+  return optional(commaSep1(rule))
+}
+
+// Bracketed list of `rule`; each member is exposed under the `array_member`
+// field and may be written with or without surrounding quotes.
+function createArray(rule) {
+  return seq('[', commaSep(
+    field('array_member', (choice(wrapWithInvertedComma(rule), rule)))
+  ), ']')
+}
+
+// `rule` enclosed in a matching pair of single or double quotes.
+function wrapWithInvertedComma(rule) {
+  return choice(
+    seq("'", rule, "'"),
+    seq('"', rule, '"')
+  )
+}
diff --git a/query_language/sample_query b/query_language/sample_query
new file mode 100644
--- /dev/null
+++ b/query_language/sample_query
@@ -0,0 +1,6 @@
+url : "github.com/django/Django" metadata : "Repo description"
+with_visit : true with_visit : false
+nb_visits >= 0 nb_visits = 10 nb_visits != 256 nb_visits < 1000
+sort_by : ["nb_visits", "last_revision_date", last_release_date,]
+last_release_date < 2001-02-13 15:54:21
+licenses in ["MIT", "BSD X", "Apache XZY ABC", ]
\ No newline at end of file
diff --git a/query_language/test/corpus/combinations.txt b/query_language/test/corpus/combinations.txt
new file mode 100644
--- /dev/null
+++
b/query_language/test/corpus/combinations.txt @@ -0,0 +1,106 @@ +================== +Empty query +================== + + +--- + +(query) + + +================== +Combinations +================== + +url : "github.com/django/Django" metadata : "Repo description" +with_visit : true with_visit : false +nb_visits >= 0 nb_visits = 10 nb_visits != 256 nb_visits < 1000 +sort_by : ["nb_visits", "last_revision_date", last_release_date,] +last_release_date < 2001-02-13 15:54:21 +licenses in ["MIT", "BSD X", "Apache XZY ABC", ] + +--- + +(query + (patternFilter + (patternField) + (patternOp + (colonOp)) + (patternVal + (string + (words + (word))))) + (patternFilter + (patternField) + (patternOp + (colonOp)) + (patternVal + (string + (words + (word) + (word))))) + (booleanFilter + (booleanField) + (booleanOp + (colonOp)) + (booleanVal + (booleanTrue))) + (booleanFilter + (booleanField) + (booleanOp + (colonOp)) + (booleanVal + (booleanFalse))) + (numericFilter + (numericField) + (numericOp + (rangeOp)) + (numberVal + (number))) + (numericFilter + (numericField) + (numericOp + (rangeOp)) + (numberVal + (number))) + (numericFilter + (numericField) + (numericOp + (rangeOp)) + (numberVal + (number))) + (numericFilter + (numericField) + (numericOp + (rangeOp)) + (numberVal + (number))) + (boundedListFilter + (sortByFilter + (sortByField) + (sortByOp + (colonOp)) + (sortByVal + array_member: (sortByOptions) + array_member: (sortByOptions) + array_member: (sortByOptions)))) + (dateFilter + (dateField) + (dateOp + (rangeOp)) + (dateVal + (dateWithOptionalTime))) + (unboundedListFilter + (listField) + (listOp + (choiceOp)) + (listVal + array_member: (words + (word)) + array_member: (words + (word) + (word)) + array_member: (words + (word) + (word) + (word))))) diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,3 +5,4 @@ types-pytz types-pyyaml types-requests +types-setuptools diff --git a/requirements.txt 
b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@
 click
 elasticsearch>=7.0.0,<8.0.0
 typing-extensions
+tree_sitter
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -4,8 +4,12 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from io import open
-from os import path
+from os import environ, makedirs, path
+import shlex
+import shutil
+import subprocess
 
 from setuptools import find_packages, setup
+from setuptools.command.build_py import build_py
 
@@ -35,6 +39,35 @@
     return requirements
 
 
+class custom_build(build_py):
+    """``build_py`` command that first builds the query language parser.
+
+    Unless this is a dry run, the ``build-so`` and ``build-wasm`` yarn scripts
+    are run and ``swh_ql.so`` / ``swh_ql.wasm`` are copied from
+    ``query_language/`` to ``static/``, where ``data_files`` picks them up.
+    Any failing step aborts the build with ``CalledProcessError``.
+    """
+
+    def run(self):
+        if not self.dry_run:
+            # YARN may carry extra arguments, so split it the way a shell would
+            yarn = shlex.split(environ.get("YARN", "yarn"))
+            subprocess.check_call([*yarn, "install"])
+
+            subprocess.check_call([*yarn, "build-so"])
+            print("swh_ql.so file generated")
+            subprocess.check_call([*yarn, "build-wasm"])
+            print("swh_ql.wasm file generated")
+            print("static files generated. moving them to static/ dir")
+
+            # static/ already exists when the build is re-run
+            makedirs("static", exist_ok=True)
+            shutil.copy("query_language/swh_ql.so", "static/swh_ql.so")
+            shutil.copy("query_language/swh_ql.wasm", "static/swh_ql.wasm")
+
+        build_py.run(self)
+
+
 setup(
     name="swh.search",
     description="Software Heritage search service",
@@ -68,4 +101,6 @@
         "Source": "https://forge.softwareheritage.org/source/swh-search",
         "Documentation": "https://docs.softwareheritage.org/devel/swh-search/",
     },
+    cmdclass={"build_py": custom_build},
+    data_files=[("share/swh/search", ["static/swh_ql.so", "static/swh_ql.wasm"])],
 )
diff --git a/yarn.lock b/yarn.lock
new file mode 100644
--- /dev/null
+++ b/yarn.lock
@@ -0,0 +1,13 @@
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1 + + +nan@^2.14.2: + version "2.14.2" + resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.2.tgz#f5376400695168f4cc694ac9393d0c9585eeea19" + integrity sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ== + +tree-sitter-cli@^0.20.0: + version "0.20.0" + resolved "https://registry.yarnpkg.com/tree-sitter-cli/-/tree-sitter-cli-0.20.0.tgz#feaaa11c7ecf44a6e236aa1e2963b85d045d33cc" + integrity sha512-4D1qapWbJXZ5rrSUGM5rcw5Vuq/smzn9KbiFRhlON6KeuuXjra+KAtDYVrDgAoLIG4ku+jbEEGrJxCptUGi3dg==