diff --git a/.gitignore b/.gitignore
index d8e3b91..0c1c008 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,14 +1,18 @@
 *.pyc
 *.sw?
 *~
 .coverage
 .eggs/
 __pycache__
 *.egg-info/
 build/
 dist/
 version.txt
 .tox
 .mypy_cache/
 .hypothesis/
 .vscode/
+node_modules/
+static/
+*.wasm
+*.so
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index e69de29..b97d981 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -0,0 +1 @@
+Kumar Shivendu
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
index 0000000..1d1fd9d
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1,49 @@
+# Helper targets for the tree-sitter based query language parser.
+# YARN and PYTHON can be overridden: `make ts-test YARN=/path/to/yarn`.
+YARN ?= yarn
+PYTHON ?= python3
+
+# These targets are commands, not files: declare them phony so that a file
+# with the same name can never make them look up to date.
+.PHONY: ts-install ts-generate ts-dev ts-test ts-repl
+.PHONY: ts-build-so ts-build-wasm ts-build
+
+# Run the `ts_install` setup.py command.
+ts-install: package.json
+	$(PYTHON) setup.py ts_install
+
+# Run the `ts_generate` setup.py command (parser generation from grammar.js).
+ts-generate: ts-install query_language/grammar.js
+	$(PYTHON) setup.py ts_generate
+
+# Run `yarn dev`. With `make ts-dev sanitize=1`, two sed passes rewrite the
+# output from its 5th line on: `[N, M]` number pairs become a space and
+# dash separators are removed.
+ts-dev: ts-install
+ifdef sanitize
+	$(YARN) dev | sed '5,$$s/[[0-9]\+, [0-9]\+]/ /g' | sed '5,$$s/ *- *//g'
+else
+	$(YARN) dev
+endif
+
+# Run `yarn test`.
+ts-test: ts-install
+	$(YARN) test
+
+# Run `yarn repl` once the parser has been generated.
+ts-repl: ts-generate
+	$(YARN) repl
+
+# query_language/src/ is expected to be produced by ts-generate, so it is not
+# listed as a prerequisite: a directory prerequisite without a rule of its own
+# breaks `make -j` on a fresh checkout.
+ts-build-so: ts-generate
+	$(PYTHON) setup.py ts_build_so
+
+# Same as ts-build-so, for the WASM module.
+ts-build-wasm: ts-generate
+	$(PYTHON) setup.py ts_build_wasm
+
+# Build both the shared object and the WASM module.
+ts-build: ts-build-so ts-build-wasm
+	@echo 'Build completed'
diff --git a/README.md b/README.md
index 4292fe2..71c17b6 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,65 @@
 swh-search
 ==========
 
 Search service for the Software Heritage archive. It is similar to swh-storage in what it contains, but provides different ways to query it: while swh-storage is mostly a key-value store that returns an object from a primary key, swh-search is focused on reverse indices, to allow finding objects that match some criteria; for example full-text search.
Currently uses ElasticSearch, and provides only origin search (by URL and metadata) -# Dependencies +## Dependencies -Python tests for this module include tests that cannot be run without a local +- Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). -## Debian-like host + - Debian-like host -The elasticsearch package is required. As it's not part of debian-stable, -[another debian repository is required to be -configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) + The elasticsearch package is required. As it's not part of debian-stable, + [another debian repository is required to be + configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) -## Non Debian-like host + - Non Debian-like host + + The tests expect: + - `/usr/share/elasticsearch/jdk/bin/java` to exist. + - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. +- Emscripten is required for generating tree-sitter WASM module. The following commands need to be executed for the setup: + ```bash + cd /opt && git clone https://github.com/emscripten-core/emsdk.git && cd emsdk && \ + ./emsdk install latest && ./emsdk activate latest + PATH="${PATH}:/opt/emsdk/upstream/emscripten" + ``` + + **Note:** If emsdk isn't found in the PATH, the tree-sitter cli automatically pulls `emscripten/emsdk` image from docker hub when `make ts-build-wasm` or `make ts-build` is used. 
+ + +## Make targets + +Below is the list of available make targets that can be executed from the root directory of swh-search in order to build and/or execute the swh-search under various configurations: + +* **ts-install**: Install node_modules and emscripten SDK required for TreeSitter + +* **ts-generate**: Generate parser files (C and JSON) from the grammar + +* **ts-repl**: Starts a web-based playground for the TreeSitter grammar. It's the recommended way for developing the TreeSitter grammar. + +* **ts-dev**: Parse the `query_language/sample_query` and print the corresponding syntax expression +along with the start and end positions of all the nodes. + +* **ts-dev sanitize=1**: Same as **ts-dev** but without the start and end positions of the nodes. +This format is expected by TreeSitter's native test command. `sanitize=1` cleans the output +of **ts-dev** using `sed` to achieve the desired format. + +* **ts-test**: Executes TreeSitter's native tests + +* **ts-build-so**: Generates `swh_ql.so` file from the previously generated parser using py-tree-sitter + +* **ts-build-wasm**: Generates `swh_ql.wasm` file from the previously generated parser using emscripten + +* **ts-build**: Executes both **ts-build-so** and **ts-build-wasm** -The tests expect: -- `/usr/share/elasticsearch/jdk/bin/java` to exist. -- `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. 
diff --git a/mypy.ini b/mypy.ini index 5c756c5..02b0e9f 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,21 +1,24 @@ [mypy] namespace_packages = True warn_unused_ignores = True # 3rd party libraries without stubs (yet) [mypy-confluent_kafka.*] ignore_missing_imports = True [mypy-elasticsearch.*] ignore_missing_imports = True [mypy-msgpack.*] ignore_missing_imports = True [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True + +[mypy-tree_sitter.*] +ignore_missing_imports = True diff --git a/package.json b/package.json new file mode 100644 index 0000000..71c6ebb --- /dev/null +++ b/package.json @@ -0,0 +1,36 @@ +{ + "name": "swh-search-query-language-parser", + "version": "1.0.0", + "description": "Parser for Software Heritage archive search query language", + "scripts": { + "generate": "cd query_language && tree-sitter generate --no-bindings && echo 'Generated parser files '", + "dev": "yarn generate && cd query_language && tree-sitter parse sample_query", + "test": "yarn generate && cd query_language && tree-sitter test", + "build-so": "yarn generate && cd query_language && python3 build.py", + "build-wasm": "yarn generate && cd query_language && tree-sitter build-wasm . 
&& mv tree-sitter-swh_search_ql.wasm swh_ql.wasm",
+    "build": "yarn build-so && yarn build-wasm",
+    "repl": "yarn generate && cd query_language && tree-sitter build-wasm && tree-sitter playground"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://forge.softwareheritage.org/source/swh-search.git"
+  },
+  "keywords": [
+    "swh",
+    "Software Heritage",
+    "treesitter",
+    "parser",
+    "custom",
+    "search",
+    "query",
+    "language"
+  ],
+  "author": "The Software Heritage developers",
+  "license": "GPL-3.0-only",
+  "dependencies": {
+    "nan": "^2.14.2"
+  },
+  "devDependencies": {
+    "tree-sitter-cli": "^0.20.0"
+  }
+}
diff --git a/pyproject.toml b/pyproject.toml
index 69b8f4d..4785edb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,11 +1,14 @@
 [tool.black]
 target-version = ['py37']
 
 [tool.isort]
 multi_line_output = 3
 include_trailing_comma = true
 force_grid_wrap = 0
 use_parentheses = true
 ensure_newline_before_comments = true
 line_length = 88
 force_sort_within_sections = true
+
+[build-system]
+requires = ["setuptools", "wheel", "tree_sitter"]
diff --git a/query_language/.gitignore b/query_language/.gitignore
new file mode 100644
index 0000000..4368455
--- /dev/null
+++ b/query_language/.gitignore
@@ -0,0 +1,7 @@
+src
+build
+bindings
+binding.gyp
+Cargo.toml
+package.json
+log.html
diff --git a/query_language/build.py b/query_language/build.py
new file mode 100644
index 0000000..62c3de2
--- /dev/null
+++ b/query_language/build.py
@@ -0,0 +1,3 @@
+from tree_sitter import Language
+
+Language.build_library("swh_ql.so", ["."])
diff --git a/query_language/grammar.js b/query_language/grammar.js
new file mode 100644
index 0000000..4a02fe3
--- /dev/null
+++ b/query_language/grammar.js
@@ -0,0 +1,193 @@
+// Copyright (C) 2019-2021 The Software Heritage developers
+// See the AUTHORS file at the top-level directory of this distribution
+// License: GNU General Public License version 3, or any later version
+// See top-level LICENSE file for more information
+
+
+// Precedence of the combinators used in `filters`: a higher number binds
+// tighter, so `and` groups before `or`, and brackets before both.
+const PRECEDENCE = {
+    or: 2,
+    and: 3,
+    bracket: 4,
+}
+
+module.exports = grammar({
+    name: 'swh_search_ql',
+
+    rules: {
+        query: $ => $.filters,
+
+        // Left-associative and/or combinations, a bracketed group, or one filter
+        filters: $ => choice(
+            prec.left(PRECEDENCE.and,
+                seq(
+                    field('left', $.filters),
+                    field('operator', $.and),
+                    field('right', $.filters),
+                )
+            ),
+            prec.left(PRECEDENCE.or,
+                seq(
+                    field('left', $.filters),
+                    field('operator', $.or),
+                    field('right', $.filters),
+                )
+            ),
+            prec.left(PRECEDENCE.bracket,
+                seq("(", $.filters, ")"),
+            ),
+            $.filter
+        ),
+
+        filter: $ => choice(
+            $.patternFilter,
+            $.booleanFilter,
+            $.numericFilter,
+            $.boundedListFilter,
+            $.unboundedListFilter,
+            $.dateFilter,
+            $.limitFilter
+        ),
+
+        patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal),
+        patternField: $ => token(choice('origin', 'metadata')),
+        patternOp: $ => $.equalOp,
+        patternVal: $ => $.string,
+
+        booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal),
+        booleanField: $ => token(choice('visited')),
+        booleanOp: $ => $.equalOp,
+        booleanVal: $ => choice($.booleanTrue, $.booleanFalse),
+
+        numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal),
+        numericField: $ => token(choice('visits')),
+        numericOp: $ => $.rangeOp,
+        numberVal: $ => $.number,
+
+        boundedListFilter: $ => choice($.visitTypeFilter, $.sortByFilter),
+
+        visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal),
+        visitTypeField: $ => token(choice('visit_type')),
+        visitTypeOp: $ => $.equalOp,
+        visitTypeVal: $ => createArray(optionalWrapWith($.visitTypeOptions, ["'", '"'])),
+        visitTypeOptions: $ => choice(
+            "any",
+            "cran",
+            "deb",
+            "deposit",
+            "ftp",
+            "hg",
+            "git",
+            "nixguix",
+            "npm",
+            "pypi",
+            "svn",
+            "tar"
+        ), // TODO: fetch this list dynamically from other swh services?
+
+        sortByFilter: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
+        sortByField: $ => token(choice('sort_by')),
+        sortByOp: $ => $.equalOp,
+        sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
+        sortByOptions: $ => seq(
+            optional(token.immediate('-')),
+            choice(
+                'visits',
+                'last_visit',
+                'last_eventful_visit',
+                'last_revision',
+                'last_release',
+                'created',
+                'modified',
+                'published'
+            )),
+
+        unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
+        listField: $ => token(choice('language', 'license', 'keyword')),
+        listOp: $ => $.choiceOp,
+        listVal: $ => createArray($.string),
+
+
+        dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal),
+        dateField: $ => token(choice(
+            'last_visit',
+            'last_eventful_visit',
+            'last_revision',
+            'last_release',
+            'created',
+            'modified',
+            'published'
+        )),
+        dateOp: $ => $.rangeOp,
+        dateVal: $ => $.isoDateTime,
+
+        limitFilter: $ => seq('limit', $.equalOp, $.number),
+
+
+        rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')),
+        equalOp: $ => token('='),
+        choiceOp: $ => token(choice('in', 'not in')),
+
+        isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/,
+
+        string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord),
+        number: $ => /\d+/,
+        booleanTrue: $ => "true",
+        booleanFalse: $ => "false",
+
+        or: $ => "or",
+        and: $ => "and",
+
+        // Neither quote character may appear unescaped: the content token is
+        // matched greedily, so leaving `'` out of the exclusion list would make
+        // it swallow the closing quote of a single-quoted string.
+        stringContent: $ => repeat1(choice(
+            token.immediate(/[^\\'"\n]+/),
+            $.escape_sequence
+        )),
+        // NOTE(review): `,` is not excluded, so an unquoted array member such as
+        // `[go, rust]` keeps the comma in the word; confirm this is intended.
+        singleWord: $ => /[^\s"'\[\]\(\)]+/,
+        escape_sequence: $ => token.immediate(seq(
+            '\\',
+            /(\"|\'|\\|\/|b|n|r|t|u)/
+        )),
+
+    }
+});
+
+
+function joinBySep1(rule, sep) {
+    // `rule`, then any number of `sep` each optionally followed by `rule`:
+    // repeated and trailing separators are therefore accepted
+    return seq(rule, repeat(seq(sep, optional(rule))))
+}
+
+function joinBySep(rule, sep = ",") {
+    // Same as joinBySep1, but an empty sequence is accepted too
+    return optional(joinBySep1(rule, sep))
+}
+
+function createArray(rule) {
+    // "[" ... "]" around comma-joined members, each exposed as field `array_member`
+    return seq(
+        "[",
+        joinBySep(
+            field('array_member', rule),
+            ","
+        ),
+        "]"
+    )
+}
+
+function wrapWith(rule, wrappers = ["'", '"']) {
+    // One alternative per wrapper: the same wrapper opens and closes the rule
+    const wrappedRules = wrappers.map(wrapper => seq(wrapper, rule, wrapper))
+    return choice(...wrappedRules)
+}
+
+function optionalWrapWith(rule, wrappers = ["'", '"']) {
+    // Either the rule wrapped with one of the wrappers, or the bare rule
+    return choice(wrapWith(rule, wrappers), rule)
+}
diff --git a/query_language/sample_query b/query_language/sample_query
new file mode 100644
index 0000000..3d8c08d
--- /dev/null
+++ b/query_language/sample_query
@@ -0,0 +1,6 @@
+(origin = django/django and language in ["python"] or visits >= 5) or
+(last_revision > 2020-01-01 and limit = 10) or
+(last_visit > 2021-01-01 or last_visit < 2020-01-01) or
+(visited = false and metadata = "gitlab") or
+(keyword in ["orchestration", "kubectl"] and language in ["go", "rust"]) or
+(visit_type = [deb] and license in ["GPL-3"])
diff --git a/query_language/test/corpus/combinations.txt b/query_language/test/corpus/combinations.txt
new file mode 100644
index 0000000..cf66e84
--- /dev/null
+++ b/query_language/test/corpus/combinations.txt
@@ -0,0 +1,76 @@
+==============================
+Empty query (should throw error)
+==============================
+
+---
+
+(ERROR)
+
+
+==================
+Origins with django as keyword, python language, and more than 5 visits
+==================
+
+origin = django and language in ["python"] and visits >= 5
+
+---
+(query (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent))))))) (and) (filters (filter (numericFilter (numericField) (numericOp (rangeOp)) (numberVal (number)))))))
+
+==================
+10 origins with latest revision after 2020-01-01
+==================
+last_revision > 2020-01-01
and limit = 10 +--- + +(query (filters (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (and) (filters (filter (limitFilter (equalOp) (number)))))) + +================== +Origins with last visit date not in 2020-2021 +================== + +last_visit > 2021-01-01 or last_visit < 2020-01-01 +--- +(query (filters (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (or) (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))))) + +================== +Unvisited origins with kubernetes in metadata or minikube in url +================== + +visited = false and metadata = "kubernetes" or origin = "minikube" + +--- +(query (filters (filters (filters (filter (booleanFilter (booleanField) (booleanOp (equalOp)) (booleanVal (booleanFalse))))) (and) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent))))))) (or) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent)))))))) + +================== +Origins with "orchestration" or "kubectl" as keywords and language as "go" or "rust" +================== + +keyword in ["orchestration", "kubectl"] and language in ["go", "rust"] + +--- +(query (filters (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent)))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent)))))))) + +================== +Origins with a GPL-3 license that have "debian" in their url or have visit type as "deb" +================== +(origin = debian or visit_type = ["deb"]) and license in ["GPL-3"] +--- + +(query (filters (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (or) (filters (filter (boundedListFilter (visitTypeFilter 
(visitTypeField) (visitTypeOp (equalOp)) (visitTypeVal (visitTypeOptions)))))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)))))))) + +================== +Origins with 'and' and 'or' inside filter values +================== +(origin = "foo and bar or baz") +--- + +(query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent)))))))) + + +================== +Origins with `'` and `"` inside filter values +================== +(origin = "foo \\ \'bar\' \"baz\" ") +--- + +(query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence))))))))) diff --git a/requirements-test.txt b/requirements-test.txt index d0c4f08..7b5b9a1 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,7 +1,8 @@ pytest pytest-mock confluent-kafka types-click types-pytz types-pyyaml types-requests +types-setuptools diff --git a/requirements.txt b/requirements.txt index 12608a1..422247f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. 
For the full spec or
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
 click
 elasticsearch>=7.0.0,<8.0.0
 typing-extensions
+tree_sitter
diff --git a/setup.py b/setup.py
index c6fef58..50a73c9 100755
--- a/setup.py
+++ b/setup.py
@@ -1,71 +1,174 @@
 #!/usr/bin/env python3
 # Copyright (C) 2015-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from distutils.cmd import Command
+from distutils.command.build import build
 from io import open
-from os import path
+from os import environ, makedirs, path
+import shlex
+import shutil
+import subprocess
 
 from setuptools import find_packages, setup
 
 here = path.abspath(path.dirname(__file__))
 
 # Get the long description from the README file
 with open(path.join(here, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
 
 
 def parse_requirements(name=None):
     if name:
         reqf = "requirements-%s.txt" % name
     else:
         reqf = "requirements.txt"
 
     requirements = []
     if not path.exists(reqf):
         return requirements
 
     with open(reqf) as f:
         for line in f.readlines():
             line = line.strip()
             if not line or line.startswith("#"):
                 continue
             requirements.append(line)
     return requirements
 
 
+yarn = environ.get("YARN", "yarn")
+
+
+def run_yarn(script):
+    """Run ``yarn <script>`` and fail loudly if it does not succeed.
+
+    ``YARN`` may carry extra arguments, hence the shell-like split. A non-zero
+    exit status raises ``subprocess.CalledProcessError``, so a failed step stops
+    the build instead of being silently ignored.
+    """
+    subprocess.check_call(shlex.split(yarn) + [script])
+
+
+class TSCommand(Command):
+    """Base class of the tree-sitter commands; they accept no options."""
+
+    user_options = []
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+
+class TSInstallCommand(TSCommand):
+    description = "Installs node_modules related to query language"
+
+    def run(self):
+        run_yarn("install")
+
+
+class TSGenerateCommand(TSCommand):
+    description = "Generates parser related files from grammar.js"
+
+    def run(self):
+        run_yarn("generate")
+
+
+class TSBuildSoCommand(TSCommand):
+    description = "Builds swh_ql.so"
+
+    def run(self):
+        run_yarn("build-so")
+        print("swh_ql.so file generated")
+
+
+class TSBuildWasmCommand(TSCommand):
+    description = "Builds swh_ql.wasm"
+
+    def run(self):
+        run_yarn("build-wasm")
+        print("swh_ql.wasm file generated")
+
+
+class TSBuildCommand(TSCommand):
+    description = "Builds swh_ql.so and swh_ql.wasm"
+
+    def run(self):
+        self.run_command("ts_build_so")
+        self.run_command("ts_build_wasm")
+
+
+class TSBuildExportCommand(TSCommand):
+    description = "Builds swh_ql.so and swh_ql.wasm and exports them to static/"
+
+    def run(self):
+        self.run_command("ts_install")
+        self.run_command("ts_build")
+
+        print("static files generated. copying them to static/ dir")
+        # exist_ok: a second build must not fail because static/ already exists
+        makedirs("static", exist_ok=True)
+        shutil.copy("query_language/swh_ql.so", "static/swh_ql.so")
+        shutil.copy("query_language/swh_ql.wasm", "static/swh_ql.wasm")
+
+
+class custom_build(build):
+    """``build`` command that first builds and exports the parser artifacts."""
+
+    def run(self):
+        # Nothing is generated on a dry run.
+        if not self.dry_run:
+            self.run_command("ts_build_export")
+
+        super().run()
+
+
 setup(
     name="swh.search",
     description="Software Heritage search service",
     long_description=long_description,
     long_description_content_type="text/markdown",
     python_requires=">=3.7",
     author="Software Heritage developers",
     author_email="swh-devel@inria.fr",
     url="https://forge.softwareheritage.org/diffusion/DSEA",
     packages=find_packages(),  # packages's modules
     install_requires=parse_requirements() + parse_requirements("swh"),
     tests_require=parse_requirements("test"),
     entry_points="""
         [swh.cli.subcommands]
         search=swh.search.cli
     """,
     setup_requires=["setuptools-scm"],
     use_scm_version=True,
     extras_require={"testing": parse_requirements("test")},
     include_package_data=True,
     classifiers=[
         "Programming Language :: Python :: 3",
         "Intended Audience :: Developers",
         "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
         "Operating System :: OS Independent",
         "Development Status :: 3 - Alpha",
     ],
     project_urls={
         "Bug Reports": "https://forge.softwareheritage.org/maniphest",
         "Funding": "https://www.softwareheritage.org/donate",
         "Source": "https://forge.softwareheritage.org/source/swh-search",
         "Documentation": "https://docs.softwareheritage.org/devel/swh-search/",
     },
+    cmdclass={
+        "build": custom_build,
+        "ts_install": TSInstallCommand,
+        "ts_generate": TSGenerateCommand,
+        "ts_build_so": TSBuildSoCommand,
+        "ts_build_wasm": TSBuildWasmCommand,
+        "ts_build": TSBuildCommand,
+        "ts_build_export": TSBuildExportCommand,
+    },
+    data_files=[("share/swh/search", ["static/swh_ql.so", "static/swh_ql.wasm"])],
 )
diff --git a/yarn.lock b/yarn.lock
new file mode 100644
index 0000000..a9f8eaf
--- /dev/null
+++ b/yarn.lock
@@ -0,0 +1,13 @@
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1
+
+
+nan@^2.14.2:
+  version "2.14.2"
+  resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.2.tgz#f5376400695168f4cc694ac9393d0c9585eeea19"
+  integrity sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ==
+
+tree-sitter-cli@^0.20.0:
+  version "0.20.0"
+  resolved "https://registry.yarnpkg.com/tree-sitter-cli/-/tree-sitter-cli-0.20.0.tgz#feaaa11c7ecf44a6e236aa1e2963b85d045d33cc"
+  integrity sha512-4D1qapWbJXZ5rrSUGM5rcw5Vuq/smzn9KbiFRhlON6KeuuXjra+KAtDYVrDgAoLIG4ku+jbEEGrJxCptUGi3dg==