diff --git a/README.md b/README.md index 1406f99..066fb49 100644 --- a/README.md +++ b/README.md @@ -1,68 +1,64 @@ swh-search ========== Search service for the Software Heritage archive. It is similar to swh-storage in what it contains, but provides different ways to query it: while swh-storage is mostly a key-value store that returns an object from a primary key, swh-search is focused on reverse indices, to allow finding objects that match some criteria; for example full-text search. It currently uses ElasticSearch, and provides only origin search (by URL and metadata). ## Dependencies - Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). - Debian-like host The elasticsearch package is required. As it's not part of debian-stable, [another debian repository is required to be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) - Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. -- The `tree-sitter` module is required in the python venv to launch the yarn build. To install it, run: -```bash -pip install tree-sitter==0.19.0 -``` - Emscripten is required for generating the tree-sitter WASM module. The following commands need to be executed for the setup: ```bash cd /opt && git clone https://github.com/emscripten-core/emsdk.git && cd emsdk && \ ./emsdk install latest && ./emsdk activate latest PATH="${PATH}:/opt/emsdk/upstream/emscripten" ``` **Note:** If emsdk isn't found in the PATH, the tree-sitter CLI automatically pulls the `emscripten/emsdk` image from Docker Hub when `make ts-build-wasm` or `make ts-build` is used. 
## Make targets Below is the list of available make targets that can be executed from the root directory of swh-search in order to build and/or execute the swh-search under various configurations: * **ts-install**: Install node_modules and emscripten SDK required for TreeSitter * **ts-generate**: Generate parser files (C and JSON) from the grammar * **ts-repl**: Starts a web-based playground for the TreeSitter grammar. It's the recommended way for developing TreeSitter grammar. * **ts-dev**: Parse the `query_language/sample_query` and print the corresponding syntax expression along with the start and end positions of all the nodes. * **ts-dev sanitize=1**: Same as **ts-dev** but without start and end positions of the nodes. This format is expected by TreeSitter's native test command. `sanitize=1` cleans the output of **ts-dev** using `sed` to achieve the desired format. * **ts-test**: Executes TreeSitter's native tests * **ts-build-so**: Generates `swh_ql.so` file from the previously generated parser using py-tree-sitter * **ts-build-wasm**: Generates `swh_ql.wasm` file from the previously generated parser using emscripten * **ts-build**: Executes both **ts-build-so** and **ts-build-wasm** diff --git a/package.json b/package.json index 71c6ebb..523b762 100644 --- a/package.json +++ b/package.json @@ -1,36 +1,36 @@ { "name": "swh-search-query-language-parser", "version": "1.0.0", "description": "Parser for Software Heritage archive search query language", "scripts": { "generate": "cd query_language && tree-sitter generate --no-bindings && echo 'Generated parser files '", "dev": "yarn generate && cd query_language && tree-sitter parse sample_query", "test": "yarn generate && cd query_language && tree-sitter test", "build-so": "yarn generate && cd query_language && python3 build.py", "build-wasm": "yarn generate && cd query_language && tree-sitter build-wasm . 
&& mv tree-sitter-swh_search_ql.wasm swh_ql.wasm", - "build": "yarn build-so && yarn build-wasm", + "build": "echo 'use `pip3 install .` or `pip3 wheel .` instead.'", "repl": "yarn generate && cd query_language && tree-sitter build-wasm && tree-sitter playground" }, "repository": { "type": "git", "url": "https://forge.softwareheritage.org/source/swh-search.git" }, "keywords": [ "swh", "Software Heritage", "treesitter", "parser", "custom", "search", "query", "language" ], "author": "The Software Heritage developers", "license": "GPL-3.0-only", "dependencies": { "nan": "^2.14.2" }, "devDependencies": { "tree-sitter-cli": "^0.20.0" } } diff --git a/setup.py b/setup.py index 448b6ac..52de412 100755 --- a/setup.py +++ b/setup.py @@ -1,194 +1,201 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.cmd import Command from io import open import os import shutil import subprocess +import sys from setuptools import find_packages, setup from setuptools.command.build_py import build_py from setuptools.command.sdist import sdist here = os.path.abspath(os.path.dirname(__file__)) # Get the long description from the README file with open(os.path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not os.path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements yarn = os.environ.get("YARN", "yarn") class TSCommand(Command): user_options = [] def initialize_options(self): pass def finalize_options(self): pass class TSInstallCommand(TSCommand): 
description = "Installs node_modules related to query language" def run(self): subprocess.run([yarn, "install"], check=True) class TSGenerateCommand(TSCommand): description = "Generates parser related files from grammar.js" def run(self): subprocess.run([yarn, "generate"], check=True) class TSBuildSoCommand(TSCommand): description = "Builds swh_ql.so" def run(self): - subprocess.run([yarn, "build-so"], check=True) + # setup_requires changes sys.path so the build dependencies + # can be imported even though they are in a temporary + # directory (usually `.eggs`). We need to pass this updated sys.path to + # 'yarn build-so', as it invokes a Python script that needs to import + # tree_sitter + env = {**os.environ, "PYTHONPATH": os.pathsep.join(sys.path)} + subprocess.run([yarn, "build-so"], check=True, env=env) print("swh_ql.so file generated") class TSBuildWasmCommand(TSCommand): description = "Builds swh_ql.wasm" def run(self): subprocess.run([yarn, "build-wasm"], check=True) print("swh_ql.wasm file generated") class TSBuildCommand(TSCommand): description = "Builds swh_ql.so and swh_ql.wasm" def run(self): self.run_command("ts_build_so") self.run_command("ts_build_wasm") class TSBuildExportCommand(TSCommand): description = "Builds swh_ql.so and swh_ql.wasm and exports them to static/" def initialize_options(self): self.build_lib = None super().initialize_options() def finalize_options(self): self.set_undefined_options("build", ("build_lib", "build_lib")) super().finalize_options() def run(self): self.run_command("ts_install") self.run_command("ts_build") print("static files generated. 
copying them to package dir") os.makedirs(os.path.join(self.build_lib, "swh/search/static"), exist_ok=True) shutil.copyfile( "query_language/swh_ql.so", os.path.join(self.build_lib, "swh/search/static/swh_ql.so"), ) shutil.copyfile( "query_language/swh_ql.wasm", os.path.join(self.build_lib, "swh/search/static/swh_ql.wasm"), ) class custom_build(build_py): def run(self): super().run() if not self.dry_run: self.run_command("ts_build_export") class custom_sdist(sdist): def make_release_tree(self, base_dir, files): super().make_release_tree(base_dir, files) # TODO: build the .c file and .wasm but not .so, because it's architecture- # dependent, and shouldn't be in a sdist (aka *source* distribution) if not self.dry_run: self.run_command("ts_install") self.run_command("ts_build") print("static files generated. copying them to package dir") os.makedirs(os.path.join(base_dir, "swh/search/static"), exist_ok=True) shutil.copyfile( "query_language/swh_ql.so", os.path.join(base_dir, "swh/search/static/swh_ql.so"), ) shutil.copyfile( "query_language/swh_ql.wasm", os.path.join(base_dir, "swh/search/static/swh_ql.wasm"), ) setup( name="swh.search", description="Software Heritage search service", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DSEA", packages=find_packages(), # packages's modules install_requires=parse_requirements() + parse_requirements("swh"), tests_require=parse_requirements("test"), entry_points=""" [swh.cli.subcommands] search=swh.search.cli """, - setup_requires=["setuptools-scm"], + setup_requires=["setuptools-scm", "tree-sitter==0.19.0"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, include_package_data=True, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public 
License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 3 - Alpha", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-search", "Documentation": "https://docs.softwareheritage.org/devel/swh-search/", }, cmdclass={ "build_py": custom_build, "sdist": custom_sdist, "ts_install": TSInstallCommand, "ts_generate": TSGenerateCommand, "ts_build_so": TSBuildSoCommand, "ts_build_wasm": TSBuildWasmCommand, "ts_build": TSBuildCommand, "ts_build_export": TSBuildExportCommand, }, zip_safe=False, )