diff --git a/PKG-INFO b/PKG-INFO index 3c1cc82..94c6798 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,90 +1,90 @@ Metadata-Version: 2.1 Name: swh.search -Version: 0.11.1 +Version: 0.11.2 Summary: Software Heritage search service Home-page: https://forge.softwareheritage.org/diffusion/DSEA Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-search Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-search/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-search ========== Search service for the Software Heritage archive. It is similar to swh-storage in what it contains, but provides different ways to query it: while swh-storage is mostly a key-value store that returns an object from a primary key, swh-search is focused on reverse indices, to allow finding objects that match some criteria; for example full-text search. Currently uses ElasticSearch, and provides only origin search (by URL and metadata) ## Dependencies - Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). - Debian-like host The elasticsearch package is required. As it's not part of debian-stable, [another debian repository is required to be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) - Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. - Emscripten is required for generating tree-sitter WASM module. The following commands need to be executed for the setup: ```bash cd /opt && git clone https://github.com/emscripten-core/emsdk.git && cd emsdk && \ ./emsdk install latest && ./emsdk activate latest PATH="${PATH}:/opt/emsdk/upstream/emscripten" ``` **Note:** If emsdk isn't found in the PATH, the tree-sitter cli automatically pulls `emscripten/emsdk` image from docker hub when `make ts-build-wasm` or `make ts-build` is used. ## Make targets Below is the list of available make targets that can be executed from the root directory of swh-search in order to build and/or execute the swh-search under various configurations: * **ts-install**: Install node_modules and emscripten SDK required for TreeSitter * **ts-generate**: Generate parser files(C and JSON) from the grammar * **ts-repl**: Starts a web based playground for the TreeSitter grammar. It's the recommended way for developing TreeSitter grammar. * **ts-dev**: Parse the `query_language/sample_query` and print the corresponding syntax expression along with the start and end positions of all the nodes. * **ts-dev sanitize=1**: Same as **ts-dev** but without start and end position of the nodes. This format is expected by TreeSitter's native test command. `sanitize=1` cleans the output of **ts-dev** using `sed` to achieve the desired format. 
* **ts-test**: executes TreeSitter's native tests * **ts-build-so**: Generates `swh_ql.so` file from the previously generated parser using py-tree-sitter * **ts-build-so**: Generates `swh_ql.wasm` file from the previously generated parser using emscripten * **ts-build**: Executes both **ts-build-so** and **ts-build-so** diff --git a/query_language/grammar.js b/query_language/grammar.js index ab5320b..594a934 100644 --- a/query_language/grammar.js +++ b/query_language/grammar.js @@ -1,216 +1,192 @@ -// Copyright (C) 2019-2021 The Software Heritage developers +// Copyright (C) 2021 The Software Heritage developers // See the AUTHORS file at the top-level directory of this distribution // License: GNU General Public License version 3, or any later version // See top-level LICENSE file for more information +const { visitTypeField, sortByField, limitField } = require("./tokens.js"); +const { + patternFields, + booleanFields, + numericFields, + listFields, + dateFields +} = require("./tokens.js"); +const { equalOp, rangeOp, choiceOp } = require("./tokens.js"); +const { sortByOptions, visitTypeOptions } = require("./tokens.js"); +const { OR, AND, TRUE, FALSE } = require("./tokens.js"); const PRECEDENCE = { or: 2, and: 3, bracket: 4, } module.exports = grammar({ name: 'swh_search_ql', rules: { query: $ => seq( $.filters, - optional($.and), - choice( - seq(optional($.sortBy), optional($.and), optional($.limit)), - seq(optional($.limit), optional($.and), optional($.sortBy)), - ), + optional(seq( + optional($.and), + choice( + seq($.sortBy, optional($.and), optional($.limit)), + seq($.limit, optional($.and), optional($.sortBy)), + ), + )) ), - - filters: $ => choice( prec.left(PRECEDENCE.and, seq( field('left', $.filters), field('operator', $.and), field('right', $.filters), ) ), prec.left(PRECEDENCE.or, seq( field('left', $.filters), field('operator', $.or), field('right', $.filters), ) ), prec.left(PRECEDENCE.bracket, seq("(", $.filters, ")"), ), $.filter ), - sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal), - sortByField: $ => token('sort_by'), + sortBy: $ => annotateFilter($.sortByField, $.sortByOp, $.sortByVal), + sortByField: $ => token(sortByField), sortByOp: $ => $.equalOp, sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])), - sortByOptions: $ => seq(optional(token.immediate('-')), choice( - 'visits', - 'last_visit', - 'last_eventful_visit', - 'last_revision', - 'last_release', - 'created', - 'modified', - 'published' - )), + sortByOptions: $ => seq( + optional('-'), + choice(...sortByOptions) + ), - limit: $ => seq('limit', $.equalOp, $.number), + limit: $ => annotateFilter($.limitField, $.equalOp, $.number), + limitField: $ => token(limitField), - filter: $ => choice( + filter: $ => field('category', choice( $.patternFilter, $.booleanFilter, $.numericFilter, $.boundedListFilter, $.unboundedListFilter, $.dateFilter - ), + )), - patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal), - patternField: $ => token(choice('origin', 'metadata')), + patternFilter: $ => annotateFilter($.patternField, $.patternOp, $.patternVal), + patternField: $ => token(choice(...patternFields)), patternOp: $ => $.equalOp, patternVal: $ => $.string, - booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal), - booleanField: $ => token(choice('visited')), + booleanFilter: $ => annotateFilter($.booleanField, $.booleanOp, $.booleanVal), + booleanField: $ => token(choice(...booleanFields)), booleanOp: $ => $.equalOp, booleanVal: $ => choice($.booleanTrue, 
$.booleanFalse), - numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal), - numericField: $ => token(choice('visits')), + numericFilter: $ => annotateFilter($.numericField, $.numericOp, $.numberVal), + numericField: $ => token(choice(...numericFields)), numericOp: $ => $.rangeOp, numberVal: $ => $.number, + // Array members must be from the given options boundedListFilter: $ => choice($.visitTypeFilter), - visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal), - visitTypeField: $ => token(choice('visit_type')), + visitTypeFilter: $ => annotateFilter($.visitTypeField, $.visitTypeOp, $.visitTypeVal), + visitTypeField: $ => token(visitTypeField), visitTypeOp: $ => $.equalOp, visitTypeVal: $ => createArray(optionalWrapWith($.visitTypeOptions, ["'", '"'])), - visitTypeOptions: $ => choice( - "any", - "cran", - "deb", - "deposit", - "ftp", - "hg", - "git", - "nixguix", - "npm", - "pypi", - "svn", - "tar" - ), // TODO: fetch this list dynamically from other swh services? - - sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal), - sortByField: $ => token(choice('sort_by')), - sortByOp: $ => $.equalOp, - sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])), - sortByOptions: $ => seq( - optional('-'), - choice( - 'visits', - 'last_visit', - 'last_eventful_visit', - 'last_revision', - 'last_release', - 'created', - 'modified', - 'published' - ) - ), + visitTypeOptions: $ => choice(...visitTypeOptions), + // TODO: fetch visitTypeOptions choices dynamically from other swh services? - unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal), - listField: $ => token(choice('language', 'license', 'keyword')), + // Array members can be any string + unboundedListFilter: $ => annotateFilter($.listField, $.listOp, $.listVal), + listField: $ => token(choice(...listFields)), listOp: $ => $.choiceOp, listVal: $ => createArray($.string), - - dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal), - dateField: $ => token(choice( - 'last_visit', - 'last_eventful_visit', - 'last_revision', - 'last_release', - 'created', - 'modified', - 'published' - )), + dateFilter: $ => annotateFilter($.dateField, $.dateOp, $.dateVal), + dateField: $ => token(choice(...dateFields)), dateOp: $ => $.rangeOp, dateVal: $ => $.isoDateTime, - limit: $ => seq('limit', $.equalOp, $.number), - - - rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')), - equalOp: $ => token('='), - choiceOp: $ => token(choice('in', 'not in')), + rangeOp: $ => token(choice(...rangeOp)), + equalOp: $ => token(choice(...equalOp)), + choiceOp: $ => token(choice(...choiceOp)), isoDateTime: $ => { const dateRegex = (/\d{4}[-]\d{2}[-]\d{2}/).source const dateTimeSepRegex = (/(\s|T)*/).source const timeRegex = (/(\d{2}:\d{2}(:\d{2}(\.\d{6})?)?)?/).source const timezoneRegex = (/(\+\d{2}:\d{2}|Z)?/).source return new RegExp(dateRegex + dateTimeSepRegex + timeRegex + timezoneRegex) }, string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord), number: $ => /\d+/, - booleanTrue: $ => "true", - booleanFalse: $ => "false", + booleanTrue: $ => TRUE, + booleanFalse: $ => FALSE, - or: $ => "or", - and: $ => "and", + or: $ => OR, + and: $ => AND, + singleWord: $ => /[^\s"'\[\]\(\),]+/, + + // Based on tree-sitter-json grammar: stringContent: $ => repeat1(choice( token.immediate(/[^\\'"\n]+/), $.escape_sequence )), - singleWord: $ => /[^\s"'\[\]\(\),]+/, escape_sequence: $ => token.immediate(seq( '\\', /(\"|\'|\\|\/|b|n|r|t|u)/ )), } }); function joinBySep1(rule, sep) { // At least one 
repetition of the rule separated by `sep` return seq(rule, repeat(seq(sep, optional(rule)))) } function joinBySep(rule, sep = ",") { // Any number of repetitions of the rule separated by `sep` return optional(joinBySep1(rule, sep)) } function createArray(rule) { // An array having `rule` as its member return seq( "[", joinBySep( field('array_member', rule), "," ), "]" ) } function wrapWith(rule, wrappers = ["'", '"']) { // The rule must be wrapped with one of the wrappers const wrappedRules = wrappers.map(wrapper => seq(wrapper, rule, wrapper)) return choice(...wrappedRules) } function optionalWrapWith(rule, wrappers = ["'", '"']) { // The rule may or may not be wrapped with the wrappers return choice(wrapWith(rule, wrappers), rule) } + +function annotateFilter(filterField, filterOp, filterVal) { + return seq( + field('field', filterField), + field('op', filterOp), + field('value', filterVal) + ); +} diff --git a/query_language/test/corpus/combinations.txt b/query_language/test/corpus/combinations.txt index 07802ea..999ed76 100644 --- a/query_language/test/corpus/combinations.txt +++ b/query_language/test/corpus/combinations.txt @@ -1,75 +1,82 @@ ============================== Empty query (should throw error) ============================== --- (ERROR) ================== Origins with django as keyword, python language, and more than 5 visits ================== origin = django and language in ["python"] and visits >= 5 --- (query (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent))))))) (and) (filters (filter (numericFilter (numericField) (numericOp (rangeOp)) (numberVal (number))))))) ================== 10 origins with latest revision after 2020-01-01 ================== last_revision > 2020-01-01 limit = 10 --- -(query (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (limit (equalOp) (number))) +(query (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (limit (limitField) (equalOp) (number))) ================== Origins with last visit date not in 2020-2021 (sorted by number of visits) ================== last_visit > 2021-01-01 or last_visit < 2020-01-01 sort_by = ["visits"] --- (query (filters (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime))))) (or) (filters (filter (dateFilter (dateField) (dateOp (rangeOp)) (dateVal (isoDateTime)))))) (sortBy (sortByField) (sortByOp (equalOp)) (sortByVal (sortByOptions)))) ================== Unvisited origins with kubernetes in metadata or minikube in url ================== visited = false and metadata = "kubernetes" or origin = "minikube" --- (query (filters (filters (filters (filter (booleanFilter (booleanField) (booleanOp (equalOp)) (booleanVal (booleanFalse))))) (and) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent))))))) (or) (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent)))))))) ================== Origins with "orchestration" or "kubectl" as keywords and language as "go" or "rust" ================== keyword in ["orchestration", "kubectl"] and language in ["go", "rust"] --- (query (filters (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent)))))) (and) (filters (filter 
(unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)) (string (stringContent)))))))) ================== Origins with a GPL-3 license that have "debian" in their url or have visit type as "deb" ================== (origin = debian or visit_type = ["deb"]) and license in ["GPL-3"] --- (query (filters (filters (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (singleWord)))))) (or) (filters (filter (boundedListFilter (visitTypeFilter (visitTypeField) (visitTypeOp (equalOp)) (visitTypeVal (visitTypeOptions)))))))) (and) (filters (filter (unboundedListFilter (listField) (listOp (choiceOp)) (listVal (string (stringContent)))))))) ================== Origins with `and` and `or` inside filter values ================== (origin = "foo and bar or baz") --- (query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent)))))))) ================== Origins with `'` and `"` inside filter values ================== (origin = "foo \\ \'bar\' \"baz\" ") --- (query (filters (filters (filter (patternFilter (patternField) (patternOp (equalOp)) (patternVal (string (stringContent (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence) (escape_sequence))))))))) + +================== +Incomplete conjunction operators should throw error +================== +visits > 5 and +--- +(query (filters (filter (numericFilter (numericField) (numericOp (rangeOp)) (numberVal (number))))) (ERROR (and))) diff --git a/query_language/tokens.js b/query_language/tokens.js new file mode 100644 index 0000000..47c9968 --- /dev/null +++ b/query_language/tokens.js @@ -0,0 +1,105 @@ +// Copyright (C) 2021 The Software Heritage developers +// See the AUTHORS file at the top-level directory of this distribution +// License: GNU General Public License version 3, or any later version +// See top-level LICENSE file for more information + +// Field tokens +const visitTypeField = 'visit_type'; +const sortByField = 'sort_by'; +const limitField = 'limit'; + +// Field categories +const patternFields = ['origin', 'metadata']; +const booleanFields = ['visited']; +const numericFields = ['visits']; +const boundedListFields = [visitTypeField]; +const listFields = ['language', 'license', 'keyword']; +const dateFields = [ + 'last_visit', + 'last_eventful_visit', + 'last_revision', + 'last_release', + 'created', + 'modified', + 'published' +]; + +const fields = [].concat( + patternFields, + booleanFields, + numericFields, + boundedListFields, + listFields, + dateFields +); + +// Operators +const equalOp = ['=']; +const rangeOp = ['<', '<=', '=', '!=', '>=', '>']; +const choiceOp = ['in', 'not in']; + + +// Values +const sortByOptions = [ + 'visits', + 'last_visit', + 'last_eventful_visit', + 'last_revision', + 'last_release', + 'created', + 'modified', + 'published' +]; + +const visitTypeOptions = [ + "any", + "cran", + "deb", + "deposit", + "ftp", + "hg", + "git", + "nixguix", + "npm", + "pypi", + "svn", + "tar" +]; + +// Extra tokens +const OR = "or"; +const AND = "and"; + +const TRUE = "true"; +const FALSE = "false"; + +module.exports = { + // Field tokens + visitTypeField, + sortByField, + limitField, + + // Field categories + patternFields, + booleanFields, + numericFields, + boundedListFields, + listFields, + dateFields, + fields, + + // Operators + equalOp, + rangeOp, + choiceOp, + + // Values + sortByOptions, + visitTypeOptions, + + // Extra tokens + OR, + AND, + TRUE, + FALSE +} diff --git 
a/swh.search.egg-info/PKG-INFO b/swh.search.egg-info/PKG-INFO index 3c1cc82..94c6798 100644 --- a/swh.search.egg-info/PKG-INFO +++ b/swh.search.egg-info/PKG-INFO @@ -1,90 +1,90 @@ Metadata-Version: 2.1 Name: swh.search -Version: 0.11.1 +Version: 0.11.2 Summary: Software Heritage search service Home-page: https://forge.softwareheritage.org/diffusion/DSEA Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-search Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-search/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-search ========== Search service for the Software Heritage archive. It is similar to swh-storage in what it contains, but provides different ways to query it: while swh-storage is mostly a key-value store that returns an object from a primary key, swh-search is focused on reverse indices, to allow finding objects that match some criteria; for example full-text search. Currently uses ElasticSearch, and provides only origin search (by URL and metadata) ## Dependencies - Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). - Debian-like host The elasticsearch package is required. As it's not part of debian-stable, [another debian repository is required to be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) - Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. - Emscripten is required for generating tree-sitter WASM module. The following commands need to be executed for the setup: ```bash cd /opt && git clone https://github.com/emscripten-core/emsdk.git && cd emsdk && \ ./emsdk install latest && ./emsdk activate latest PATH="${PATH}:/opt/emsdk/upstream/emscripten" ``` **Note:** If emsdk isn't found in the PATH, the tree-sitter cli automatically pulls `emscripten/emsdk` image from docker hub when `make ts-build-wasm` or `make ts-build` is used. ## Make targets Below is the list of available make targets that can be executed from the root directory of swh-search in order to build and/or execute the swh-search under various configurations: * **ts-install**: Install node_modules and emscripten SDK required for TreeSitter * **ts-generate**: Generate parser files(C and JSON) from the grammar * **ts-repl**: Starts a web based playground for the TreeSitter grammar. It's the recommended way for developing TreeSitter grammar. * **ts-dev**: Parse the `query_language/sample_query` and print the corresponding syntax expression along with the start and end positions of all the nodes. * **ts-dev sanitize=1**: Same as **ts-dev** but without start and end position of the nodes. This format is expected by TreeSitter's native test command. 
`sanitize=1` cleans the output of **ts-dev** using `sed` to achieve the desired format. * **ts-test**: executes TreeSitter's native tests * **ts-build-so**: Generates `swh_ql.so` file from the previously generated parser using py-tree-sitter * **ts-build-so**: Generates `swh_ql.wasm` file from the previously generated parser using emscripten * **ts-build**: Executes both **ts-build-so** and **ts-build-so** diff --git a/swh.search.egg-info/SOURCES.txt b/swh.search.egg-info/SOURCES.txt index 6d24cc4..f4609a7 100644 --- a/swh.search.egg-info/SOURCES.txt +++ b/swh.search.egg-info/SOURCES.txt @@ -1,70 +1,71 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile Makefile.local README.md mypy.ini package.json pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini yarn.lock docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/query-language.rst docs/_static/.placeholder docs/_templates/.placeholder es_config/elasticsearch.keystore es_config/elasticsearch.yml es_config/jvm.options es_config/log4j2.properties query_language/.gitignore query_language/build.py query_language/grammar.js query_language/sample_query +query_language/tokens.js query_language/test/corpus/combinations.txt swh/__init__.py swh.search.egg-info/PKG-INFO swh.search.egg-info/SOURCES.txt swh.search.egg-info/dependency_links.txt swh.search.egg-info/entry_points.txt swh.search.egg-info/not-zip-safe swh.search.egg-info/requires.txt swh.search.egg-info/top_level.txt swh/search/__init__.py swh/search/cli.py swh/search/elasticsearch.py swh/search/in_memory.py swh/search/interface.py swh/search/journal_client.py swh/search/metrics.py swh/search/py.typed swh/search/translator.py swh/search/utils.py swh/search/api/__init__.py swh/search/api/client.py swh/search/api/server.py swh/search/tests/__init__.py swh/search/tests/conftest.py swh/search/tests/test_api_client.py swh/search/tests/test_cli.py swh/search/tests/test_elasticsearch.py swh/search/tests/test_in_memory.py swh/search/tests/test_init.py swh/search/tests/test_journal_client.py swh/search/tests/test_search.py swh/search/tests/test_server.py swh/search/tests/test_translator.py \ No newline at end of file diff --git a/swh/search/cli.py b/swh/search/cli.py index 5893027..6979852 100644 --- a/swh/search/cli.py +++ b/swh/search/cli.py @@ -1,113 +1,150 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control +import logging + import click from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group @swh_cli_group.group(name="search", context_settings=CONTEXT_SETTINGS) @click.option( "--config-file", "-C", default=None, type=click.Path(exists=True, dir_okay=False,), help="Configuration file.", ) @click.pass_context def search_cli_group(ctx, config_file): """Software Heritage Search tools.""" from swh.core import config ctx.ensure_object(dict) conf = config.read(config_file) ctx.obj["config"] = conf @search_cli_group.command("initialize") @click.pass_context def initialize(ctx): """Creates Elasticsearch indices.""" from . 
import get_search search = get_search(**ctx.obj["config"]["search"]) search.initialize() print("Done.") +@search_cli_group.command(name="rpc-serve") +@click.option( + "--host", + default="0.0.0.0", + metavar="IP", + show_default=True, + help="Host ip address to bind the server on", +) +@click.option( + "--port", + default=5010, + type=click.INT, + metavar="PORT", + show_default=True, + help="Binding port of the server", +) +@click.option( + "--debug/--no-debug", + default=True, + help="Indicates if the server should run in debug mode", +) +@click.pass_context +def serve(ctx, host, port, debug): + """Software Heritage Storage RPC server. + + Do NOT use this in a production environment. + """ + from swh.search.api.server import app + + if "log_level" in ctx.obj: + logging.getLogger("werkzeug").setLevel(ctx.obj["log_level"]) + app.config.update(ctx.obj["config"]) + app.run(host, port=int(port), debug=bool(debug)) + + @search_cli_group.group("journal-client") @click.pass_context def journal_client(ctx): """""" pass @journal_client.command("objects") @click.option( "--stop-after-objects", "-m", default=None, type=int, help="Maximum number of objects to replay. Default is to run forever.", ) @click.option( "--object-type", "-o", multiple=True, help="Default list of object types to subscribe to", ) @click.option( "--prefix", "-p", help="Topic prefix to use (e.g swh.journal.indexed)", ) @click.pass_context def journal_client_objects(ctx, stop_after_objects, object_type, prefix): """Listens for new objects from the SWH Journal, and schedules tasks to run relevant indexers (currently, origin and origin_visit) on these new objects. """ import functools from swh.journal.client import get_journal_client from swh.storage import get_storage from . import get_search from .journal_client import process_journal_objects config = ctx.obj["config"] journal_cfg = config["journal"] journal_cfg["object_types"] = object_type or journal_cfg.get("object_types", []) journal_cfg["prefix"] = prefix or journal_cfg.get("prefix") journal_cfg["stop_after_objects"] = stop_after_objects or journal_cfg.get( "stop_after_objects" ) if len(journal_cfg["object_types"]) == 0: raise ValueError("'object_types' must be specified by cli or configuration") if journal_cfg["prefix"] is None: raise ValueError("'prefix' must be specified by cli or configuration") client = get_journal_client(cls="kafka", **journal_cfg,) search = get_search(**config["search"]) storage = get_storage(**config["storage"]) worker_fn = functools.partial( process_journal_objects, search=search, storage=storage ) nb_messages = 0 try: nb_messages = client.process(worker_fn) print("Processed %d messages." 
% nb_messages) except KeyboardInterrupt: ctx.exit(0) else: print("Done.") finally: client.close() diff --git a/swh/search/static/swh_ql.so b/swh/search/static/swh_ql.so index e193991..28b901d 100644 Binary files a/swh/search/static/swh_ql.so and b/swh/search/static/swh_ql.so differ diff --git a/swh/search/static/swh_ql.wasm b/swh/search/static/swh_ql.wasm index a5f111b..cd6914b 100644 Binary files a/swh/search/static/swh_ql.wasm and b/swh/search/static/swh_ql.wasm differ diff --git a/swh/search/translator.py b/swh/search/translator.py index 9a607bf..03c6344 100644 --- a/swh/search/translator.py +++ b/swh/search/translator.py @@ -1,301 +1,301 @@ import os from pkg_resources import resource_filename from tree_sitter import Language, Parser from swh.search.utils import get_expansion, unescape class Translator: RANGE_OPERATOR_MAP = { ">": "gt", "<": "lt", ">=": "gte", "<=": "lte", } def __init__(self): ql_rel_paths = [ "static/swh_ql.so", # installed - "../../query_language/static/swh_ql.so", # development + "../../query_language/swh_ql.so", # development ] for ql_rel_path in ql_rel_paths: ql_path = resource_filename("swh.search", ql_rel_path) if os.path.exists(ql_path): break else: assert False, "swh_ql.so was not found in any of the expected paths" search_ql = Language(ql_path, "swh_search_ql") self.parser = Parser() self.parser.set_language(search_ql) self.query = "" def parse_query(self, query): self.query = query tree = self.parser.parse(query.encode("utf8")) self.query_node = tree.root_node if self.query_node.has_error: raise Exception("Invalid query") return self._traverse(self.query_node) def _traverse(self, node): if len(node.children) == 3 and node.children[1].type == "filters": # filters => ( filters ) return self._traverse(node.children[1]) # Go past the () brackets if node.type == "query": result = {} for child in node.children: # query => filters sort_by limit result[child.type] = self._traverse(child) return result if node.type == "filters": if len(node.children) == 1: # query => filters # filters => filters # filters => filter # Current node is just a wrapper, so go one level deep return self._traverse(node.children[0]) if len(node.children) == 3: # filters => filters conj_op filters filters1 = self._traverse(node.children[0]) conj_op = self._get_value(node.children[1]) filters2 = self._traverse(node.children[2]) if conj_op == "and": # "must" is equivalent to "AND" return {"bool": {"must": [filters1, filters2]}} if conj_op == "or": # "should" is equivalent to "OR" return {"bool": {"should": [filters1, filters2]}} if node.type == "filter": filter_category = node.children[0] return self._parse_filter(filter_category) if node.type == "sortBy": return self._parse_filter(node) if node.type == "limit": return self._parse_filter(node) return Exception( f"Unknown node type ({node.type}) " f"or unexpected number of children ({node.children})" ) def _get_value(self, node): if ( len(node.children) > 0 and node.children[0].type == "[" and node.children[-1].type == "]" ): # array return [self._get_value(child) for child in node.children if child.is_named] start = node.start_point[1] end = node.end_point[1] value = self.query[start:end] if len(value) > 1 and ( (value[0] == "'" and value[-1] == "'") or (value[0] and value[-1] == '"') ): return unescape(value[1:-1]) if node.type in ["number", "numberVal"]: return int(value) return unescape(value) def _parse_filter(self, filter): if filter.type == "boundedListFilter": filter = filter.children[0] children = filter.children assert len(children) 
== 3 category = filter.type name, op, value = [self._get_value(child) for child in children] if category == "patternFilter": if name == "origin": return { "multi_match": { "query": value, "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } } elif name == "metadata": return { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": value, # Makes it so that the "foo bar" query returns # documents which contain "foo" in a field and "bar" # in a different field "type": "cross_fields", # All keywords must be found in a document for it to # be considered a match. # TODO: allow missing keywords? "operator": "and", # Searches on all fields of the intrinsic_metadata dict, # recursively. "fields": ["intrinsic_metadata.*"], # date{Created,Modified,Published} are of type date "lenient": True, } }, } } if category == "booleanFilter": if name == "visited": return {"term": {"has_visits": value == "true"}} if category == "numericFilter": if name == "visits": if op in ["=", "!="]: return { "bool": { ("must" if op == "=" else "must_not"): [ {"range": {"nb_visits": {"gte": value, "lte": value}}} ] } } else: return { "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}} } if category == "visitTypeFilter": if name == "visit_type": return {"terms": {"visit_types": value}} if category == "unboundedListFilter": value_array = value if name == "keyword": return { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": " ".join(value_array), "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), # "^2" boosts an origin's score by 2x # if it the queried keywords are # found in its intrinsic_metadata.keywords ], } }, } } elif name in ["language", "license"]: name_mapping = { "language": "programming_languages", "license": "licenses", } name = name_mapping[name] return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "should": [ {"match": {get_expansion(name, "."): val}} for val in value_array ], } }, } } if category == "dateFilter": if name in ["created", "modified", "published"]: if op in ["=", "!="]: return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { ("must" if op == "=" else "must_not"): [ { "range": { get_expansion(f"date_{name}", "."): { "gte": value, "lte": value, } } } ], } }, } } return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "must": [ { "range": { get_expansion(f"date_{name}", "."): { self.RANGE_OPERATOR_MAP[op]: value, } } } ], } }, } } else: if op in ["=", "!="]: return { "bool": { ("must" if op == "=" else "must_not"): [ { "range": { f"{name}_date": {"gte": value, "lte": value,} } } ], } } return { "range": { f"{name}_date": { self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"), } } } if category == "sortBy": return value if category == "limit": return value raise Exception(f"Unknown filter {category}.{name}")
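To make the grammar and translator changes above concrete, here is a small usage sketch (not part of this diff) of how `Translator.parse_query` turns queries from the test corpus into Elasticsearch DSL fragments. It assumes `swh.search` is installed together with a compiled `swh_ql.so` so the tree-sitter grammar can be loaded; the commented outputs follow from the translation rules in `swh/search/translator.py` shown above.

```python
# Usage sketch, assuming swh.search is installed with a compiled swh_ql.so.
from swh.search.translator import Translator

translator = Translator()

# "and" between two filters becomes a boolean "must" clause; the pattern
# filter on "origin" maps to a multi_match on the URL fields, and the
# numeric filter on "visits" maps to a range query on nb_visits.
print(translator.parse_query("origin = django and visits >= 5"))
# {'filters': {'bool': {'must': [
#     {'multi_match': {'query': 'django', 'type': 'bool_prefix',
#                      'operator': 'and',
#                      'fields': ['url.as_you_type',
#                                 'url.as_you_type._2gram',
#                                 'url.as_you_type._3gram']}},
#     {'range': {'nb_visits': {'gte': 5}}}]}}}

# With the reworked query rule, "limit" is parsed as its own node
# (limitField equalOp number) and surfaces as a top-level key.
print(translator.parse_query("last_revision > 2020-01-01 limit = 10"))
# {'filters': {'range': {'last_revision_date': {'gt': '2020-01-01'}}},
#  'limit': 10}
```

Invalid input, such as the dangling conjunction covered by the new "Incomplete conjunction operators should throw error" corpus entry, makes `parse_query` raise `Exception("Invalid query")` since the parse tree contains an ERROR node.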