diff --git a/swh/search/cli.py b/swh/search/cli.py
index 5893027..6979852 100644
--- a/swh/search/cli.py
+++ b/swh/search/cli.py
@@ -1,113 +1,150 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 # WARNING: do not import unnecessary things here to keep cli startup time under
 # control
+import logging
+
 import click

 from swh.core.cli import CONTEXT_SETTINGS
 from swh.core.cli import swh as swh_cli_group


 @swh_cli_group.group(name="search", context_settings=CONTEXT_SETTINGS)
 @click.option(
     "--config-file",
     "-C",
     default=None,
     type=click.Path(exists=True, dir_okay=False,),
     help="Configuration file.",
 )
 @click.pass_context
 def search_cli_group(ctx, config_file):
     """Software Heritage Search tools."""
     from swh.core import config

     ctx.ensure_object(dict)
     conf = config.read(config_file)
     ctx.obj["config"] = conf


 @search_cli_group.command("initialize")
 @click.pass_context
 def initialize(ctx):
     """Creates Elasticsearch indices."""
     from . import get_search

     search = get_search(**ctx.obj["config"]["search"])
     search.initialize()
     print("Done.")


+@search_cli_group.command(name="rpc-serve")
+@click.option(
+    "--host",
+    default="0.0.0.0",
+    metavar="IP",
+    show_default=True,
+    help="Host IP address to bind the server on",
+)
+@click.option(
+    "--port",
+    default=5010,
+    type=click.INT,
+    metavar="PORT",
+    show_default=True,
+    help="Binding port of the server",
+)
+@click.option(
+    "--debug/--no-debug",
+    default=True,
+    help="Indicates if the server should run in debug mode",
+)
+@click.pass_context
+def serve(ctx, host, port, debug):
+    """Software Heritage Search RPC server.
+
+    Do NOT use this in a production environment.
+    """
+    from swh.search.api.server import app
+
+    if "log_level" in ctx.obj:
+        logging.getLogger("werkzeug").setLevel(ctx.obj["log_level"])
+    app.config.update(ctx.obj["config"])
+    app.run(host, port=int(port), debug=bool(debug))
+
+
 @search_cli_group.group("journal-client")
 @click.pass_context
 def journal_client(ctx):
     """"""
     pass


 @journal_client.command("objects")
 @click.option(
     "--stop-after-objects",
     "-m",
     default=None,
     type=int,
     help="Maximum number of objects to replay. Default is to run forever.",
 )
 @click.option(
     "--object-type",
     "-o",
     multiple=True,
     help="Default list of object types to subscribe to",
 )
 @click.option(
     "--prefix",
     "-p",
     help="Topic prefix to use (e.g. swh.journal.indexed)",
 )
 @click.pass_context
 def journal_client_objects(ctx, stop_after_objects, object_type, prefix):
     """Listens for new objects from the SWH Journal, and schedules tasks to run
     relevant indexers (currently, origin and origin_visit) on these new objects.

     """
     import functools

     from swh.journal.client import get_journal_client
     from swh.storage import get_storage

     from . import get_search
     from .journal_client import process_journal_objects

     config = ctx.obj["config"]
     journal_cfg = config["journal"]

     journal_cfg["object_types"] = object_type or journal_cfg.get("object_types", [])
     journal_cfg["prefix"] = prefix or journal_cfg.get("prefix")
     journal_cfg["stop_after_objects"] = stop_after_objects or journal_cfg.get(
         "stop_after_objects"
     )

     if len(journal_cfg["object_types"]) == 0:
         raise ValueError("'object_types' must be specified by cli or configuration")

     if journal_cfg["prefix"] is None:
         raise ValueError("'prefix' must be specified by cli or configuration")

     client = get_journal_client(cls="kafka", **journal_cfg,)
     search = get_search(**config["search"])
     storage = get_storage(**config["storage"])

     worker_fn = functools.partial(
         process_journal_objects, search=search, storage=storage
     )
     nb_messages = 0
     try:
         nb_messages = client.process(worker_fn)
         print("Processed %d messages." % nb_messages)
     except KeyboardInterrupt:
         ctx.exit(0)
     else:
         print("Done.")
     finally:
         client.close()
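
Reviewer note: a minimal sketch of talking to a server started with the new
`rpc-serve` command. It assumes the usual Software Heritage factory
convention, where a `cls="remote"` backend proxies calls over HTTP (the same
way the CLI above calls `get_search(**config["search"])` with a flat config);
the `origin_search` call and its `url_pattern` argument are assumptions based
on the swh.search interface, not something this patch adds.

    # Sketch only: requires a locally running `swh search rpc-serve`.
    from swh.search import get_search

    # "remote" follows the backend-factory convention used across swh
    # packages; the URL mirrors the host/port defaults declared above.
    search = get_search(cls="remote", url="http://127.0.0.1:5010/")

    # origin_search/url_pattern are assumed from the swh.search interface;
    # the result is expected to be a paged result with a .results list.
    page = search.origin_search(url_pattern="python")
    print(page.results)
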
"]" ): # array return [self._get_value(child) for child in node.children if child.is_named] start = node.start_point[1] end = node.end_point[1] value = self.query[start:end] if len(value) > 1 and ( (value[0] == "'" and value[-1] == "'") or (value[0] and value[-1] == '"') ): return unescape(value[1:-1]) if node.type in ["number", "numberVal"]: return int(value) return unescape(value) def _parse_filter(self, filter): if filter.type == "boundedListFilter": filter = filter.children[0] children = filter.children assert len(children) == 3 category = filter.type name, op, value = [self._get_value(child) for child in children] if category == "patternFilter": if name == "origin": return { "multi_match": { "query": value, "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } } elif name == "metadata": return { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": value, # Makes it so that the "foo bar" query returns # documents which contain "foo" in a field and "bar" # in a different field "type": "cross_fields", # All keywords must be found in a document for it to # be considered a match. # TODO: allow missing keywords? "operator": "and", # Searches on all fields of the intrinsic_metadata dict, # recursively. "fields": ["intrinsic_metadata.*"], # date{Created,Modified,Published} are of type date "lenient": True, } }, } } if category == "booleanFilter": if name == "visited": return {"term": {"has_visits": value == "true"}} if category == "numericFilter": if name == "visits": if op in ["=", "!="]: return { "bool": { ("must" if op == "=" else "must_not"): [ {"range": {"nb_visits": {"gte": value, "lte": value}}} ] } } else: return { "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}} } if category == "visitTypeFilter": if name == "visit_type": return {"terms": {"visit_types": value}} if category == "unboundedListFilter": value_array = value if name == "keyword": return { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": " ".join(value_array), "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), # "^2" boosts an origin's score by 2x # if it the queried keywords are # found in its intrinsic_metadata.keywords ], } }, } } elif name in ["language", "license"]: name_mapping = { "language": "programming_languages", "license": "licenses", } name = name_mapping[name] return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "should": [ {"match": {get_expansion(name, "."): val}} for val in value_array ], } }, } } if category == "dateFilter": if name in ["created", "modified", "published"]: if op in ["=", "!="]: return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { ("must" if op == "=" else "must_not"): [ { "range": { get_expansion(f"date_{name}", "."): { "gte": value, "lte": value, } } } ], } }, } } return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "must": [ { "range": { get_expansion(f"date_{name}", "."): { self.RANGE_OPERATOR_MAP[op]: value, } } } ], } }, } } else: if op in ["=", "!="]: return { "bool": { ("must" if op == "=" else "must_not"): [ { "range": { f"{name}_date": {"gte": value, "lte": value,} } } ], } } return { "range": { f"{name}_date": { self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"), } } } if category == "sortBy": return value if category == "limit": return value raise Exception(f"Unknown filter {category}.{name}")