diff --git a/setup.py b/setup.py index bf3108d..a8d4c08 100755 --- a/setup.py +++ b/setup.py @@ -1,182 +1,185 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.cmd import Command from distutils.command.build import build from io import open import os import shutil import subprocess from setuptools import find_packages, setup +from setuptools.command.develop import develop from setuptools.command.sdist import sdist here = os.path.abspath(os.path.dirname(__file__)) # Get the long description from the README file with open(os.path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not os.path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements yarn = os.environ.get("YARN", "yarn") class TSCommand(Command): user_options = [] def initialize_options(self): pass def finalize_options(self): pass class TSInstallCommand(TSCommand): description = "Installs node_modules related to query language" def run(self): subprocess.run([yarn, "install"], check=True) class TSBuildSoCommand(TSCommand): description = "Builds swh_ql.so" def initialize_options(self): self.build_lib = None super().initialize_options() def finalize_options(self): self.set_undefined_options("build", ("build_lib", "build_lib")) super().finalize_options() def run(self): self.run_command("ts_install") ql_dir = os.path.join(self.build_lib, "swh/search/query_language") if not os.path.exists(os.path.join(ql_dir, "src/parser.c")): - generate_parser(ql_dir) + generate_parser(ql_dir, copy_tree=True) static_dir = os.path.join(self.build_lib, "swh/search/static") os.makedirs(static_dir, exist_ok=True) # This import cannot be toplevel, as setuptools installs it after the script # starts running from tree_sitter import Language Language.build_library(os.path.join(static_dir, "swh_ql.so"), [ql_dir]) print("swh_ql.so file generated") -class TSBuildWasmCommand(TSCommand): - description = "Builds swh_ql.wasm" - - def run(self): - subprocess.run([yarn, "build-wasm"], check=True) - print("swh_ql.wasm file generated") - - class TSBuildCommand(TSCommand): description = "Builds swh_ql.so and swh_ql.wasm" def run(self): self.run_command("ts_build_so") class custom_build(build): def run(self): super().run() if not self.dry_run: self.run_command("ts_build") class custom_sdist(sdist): def make_release_tree(self, base_dir, files): super().make_release_tree(base_dir, files) dist_ql_path = os.path.join(base_dir, "swh/search/query_language") if not self.dry_run: self.run_command("ts_install") - generate_parser(dist_ql_path) + generate_parser(dist_ql_path, copy_tree=True) + + +class custom_develop(develop): + def run(self): + super().run() + + if not self.dry_run: + generate_parser("swh/search/query_language", copy_tree=False) -def generate_parser(dest_path): - # FIXME: setuptools should copy this itself... - print("Copying parser files") - if os.path.exists(dest_path): - shutil.rmtree(dest_path) - shutil.copytree("swh/search/query_language", dest_path) +def generate_parser(dest_path, copy_tree): + if copy_tree: + # FIXME: setuptools should copy this itself... + print("Copying parser files") + if os.path.exists(dest_path): + shutil.rmtree(dest_path) + shutil.copytree("swh/search/query_language", dest_path) print("Getting path") path = subprocess.check_output([yarn, "bin"]).decode().strip() env = {**os.environ, "PATH": os.pathsep.join([path, os.environ["PATH"]])} print("Generating") subprocess.run(["tree-sitter", "generate", "--no-bindings"], cwd=dest_path, env=env) setup( name="swh.search", description="Software Heritage search service", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DSEA", packages=find_packages(), # packages's modules install_requires=parse_requirements() + parse_requirements("swh"), tests_require=parse_requirements("test"), entry_points=""" [swh.cli.subcommands] search=swh.search.cli """, setup_requires=["setuptools-scm", "tree-sitter==0.19.0"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, include_package_data=True, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 3 - Alpha", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-search", "Documentation": "https://docs.softwareheritage.org/devel/swh-search/", }, cmdclass={ "build": custom_build, "sdist": custom_sdist, + "develop": custom_develop, "ts_install": TSInstallCommand, "ts_build_so": TSBuildSoCommand, "ts_build": TSBuildCommand, }, zip_safe=False, ) diff --git a/swh/search/translator.py b/swh/search/translator.py index 03c6344..4229bde 100644 --- a/swh/search/translator.py +++ b/swh/search/translator.py @@ -1,301 +1,307 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging import os +import tempfile from pkg_resources import resource_filename from tree_sitter import Language, Parser from swh.search.utils import get_expansion, unescape +logger = logging.getLogger(__name__) + class Translator: RANGE_OPERATOR_MAP = { ">": "gt", "<": "lt", ">=": "gte", "<=": "lte", } def __init__(self): - ql_rel_paths = [ - "static/swh_ql.so", # installed - "../../query_language/swh_ql.so", # development - ] - for ql_rel_path in ql_rel_paths: - ql_path = resource_filename("swh.search", ql_rel_path) - if os.path.exists(ql_path): - break - else: - assert False, "swh_ql.so was not found in any of the expected paths" + ql_path = resource_filename("swh.search", "static/swh_ql.so") + if not os.path.exists(ql_path): + logging.info("%s does not exist, building in temporary directory", ql_path) + self._build_dir = tempfile.TemporaryDirectory(prefix="swh.search-build") + source_path = resource_filename("swh.search", "query_language") + ql_path = os.path.join(self._build_dir.name, "swh_ql.so") + Language.build_library(ql_path, [source_path]) search_ql = Language(ql_path, "swh_search_ql") self.parser = Parser() self.parser.set_language(search_ql) self.query = "" def parse_query(self, query): self.query = query tree = self.parser.parse(query.encode("utf8")) self.query_node = tree.root_node if self.query_node.has_error: raise Exception("Invalid query") return self._traverse(self.query_node) def _traverse(self, node): if len(node.children) == 3 and node.children[1].type == "filters": # filters => ( filters ) return self._traverse(node.children[1]) # Go past the () brackets if node.type == "query": result = {} for child in node.children: # query => filters sort_by limit result[child.type] = self._traverse(child) return result if node.type == "filters": if len(node.children) == 1: # query => filters # filters => filters # filters => filter # Current node is just a wrapper, so go one level deep return self._traverse(node.children[0]) if len(node.children) == 3: # filters => filters conj_op filters filters1 = self._traverse(node.children[0]) conj_op = self._get_value(node.children[1]) filters2 = self._traverse(node.children[2]) if conj_op == "and": # "must" is equivalent to "AND" return {"bool": {"must": [filters1, filters2]}} if conj_op == "or": # "should" is equivalent to "OR" return {"bool": {"should": [filters1, filters2]}} if node.type == "filter": filter_category = node.children[0] return self._parse_filter(filter_category) if node.type == "sortBy": return self._parse_filter(node) if node.type == "limit": return self._parse_filter(node) return Exception( f"Unknown node type ({node.type}) " f"or unexpected number of children ({node.children})" ) def _get_value(self, node): if ( len(node.children) > 0 and node.children[0].type == "[" and node.children[-1].type == "]" ): # array return [self._get_value(child) for child in node.children if child.is_named] start = node.start_point[1] end = node.end_point[1] value = self.query[start:end] if len(value) > 1 and ( (value[0] == "'" and value[-1] == "'") or (value[0] and value[-1] == '"') ): return unescape(value[1:-1]) if node.type in ["number", "numberVal"]: return int(value) return unescape(value) def _parse_filter(self, filter): if filter.type == "boundedListFilter": filter = filter.children[0] children = filter.children assert len(children) == 3 category = filter.type name, op, value = [self._get_value(child) for child in children] if category == "patternFilter": if name == "origin": return { "multi_match": { "query": value, "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } } elif name == "metadata": return { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": value, # Makes it so that the "foo bar" query returns # documents which contain "foo" in a field and "bar" # in a different field "type": "cross_fields", # All keywords must be found in a document for it to # be considered a match. # TODO: allow missing keywords? "operator": "and", # Searches on all fields of the intrinsic_metadata dict, # recursively. "fields": ["intrinsic_metadata.*"], # date{Created,Modified,Published} are of type date "lenient": True, } }, } } if category == "booleanFilter": if name == "visited": return {"term": {"has_visits": value == "true"}} if category == "numericFilter": if name == "visits": if op in ["=", "!="]: return { "bool": { ("must" if op == "=" else "must_not"): [ {"range": {"nb_visits": {"gte": value, "lte": value}}} ] } } else: return { "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}} } if category == "visitTypeFilter": if name == "visit_type": return {"terms": {"visit_types": value}} if category == "unboundedListFilter": value_array = value if name == "keyword": return { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": " ".join(value_array), "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), # "^2" boosts an origin's score by 2x # if it the queried keywords are # found in its intrinsic_metadata.keywords ], } }, } } elif name in ["language", "license"]: name_mapping = { "language": "programming_languages", "license": "licenses", } name = name_mapping[name] return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "should": [ {"match": {get_expansion(name, "."): val}} for val in value_array ], } }, } } if category == "dateFilter": if name in ["created", "modified", "published"]: if op in ["=", "!="]: return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { ("must" if op == "=" else "must_not"): [ { "range": { get_expansion(f"date_{name}", "."): { "gte": value, "lte": value, } } } ], } }, } } return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "must": [ { "range": { get_expansion(f"date_{name}", "."): { self.RANGE_OPERATOR_MAP[op]: value, } } } ], } }, } } else: if op in ["=", "!="]: return { "bool": { ("must" if op == "=" else "must_not"): [ { "range": { f"{name}_date": {"gte": value, "lte": value,} } } ], } } return { "range": { f"{name}_date": { self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"), } } } if category == "sortBy": return value if category == "limit": return value raise Exception(f"Unknown filter {category}.{name}")