#!/usr/bin/env python3
# Copyright (C) 2015-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# File: setup.py

from io import open
from os import environ, makedirs, path, system
import shutil

# ``setuptools.Command`` replaces ``distutils.cmd.Command``: distutils is no
# longer part of the standard library on recent Python versions.
from setuptools import Command, find_packages, setup
from setuptools.command.build_py import build_py
from setuptools.command.sdist import sdist

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, "README.md"), encoding="utf-8") as f:
    long_description = f.read()


def parse_requirements(name=None):
    """Read requirement specifiers from a requirements file.

    Args:
        name: optional suffix; when given, ``requirements-<name>.txt`` is
            read, otherwise ``requirements.txt``.

    Returns:
        The stripped, non-empty, non-comment lines of the file, in order.
        An empty list when the file does not exist.
    """
    reqf = "requirements-%s.txt" % name if name else "requirements.txt"
    if not path.exists(reqf):
        return []

    with open(reqf) as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]


# Executable used to run the query-language build scripts; can be overridden
# through the YARN environment variable.
yarn = environ.get("YARN", "yarn")

# Build products copied into the package's ``static`` directory.
_STATIC_ARTIFACTS = ("swh_ql.so", "swh_ql.wasm")


def _export_static_files(package_root):
    """Copy the built parser artifacts to ``<package_root>/swh/search/static``.

    The directory (and any missing parent) is created when needed, so the
    function can be re-run on an existing build tree.

    Raises:
        OSError: if an artifact is missing from ``query_language/`` or cannot
            be copied. This surfaces a failed build instead of silently
            producing a package without the parser.
    """
    print("static files generated. copying them to package dir", flush=True)
    static_dir = path.join(package_root, "swh", "search", "static")
    makedirs(static_dir, exist_ok=True)
    for artifact in _STATIC_ARTIFACTS:
        shutil.copy(
            path.join("query_language", artifact), path.join(static_dir, artifact)
        )


class TSCommand(Command):
    """Base class for the option-less commands wrapping the yarn scripts."""

    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass


class TSInstallCommand(TSCommand):
    description = "Installs node_modules related to query language"

    def run(self):
        system(f"{yarn} install")


class TSGenerateCommand(TSCommand):
    description = "Generates parser related files from grammar.js"

    def run(self):
        system(f"{yarn} generate")


class TSBuildSoCommand(TSCommand):
    description = "Builds swh_ql.so"

    def run(self):
        system(f"{yarn} build-so && echo 'swh_ql.so file generated'")


class TSBuildWasmCommand(TSCommand):
    description = "Builds swh_ql.wasm"

    def run(self):
        system(f"{yarn} build-wasm && echo 'swh_ql.wasm file generated'")


class TSBuildCommand(TSCommand):
    description = "Builds swh_ql.so and swh_ql.wasm"

    def run(self):
        self.run_command("ts_build_so")
        self.run_command("ts_build_wasm")


class TSBuildExportCommand(TSCommand):
    description = "Builds swh_ql.so and swh_ql.wasm and exports them to static/"

    def initialize_options(self):
        self.build_lib = None
        super().initialize_options()

    def finalize_options(self):
        # Inherit the output directory from the "build" command.
        self.set_undefined_options("build", ("build_lib", "build_lib"))
        super().finalize_options()

    def run(self):
        self.run_command("ts_install")
        self.run_command("ts_build")
        _export_static_files(self.build_lib)


class custom_build(build_py):
    """``build_py`` that also builds and exports the parser artifacts."""

    def run(self):
        super().run()

        if not self.dry_run:
            self.run_command("ts_build_export")


class custom_sdist(sdist):
    """``sdist`` that adds the built parser artifacts to the release tree."""

    def make_release_tree(self, base_dir, files):
        super().make_release_tree(base_dir, files)
        # TODO: build the .c file and .wasm but not .so, because it's architecture-
        # dependent, and shouldn't be in a sdist (aka *source* distribution)
        if not self.dry_run:
            self.run_command("ts_install")
            self.run_command("ts_build")
            _export_static_files(base_dir)


setup(
    name="swh.search",
    description="Software Heritage search service",
    long_description=long_description,
    long_description_content_type="text/markdown",
    python_requires=">=3.7",
    author="Software Heritage developers",
    author_email="swh-devel@inria.fr",
    url="https://forge.softwareheritage.org/diffusion/DSEA",
    packages=find_packages(),  # packages's modules
    install_requires=parse_requirements() + parse_requirements("swh"),
    tests_require=parse_requirements("test"),
    entry_points="""
        [swh.cli.subcommands]
        search=swh.search.cli
    """,
    setup_requires=["setuptools-scm"],
    use_scm_version=True,
    extras_require={"testing": parse_requirements("test")},
    include_package_data=True,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Operating System :: OS Independent",
        "Development Status :: 3 - Alpha",
    ],
    project_urls={
        "Bug Reports": "https://forge.softwareheritage.org/maniphest",
        "Funding": "https://www.softwareheritage.org/donate",
        "Source": "https://forge.softwareheritage.org/source/swh-search",
        "Documentation": "https://docs.softwareheritage.org/devel/swh-search/",
    },
    cmdclass={
        "build_py": custom_build,
        "sdist": custom_sdist,
        "ts_install": TSInstallCommand,
        "ts_generate": TSGenerateCommand,
        "ts_build_so": TSBuildSoCommand,
        "ts_build_wasm": TSBuildWasmCommand,
        "ts_build": TSBuildCommand,
        "ts_build_export": TSBuildExportCommand,
    },
    zip_safe=False,
)
# File: swh/search/translator.py

import os


class Translator:
    """Translate a search query string into a nested query dict.

    The query is parsed with the tree-sitter grammar compiled into
    ``swh_ql.so`` (language name ``swh_search_ql``); the resulting syntax
    tree is then walked and every filter is converted into a dict fragment.
    """

    # Comparison operators of the query language mapped to the keys used in
    # "range" clauses.
    RANGE_OPERATOR_MAP = {
        ">": "gt",
        "<": "lt",
        ">=": "gte",
        "<=": "lte",
    }

    def __init__(self):
        """Locate the compiled grammar and set up the parser.

        Raises:
            AssertionError: if ``swh_ql.so`` is in none of the known places.
        """
        # Imported at the point of use so that the pure-Python parts of this
        # class stay importable when the native parser stack is not installed.
        from pkg_resources import resource_filename
        from tree_sitter import Language, Parser

        # NOTE(review): setup.py copies the library from
        # query_language/swh_ql.so (no static/ component) -- confirm that the
        # development path below is where the build really writes the file.
        ql_rel_paths = [
            "static/swh_ql.so",  # installed
            "../../query_language/static/swh_ql.so",  # development
        ]
        candidates = (resource_filename("swh.search", rel) for rel in ql_rel_paths)
        ql_path = next((p for p in candidates if os.path.exists(p)), None)
        if ql_path is None:
            # Explicit raise (not ``assert``) so the check survives ``python -O``;
            # the exception type is kept for backward compatibility.
            raise AssertionError("swh_ql.so was not found in any of the expected paths")

        search_ql = Language(ql_path, "swh_search_ql")

        self.parser = Parser()
        self.parser.set_language(search_ql)
        self.query = ""

    def parse_query(self, query):
        """Parse ``query`` and return its translation.

        Args:
            query: the query string.

        Returns:
            A dict keyed by the type of each child of the root ``query`` node,
            each value being the translation of that child.

        Raises:
            Exception: if the syntax tree contains an error ("Invalid query"),
                or if a node cannot be translated.
        """
        self.query = query
        tree = self.parser.parse(query.encode("utf8"))
        self.query_node = tree.root_node

        if self.query_node.has_error:
            raise Exception("Invalid query")

        return self._traverse(self.query_node)

    def _traverse(self, node):
        """Recursively translate a syntax-tree node.

        Raises:
            Exception: for an unknown node type or an unexpected number of
                children.
        """
        if len(node.children) == 3 and node.children[1].type == "filters":
            # filters => ( filters )
            return self._traverse(node.children[1])  # Go past the () brackets

        if node.type == "query":
            # query => filters sort_by limit
            return {child.type: self._traverse(child) for child in node.children}

        if node.type == "filters":
            if len(node.children) == 1:
                # query => filters
                # filters => filters
                # filters => filter
                # Current node is just a wrapper, so go one level deep
                return self._traverse(node.children[0])

            if len(node.children) == 3:
                # filters => filters conj_op filters
                filters1 = self._traverse(node.children[0])
                conj_op = self._get_value(node.children[1])
                filters2 = self._traverse(node.children[2])

                if conj_op == "and":
                    return {"bool": {"must": [filters1, filters2]}}
                if conj_op == "or":
                    return {"bool": {"should": [filters1, filters2]}}

        if node.type == "filter":
            filter_category = node.children[0]
            return self._parse_filter(filter_category)

        if node.type in ("sortBy", "limit"):
            return self._parse_filter(node)

        # This used to *return* the exception object, which callers would then
        # have embedded in the translated query as if it were a result.
        raise Exception(
            f"Unknown node type ({node.type}) "
            f"or unexpected number of children ({node.children})"
        )

    def _get_value(self, node):
        """Return the Python value represented by ``node``.

        * a node delimited by ``[`` and ``]`` yields the list of the values
          of its named children;
        * text wrapped in a matching pair of single or double quotes is
          returned without the quotes;
        * ``number`` / ``numberVal`` nodes are converted to ``int``;
        * anything else is returned as the raw query text of the node.
        """
        if (
            len(node.children) > 0
            and node.children[0].type == "["
            and node.children[-1].type == "]"
        ):
            # array
            return [self._get_value(child) for child in node.children if child.is_named]

        # Slice the UTF-8 bytes that were handed to the parser: byte offsets
        # stay correct for non-ASCII and multi-line queries, whereas indexing
        # the str with point columns does not.
        raw = self.query.encode("utf8")[node.start_byte : node.end_byte]
        value = raw.decode("utf8")

        # Strip the quotes only when the same quote character opens and
        # closes the text.
        if len(value) > 1 and value[0] == value[-1] and value[0] in ("'", '"'):
            return value[1:-1]

        if node.type in ["number", "numberVal"]:
            return int(value)

        return value

    @staticmethod
    def _expand(field):
        """Return ``get_expansion(field, ".")`` from ``swh.search.utils``."""
        from swh.search.utils import get_expansion

        return get_expansion(field, ".")

    def _parse_filter(self, filter):
        """Translate one filter node (or a ``sortBy`` / ``limit`` node).

        The node must have exactly three children, read as
        ``(name, operator, value)``. A ``boundedListFilter`` is first
        replaced by its first child.

        Raises:
            Exception: if the category / name combination is not handled.
        """
        if filter.type == "boundedListFilter":
            filter = filter.children[0]

        children = filter.children
        assert len(children) == 3

        category = filter.type
        name, op, value = [self._get_value(child) for child in children]

        if category == "patternFilter":
            if name == "origin":
                return {
                    "multi_match": {
                        "query": value,
                        "type": "bool_prefix",
                        "operator": "and",
                        "fields": [
                            "url.as_you_type",
                            "url.as_you_type._2gram",
                            "url.as_you_type._3gram",
                        ],
                    }
                }
            elif name == "metadata":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": value,
                                "type": "cross_fields",
                                "operator": "and",
                                "fields": ["intrinsic_metadata.*"],
                                "lenient": True,
                            }
                        },
                    }
                }

        if category == "booleanFilter":
            if name == "visited":
                return {"term": {"has_visits": value == "true"}}

        if category == "numericFilter":
            if name == "visits":
                if op in ["=", "!="]:
                    # (In)equality is expressed as a degenerate range.
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {"range": {"nb_visits": {"gte": value, "lte": value}}}
                            ]
                        }
                    }
                else:
                    return {
                        "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}}
                    }

        if category == "visitTypeFilter":
            if name == "visit_type":
                return {"terms": {"visit_types": value}}

        if category == "unboundedListFilter":
            value_array = value

            if name == "keyword":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": " ".join(value_array),
                                "fields": [
                                    self._expand("keywords") + "^2",
                                    self._expand("descriptions"),
                                ],
                            }
                        },
                    }
                }
            elif name in ["language", "license"]:
                name_mapping = {
                    "language": "programming_languages",
                    "license": "licenses",
                }
                name = name_mapping[name]

                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                "should": [
                                    {"match": {self._expand(name): val}}
                                    for val in value_array
                                ],
                            }
                        },
                    }
                }

        if category == "dateFilter":
            if name in ["created", "modified", "published"]:
                # These dates are looked up under intrinsic_metadata.
                if op in ["=", "!="]:
                    return {
                        "nested": {
                            "path": "intrinsic_metadata",
                            "query": {
                                "bool": {
                                    ("must" if op == "=" else "must_not"): [
                                        {
                                            "range": {
                                                self._expand(f"date_{name}"): {
                                                    "gte": value,
                                                    "lte": value,
                                                }
                                            }
                                        }
                                    ],
                                }
                            },
                        }
                    }

                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            self._expand(f"date_{name}"): {
                                                self.RANGE_OPERATOR_MAP[op]: value,
                                            }
                                        }
                                    }
                                ],
                            }
                        },
                    }
                }
            else:
                if op in ["=", "!="]:
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {
                                    "range": {
                                        f"{name}_date": {"gte": value, "lte": value,}
                                    }
                                }
                            ],
                        }
                    }
                # NOTE(review): only this branch rewrites "Z" as "+00:00"; the
                # equality branch above passes the value through unchanged --
                # confirm that the difference is intended.
                return {
                    "range": {
                        f"{name}_date": {
                            self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"),
                        }
                    }
                }

        if category == "sortBy":
            return value

        if category == "limit":
            return value

        raise Exception(f"Unknown filter {category}.{name}")