Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/setup.py b/setup.py
index bf3108d..a8d4c08 100755
--- a/setup.py
+++ b/setup.py
@@ -1,182 +1,185 @@
#!/usr/bin/env python3
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from distutils.cmd import Command
from distutils.command.build import build
from io import open
import os
import shutil
import subprocess
from setuptools import find_packages, setup
+from setuptools.command.develop import develop
from setuptools.command.sdist import sdist
here = os.path.abspath(os.path.dirname(__file__))
# Get the long description from the README file
with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
long_description = f.read()
def parse_requirements(name=None):
    """Read a pip requirements file and return its entries as a list.

    Looks for ``requirements.txt`` in the current directory, or
    ``requirements-<name>.txt`` when *name* is given.  Blank lines and
    ``#`` comment lines are skipped.  A missing file yields ``[]``.
    """
    reqf = "requirements-%s.txt" % name if name else "requirements.txt"

    if not os.path.exists(reqf):
        return []

    with open(reqf) as f:
        stripped = (raw.strip() for raw in f)
        return [req for req in stripped if req and not req.startswith("#")]
# Path of the yarn executable to invoke; overridable through the YARN
# environment variable.
yarn = os.environ.get("YARN", "yarn")
class TSCommand(Command):
    """Base class for the tree-sitter related commands below.

    Provides the no-op option plumbing that every distutils ``Command``
    subclass is required to implement.
    """

    # None of the ts_* commands accept command-line options.
    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass
class TSInstallCommand(TSCommand):
    description = "Installs node_modules related to query language"

    def run(self):
        # Fetch the JS dependencies with yarn; check=True aborts the
        # build immediately if the install fails.
        subprocess.run([yarn, "install"], check=True)
class TSBuildSoCommand(TSCommand):
description = "Builds swh_ql.so"
def initialize_options(self):
self.build_lib = None
super().initialize_options()
def finalize_options(self):
self.set_undefined_options("build", ("build_lib", "build_lib"))
super().finalize_options()
def run(self):
self.run_command("ts_install")
ql_dir = os.path.join(self.build_lib, "swh/search/query_language")
if not os.path.exists(os.path.join(ql_dir, "src/parser.c")):
- generate_parser(ql_dir)
+ generate_parser(ql_dir, copy_tree=True)
static_dir = os.path.join(self.build_lib, "swh/search/static")
os.makedirs(static_dir, exist_ok=True)
# This import cannot be toplevel, as setuptools installs it after the script
# starts running
from tree_sitter import Language
Language.build_library(os.path.join(static_dir, "swh_ql.so"), [ql_dir])
print("swh_ql.so file generated")
-class TSBuildWasmCommand(TSCommand):
- description = "Builds swh_ql.wasm"
-
- def run(self):
- subprocess.run([yarn, "build-wasm"], check=True)
- print("swh_ql.wasm file generated")
-
-
class TSBuildCommand(TSCommand):
    # Fixed stale description: the ts_build_wasm step was removed, so this
    # command now only builds the shared object via ts_build_so.
    description = "Builds swh_ql.so"

    def run(self):
        # Delegate entirely to the .so build command.
        self.run_command("ts_build_so")
class custom_build(build):
    """Standard build, followed by generation of the tree-sitter artifacts."""

    def run(self):
        super().run()

        if not self.dry_run:
            # A --dry-run build must not produce files, so only run the
            # parser/shared-object build for a real build.
            self.run_command("ts_build")
class custom_sdist(sdist):
def make_release_tree(self, base_dir, files):
super().make_release_tree(base_dir, files)
dist_ql_path = os.path.join(base_dir, "swh/search/query_language")
if not self.dry_run:
self.run_command("ts_install")
- generate_parser(dist_ql_path)
+ generate_parser(dist_ql_path, copy_tree=True)
+
+
+class custom_develop(develop):
+ def run(self):
+ super().run()
+
+ if not self.dry_run:
+ generate_parser("swh/search/query_language", copy_tree=False)
-def generate_parser(dest_path):
- # FIXME: setuptools should copy this itself...
- print("Copying parser files")
- if os.path.exists(dest_path):
- shutil.rmtree(dest_path)
- shutil.copytree("swh/search/query_language", dest_path)
+def generate_parser(dest_path, copy_tree):
+ if copy_tree:
+ # FIXME: setuptools should copy this itself...
+ print("Copying parser files")
+ if os.path.exists(dest_path):
+ shutil.rmtree(dest_path)
+ shutil.copytree("swh/search/query_language", dest_path)
print("Getting path")
path = subprocess.check_output([yarn, "bin"]).decode().strip()
env = {**os.environ, "PATH": os.pathsep.join([path, os.environ["PATH"]])}
print("Generating")
subprocess.run(["tree-sitter", "generate", "--no-bindings"], cwd=dest_path, env=env)
setup(
name="swh.search",
description="Software Heritage search service",
long_description=long_description,
long_description_content_type="text/markdown",
python_requires=">=3.7",
author="Software Heritage developers",
author_email="swh-devel@inria.fr",
url="https://forge.softwareheritage.org/diffusion/DSEA",
packages=find_packages(), # packages's modules
install_requires=parse_requirements() + parse_requirements("swh"),
tests_require=parse_requirements("test"),
entry_points="""
[swh.cli.subcommands]
search=swh.search.cli
""",
setup_requires=["setuptools-scm", "tree-sitter==0.19.0"],
use_scm_version=True,
extras_require={"testing": parse_requirements("test")},
include_package_data=True,
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Development Status :: 3 - Alpha",
],
project_urls={
"Bug Reports": "https://forge.softwareheritage.org/maniphest",
"Funding": "https://www.softwareheritage.org/donate",
"Source": "https://forge.softwareheritage.org/source/swh-search",
"Documentation": "https://docs.softwareheritage.org/devel/swh-search/",
},
cmdclass={
"build": custom_build,
"sdist": custom_sdist,
+ "develop": custom_develop,
"ts_install": TSInstallCommand,
"ts_build_so": TSBuildSoCommand,
"ts_build": TSBuildCommand,
},
zip_safe=False,
)
diff --git a/swh/search/translator.py b/swh/search/translator.py
index 03c6344..4229bde 100644
--- a/swh/search/translator.py
+++ b/swh/search/translator.py
@@ -1,301 +1,307 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
import os
+import tempfile
from pkg_resources import resource_filename
from tree_sitter import Language, Parser
from swh.search.utils import get_expansion, unescape
+logger = logging.getLogger(__name__)
+
class Translator:
RANGE_OPERATOR_MAP = {
">": "gt",
"<": "lt",
">=": "gte",
"<=": "lte",
}
def __init__(self):
- ql_rel_paths = [
- "static/swh_ql.so", # installed
- "../../query_language/swh_ql.so", # development
- ]
- for ql_rel_path in ql_rel_paths:
- ql_path = resource_filename("swh.search", ql_rel_path)
- if os.path.exists(ql_path):
- break
- else:
- assert False, "swh_ql.so was not found in any of the expected paths"
+ ql_path = resource_filename("swh.search", "static/swh_ql.so")
+ if not os.path.exists(ql_path):
+ logging.info("%s does not exist, building in temporary directory", ql_path)
+ self._build_dir = tempfile.TemporaryDirectory(prefix="swh.search-build")
+ source_path = resource_filename("swh.search", "query_language")
+ ql_path = os.path.join(self._build_dir.name, "swh_ql.so")
+ Language.build_library(ql_path, [source_path])
search_ql = Language(ql_path, "swh_search_ql")
self.parser = Parser()
self.parser.set_language(search_ql)
self.query = ""
def parse_query(self, query):
    """Parse *query* with the swh_search_ql grammar and translate it.

    Stores the raw query text and its parse-tree root on the instance,
    then hands the root to :meth:`_traverse` for translation.

    Raises:
        Exception: if the parse tree contains an error node.
    """
    self.query = query
    root = self.parser.parse(query.encode("utf8")).root_node
    self.query_node = root

    if root.has_error:
        raise Exception("Invalid query")

    return self._traverse(root)
def _traverse(self, node):
if len(node.children) == 3 and node.children[1].type == "filters":
# filters => ( filters )
return self._traverse(node.children[1]) # Go past the () brackets
if node.type == "query":
result = {}
for child in node.children:
# query => filters sort_by limit
result[child.type] = self._traverse(child)
return result
if node.type == "filters":
if len(node.children) == 1:
# query => filters
# filters => filters
# filters => filter
# Current node is just a wrapper, so go one level deep
return self._traverse(node.children[0])
if len(node.children) == 3:
# filters => filters conj_op filters
filters1 = self._traverse(node.children[0])
conj_op = self._get_value(node.children[1])
filters2 = self._traverse(node.children[2])
if conj_op == "and":
# "must" is equivalent to "AND"
return {"bool": {"must": [filters1, filters2]}}
if conj_op == "or":
# "should" is equivalent to "OR"
return {"bool": {"should": [filters1, filters2]}}
if node.type == "filter":
filter_category = node.children[0]
return self._parse_filter(filter_category)
if node.type == "sortBy":
return self._parse_filter(node)
if node.type == "limit":
return self._parse_filter(node)
return Exception(
f"Unknown node type ({node.type}) "
f"or unexpected number of children ({node.children})"
)
def _get_value(self, node):
if (
len(node.children) > 0
and node.children[0].type == "["
and node.children[-1].type == "]"
):
# array
return [self._get_value(child) for child in node.children if child.is_named]
start = node.start_point[1]
end = node.end_point[1]
value = self.query[start:end]
if len(value) > 1 and (
(value[0] == "'" and value[-1] == "'") or (value[0] and value[-1] == '"')
):
return unescape(value[1:-1])
if node.type in ["number", "numberVal"]:
return int(value)
return unescape(value)
def _parse_filter(self, filter):
    """Translate a single filter node into an Elasticsearch query dict.

    Expects *filter* (note: shadows the builtin) to have exactly three
    children — name, operator, value — and dispatches on the node's
    ``type`` (its grammar category).  ``sortBy`` and ``limit`` nodes are
    passed through as their bare value.

    Raises:
        Exception: for an unknown category/name combination.
    """
    # boundedListFilter is a wrapper: the real filter is its first child.
    if filter.type == "boundedListFilter":
        filter = filter.children[0]

    children = filter.children
    assert len(children) == 3

    category = filter.type
    name, op, value = [self._get_value(child) for child in children]

    if category == "patternFilter":
        if name == "origin":
            # Prefix search over the origin URL's search-as-you-type fields.
            return {
                "multi_match": {
                    "query": value,
                    "type": "bool_prefix",
                    "operator": "and",
                    "fields": [
                        "url.as_you_type",
                        "url.as_you_type._2gram",
                        "url.as_you_type._3gram",
                    ],
                }
            }
        elif name == "metadata":
            return {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {
                        "multi_match": {
                            "query": value,
                            # Makes it so that the "foo bar" query returns
                            # documents which contain "foo" in a field and "bar"
                            # in a different field
                            "type": "cross_fields",
                            # All keywords must be found in a document for it to
                            # be considered a match.
                            # TODO: allow missing keywords?
                            "operator": "and",
                            # Searches on all fields of the intrinsic_metadata dict,
                            # recursively.
                            "fields": ["intrinsic_metadata.*"],
                            # date{Created,Modified,Published} are of type date
                            "lenient": True,
                        }
                    },
                }
            }

    if category == "booleanFilter":
        if name == "visited":
            # value is the raw token, hence the string comparison.
            return {"term": {"has_visits": value == "true"}}

    if category == "numericFilter":
        if name == "visits":
            if op in ["=", "!="]:
                # Equality is expressed as the range [value, value].
                return {
                    "bool": {
                        ("must" if op == "=" else "must_not"): [
                            {"range": {"nb_visits": {"gte": value, "lte": value}}}
                        ]
                    }
                }
            else:
                return {
                    "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}}
                }

    if category == "visitTypeFilter":
        if name == "visit_type":
            return {"terms": {"visit_types": value}}

    if category == "unboundedListFilter":
        value_array = value

        if name == "keyword":
            return {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {
                        "multi_match": {
                            "query": " ".join(value_array),
                            "fields": [
                                get_expansion("keywords", ".") + "^2",
                                get_expansion("descriptions", "."),
                                # "^2" boosts an origin's score by 2x
                                # if the queried keywords are
                                # found in its intrinsic_metadata.keywords
                            ],
                        }
                    },
                }
            }
        elif name in ["language", "license"]:
            # Map the user-facing filter name to the index field name.
            name_mapping = {
                "language": "programming_languages",
                "license": "licenses",
            }
            name = name_mapping[name]

            # Any one of the listed values may match ("should" == OR).
            return {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {get_expansion(name, "."): val}}
                                for val in value_array
                            ],
                        }
                    },
                }
            }

    if category == "dateFilter":
        if name in ["created", "modified", "published"]:
            # These dates live inside the nested intrinsic_metadata doc.
            if op in ["=", "!="]:
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                ("must" if op == "=" else "must_not"): [
                                    {
                                        "range": {
                                            get_expansion(f"date_{name}", "."): {
                                                "gte": value,
                                                "lte": value,
                                            }
                                        }
                                    }
                                ],
                            }
                        },
                    }
                }

            return {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {
                        "bool": {
                            "must": [
                                {
                                    "range": {
                                        get_expansion(f"date_{name}", "."): {
                                            self.RANGE_OPERATOR_MAP[op]: value,
                                        }
                                    }
                                }
                            ],
                        }
                    },
                }
            }
        else:
            # Other dates (e.g. visit dates) are top-level "<name>_date"
            # fields on the document.
            if op in ["=", "!="]:
                return {
                    "bool": {
                        ("must" if op == "=" else "must_not"): [
                            {
                                "range": {
                                    f"{name}_date": {"gte": value, "lte": value,}
                                }
                            }
                        ],
                    }
                }
            # NOTE(review): the "Z" -> "+00:00" rewrite is only applied on
            # this branch, not on the equality branch above — confirm
            # whether that asymmetry is intended.
            return {
                "range": {
                    f"{name}_date": {
                        self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"),
                    }
                }
            }

    if category == "sortBy":
        return value

    if category == "limit":
        return value

    raise Exception(f"Unknown filter {category}.{name}")

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 12:40 PM (2 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3236474

Event Timeline