Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9342359
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
18 KB
Subscribers
None
View Options
diff --git a/setup.py b/setup.py
index bf3108d..a8d4c08 100755
--- a/setup.py
+++ b/setup.py
@@ -1,182 +1,185 @@
#!/usr/bin/env python3
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from distutils.cmd import Command
from distutils.command.build import build
from io import open
import os
import shutil
import subprocess
from setuptools import find_packages, setup
+from setuptools.command.develop import develop
from setuptools.command.sdist import sdist
# Absolute path of the directory that contains this setup script.
here = os.path.abspath(os.path.dirname(__file__))

# Get the long description from the README file
with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
    # Passed to setup() below as the package's long description.
    long_description = f.read()
def parse_requirements(name=None):
    """Return the requirement lines listed in a requirements file.

    Args:
        name: optional suffix; when given, ``requirements-<name>.txt`` is
            read, otherwise ``requirements.txt``. The path is relative to
            the current working directory.

    Returns:
        The stripped, non-empty lines of the file that do not start with
        ``#``, in file order; an empty list if the file does not exist.
    """
    filename = "requirements-%s.txt" % name if name else "requirements.txt"

    # A missing file simply means "no requirements of that kind".
    if not os.path.exists(filename):
        return []

    with open(filename) as handle:
        stripped = (raw.strip() for raw in handle)
        # Blank lines and comment lines carry no requirement.
        return [entry for entry in stripped if entry and not entry.startswith("#")]
# Name/path of the yarn executable used by the commands below; it can be
# overridden through the YARN environment variable.
yarn = os.environ.get("YARN", "yarn")
class TSCommand(Command):
    """Common base of the ``ts_*`` commands of this script; takes no options."""

    # No command-line options are accepted.
    user_options = []

    def initialize_options(self):
        """Nothing to initialize: the command has no options."""
        return None

    def finalize_options(self):
        """Nothing to finalize: the command has no options."""
        return None
class TSInstallCommand(TSCommand):
    """Command that runs ``yarn install``."""

    description = "Installs node_modules related to query language"

    def run(self):
        """Invoke yarn; a non-zero exit status raises CalledProcessError."""
        install_cmd = [yarn, "install"]
        subprocess.run(install_cmd, check=True)
class TSBuildSoCommand(TSCommand):
    """Command that compiles the query-language grammar into ``swh_ql.so``."""

    description = "Builds swh_ql.so"

    def initialize_options(self):
        """Declare ``build_lib``; its value is resolved in finalize_options."""
        self.build_lib = None
        super().initialize_options()

    def finalize_options(self):
        """Take ``build_lib`` from the ``build`` command if it is still unset."""
        self.set_undefined_options("build", ("build_lib", "build_lib"))
        super().finalize_options()

    def run(self):
        """Install the JS dependencies, generate the parser if needed, and
        build the shared library under ``<build_lib>/swh/search/static``."""
        self.run_command("ts_install")

        grammar_dir = os.path.join(self.build_lib, "swh/search/query_language")
        parser_source = os.path.join(grammar_dir, "src/parser.c")
        # Only (re)generate the parser when its C source is not there yet.
        if not os.path.exists(parser_source):
            generate_parser(grammar_dir, copy_tree=True)

        output_dir = os.path.join(self.build_lib, "swh/search/static")
        os.makedirs(output_dir, exist_ok=True)
        library_path = os.path.join(output_dir, "swh_ql.so")

        # This import cannot be toplevel, as setuptools installs it after the script
        # starts running
        from tree_sitter import Language

        Language.build_library(library_path, [grammar_dir])
        print("swh_ql.so file generated")
-class TSBuildWasmCommand(TSCommand):
- description = "Builds swh_ql.wasm"
-
- def run(self):
- subprocess.run([yarn, "build-wasm"], check=True)
- print("swh_ql.wasm file generated")
-
-
class TSBuildCommand(TSCommand):
    """Umbrella build command; it delegates to ``ts_build_so``."""

    # The command only triggers ts_build_so, so the description must not
    # advertise a swh_ql.wasm artifact that is never produced.
    description = "Builds swh_ql.so"

    def run(self):
        """Run the ``ts_build_so`` command."""
        self.run_command("ts_build_so")
class custom_build(build):
    """``build`` command that additionally runs ``ts_build``."""

    def run(self):
        """Run the regular build, then ``ts_build`` unless in dry-run mode."""
        super().run()

        if self.dry_run:
            return
        self.run_command("ts_build")
class custom_sdist(sdist):
    """``sdist`` command that also generates the parser in the release tree."""

    def make_release_tree(self, base_dir, files):
        """Create the release tree, then (unless in dry-run mode) install the
        JS dependencies and generate the parser inside ``base_dir``."""
        super().make_release_tree(base_dir, files)

        if self.dry_run:
            return

        self.run_command("ts_install")
        release_ql_dir = os.path.join(base_dir, "swh/search/query_language")
        generate_parser(release_ql_dir, copy_tree=True)
+
+
class custom_develop(develop):
    """``develop`` command that also generates the parser in the source tree."""

    def run(self):
        """Run the regular develop step, then generate the parser in place
        (no copy of the grammar directory) unless in dry-run mode."""
        super().run()

        if self.dry_run:
            return
        generate_parser("swh/search/query_language", copy_tree=False)
def generate_parser(dest_path, copy_tree):
    """Run ``tree-sitter generate`` inside ``dest_path``.

    Args:
        dest_path: directory in which the parser is generated.
        copy_tree: when true, ``dest_path`` is first replaced by a fresh copy
            of ``swh/search/query_language`` (any existing content is removed).

    Raises:
        subprocess.CalledProcessError: if ``yarn bin`` or
            ``tree-sitter generate`` exits with a non-zero status.
    """
    if copy_tree:
        # FIXME: setuptools should copy this itself...
        print("Copying parser files")
        if os.path.exists(dest_path):
            shutil.rmtree(dest_path)
        shutil.copytree("swh/search/query_language", dest_path)

    print("Getting path")
    path = subprocess.check_output([yarn, "bin"]).decode().strip()
    # Put yarn's bin directory first on PATH for the tree-sitter invocation.
    env = {**os.environ, "PATH": os.pathsep.join([path, os.environ["PATH"]])}

    print("Generating")
    # check=True: a failed generation must abort the build instead of being
    # silently ignored, like the other subprocess calls in this script.
    subprocess.run(
        ["tree-sitter", "generate", "--no-bindings"],
        cwd=dest_path,
        env=env,
        check=True,
    )
setup(
    # --- Identity and descriptive metadata ---
    name="swh.search",
    description="Software Heritage search service",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Software Heritage developers",
    author_email="swh-devel@inria.fr",
    url="https://forge.softwareheritage.org/diffusion/DSEA",
    project_urls={
        "Bug Reports": "https://forge.softwareheritage.org/maniphest",
        "Funding": "https://www.softwareheritage.org/donate",
        "Source": "https://forge.softwareheritage.org/source/swh-search",
        "Documentation": "https://docs.softwareheritage.org/devel/swh-search/",
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Operating System :: OS Independent",
        "Development Status :: 3 - Alpha",
    ],
    # --- Versioning ---
    use_scm_version=True,
    # --- Package contents ---
    packages=find_packages(),  # packages's modules
    include_package_data=True,
    zip_safe=False,
    # --- Requirements (read from the requirements*.txt files) ---
    python_requires=">=3.7",
    setup_requires=["setuptools-scm", "tree-sitter==0.19.0"],
    install_requires=parse_requirements() + parse_requirements("swh"),
    tests_require=parse_requirements("test"),
    extras_require={"testing": parse_requirements("test")},
    # --- Entry points ---
    entry_points="""
[swh.cli.subcommands]
search=swh.search.cli
""",
    # --- Custom commands defined in this script ---
    cmdclass={
        "build": custom_build,
        "sdist": custom_sdist,
        "develop": custom_develop,
        "ts_install": TSInstallCommand,
        "ts_build_so": TSBuildSoCommand,
        "ts_build": TSBuildCommand,
    },
)
diff --git a/swh/search/translator.py b/swh/search/translator.py
index 03c6344..4229bde 100644
--- a/swh/search/translator.py
+++ b/swh/search/translator.py
@@ -1,301 +1,307 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
import os
+import tempfile
from pkg_resources import resource_filename
from tree_sitter import Language, Parser
from swh.search.utils import get_expansion, unescape
+logger = logging.getLogger(__name__)
+
class Translator:
    """Translate a search query string into a nested dict of query clauses.

    The query is parsed with the ``swh_search_ql`` tree-sitter grammar and the
    resulting syntax tree is converted node by node (see ``_traverse`` and
    ``_parse_filter``).
    """

    # Maps a comparison operator of the query language to the key used in
    # the generated "range" clauses.
    RANGE_OPERATOR_MAP = {
        ">": "gt",
        "<": "lt",
        ">=": "gte",
        "<=": "lte",
    }

    def __init__(self):
        """Load the compiled grammar, building it on the fly if it is missing."""
        ql_path = resource_filename("swh.search", "static/swh_ql.so")
        if not os.path.exists(ql_path):
            # Use the module logger (not the root logger) so that the message
            # is attributed to this module.
            logger.info("%s does not exist, building in temporary directory", ql_path)
            # Kept as an attribute so the temporary directory is not cleaned
            # up while this instance is alive.
            self._build_dir = tempfile.TemporaryDirectory(prefix="swh.search-build")
            source_path = resource_filename("swh.search", "query_language")
            ql_path = os.path.join(self._build_dir.name, "swh_ql.so")
            Language.build_library(ql_path, [source_path])

        search_ql = Language(ql_path, "swh_search_ql")

        self.parser = Parser()
        self.parser.set_language(search_ql)
        self.query = ""

    def parse_query(self, query):
        """Parse ``query`` and return its translation.

        Args:
            query: the query string.

        Returns:
            The result of ``_traverse`` on the root node of the syntax tree.

        Raises:
            Exception: if the syntax tree contains an error
                (``"Invalid query"``), or if translation fails.
        """
        self.query = query
        tree = self.parser.parse(query.encode("utf8"))
        self.query_node = tree.root_node

        if self.query_node.has_error:
            raise Exception("Invalid query")

        return self._traverse(self.query_node)

    def _traverse(self, node):
        """Recursively translate a syntax-tree node.

        Returns:
            For a ``query`` node, a dict mapping each child's type to its
            translation; for the other supported nodes, the translation of
            the underlying filter(s).

        Raises:
            Exception: if the node type or its number of children is not
                supported.
        """
        if len(node.children) == 3 and node.children[1].type == "filters":
            # filters => ( filters )
            return self._traverse(node.children[1])  # Go past the () brackets

        if node.type == "query":
            result = {}
            for child in node.children:
                # query => filters sort_by limit
                result[child.type] = self._traverse(child)

            return result

        if node.type == "filters":
            if len(node.children) == 1:
                # query => filters
                # filters => filters
                # filters => filter
                # Current node is just a wrapper, so go one level deep
                return self._traverse(node.children[0])

            if len(node.children) == 3:
                # filters => filters conj_op filters
                filters1 = self._traverse(node.children[0])
                conj_op = self._get_value(node.children[1])
                filters2 = self._traverse(node.children[2])

                if conj_op == "and":
                    # "must" is equivalent to "AND"
                    return {"bool": {"must": [filters1, filters2]}}
                if conj_op == "or":
                    # "should" is equivalent to "OR"
                    return {"bool": {"should": [filters1, filters2]}}

        if node.type == "filter":
            filter_category = node.children[0]
            return self._parse_filter(filter_category)

        if node.type == "sortBy":
            return self._parse_filter(node)

        if node.type == "limit":
            return self._parse_filter(node)

        # The error must be raised: returning the exception object would hand
        # it back to the caller as if it were a valid translation.
        raise Exception(
            f"Unknown node type ({node.type}) "
            f"or unexpected number of children ({node.children})"
        )

    def _get_value(self, node):
        """Return the Python value represented by ``node``.

        Returns:
            A list of values for an array node (first child ``[``, last child
            ``]``; only named children are kept); otherwise the node's text
            taken from ``self.query``: unquoted and unescaped when it is
            wrapped in matching single or double quotes, converted to ``int``
            for ``number``/``numberVal`` nodes, and unescaped otherwise.
        """
        if (
            len(node.children) > 0
            and node.children[0].type == "["
            and node.children[-1].type == "]"
        ):
            # array
            return [self._get_value(child) for child in node.children if child.is_named]

        # NOTE(review): only the second component of the start/end points is
        # used to slice the query; presumably this assumes a single-line
        # query -- confirm.
        start = node.start_point[1]
        end = node.end_point[1]

        value = self.query[start:end]

        # Strip the quotes only when BOTH ends carry the same quote character.
        if len(value) > 1 and (
            (value[0] == "'" and value[-1] == "'")
            or (value[0] == '"' and value[-1] == '"')
        ):
            return unescape(value[1:-1])

        if node.type in ["number", "numberVal"]:
            return int(value)

        return unescape(value)

    def _parse_filter(self, filter):
        """Translate a single filter node (or a ``sortBy``/``limit`` node).

        The node must have exactly three children, read as
        ``(name, operator, value)``; a ``boundedListFilter`` node is first
        replaced by its first child.

        Returns:
            The query clause (dict) for the filter, or the raw value for
            ``sortBy`` and ``limit`` nodes.

        Raises:
            Exception: if the category/name combination is not supported.
        """
        if filter.type == "boundedListFilter":
            filter = filter.children[0]

        children = filter.children
        assert len(children) == 3

        category = filter.type
        name, op, value = [self._get_value(child) for child in children]

        if category == "patternFilter":
            if name == "origin":
                return {
                    "multi_match": {
                        "query": value,
                        "type": "bool_prefix",
                        "operator": "and",
                        "fields": [
                            "url.as_you_type",
                            "url.as_you_type._2gram",
                            "url.as_you_type._3gram",
                        ],
                    }
                }
            elif name == "metadata":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": value,
                                # Makes it so that the "foo bar" query returns
                                # documents which contain "foo" in a field and "bar"
                                # in a different field
                                "type": "cross_fields",
                                # All keywords must be found in a document for it to
                                # be considered a match.
                                # TODO: allow missing keywords?
                                "operator": "and",
                                # Searches on all fields of the intrinsic_metadata dict,
                                # recursively.
                                "fields": ["intrinsic_metadata.*"],
                                # date{Created,Modified,Published} are of type date
                                "lenient": True,
                            }
                        },
                    }
                }

        if category == "booleanFilter":
            if name == "visited":
                return {"term": {"has_visits": value == "true"}}

        if category == "numericFilter":
            if name == "visits":
                if op in ["=", "!="]:
                    # (In)equality is expressed as a range reduced to one value.
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {"range": {"nb_visits": {"gte": value, "lte": value}}}
                            ]
                        }
                    }
                else:
                    return {
                        "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}}
                    }

        if category == "visitTypeFilter":
            if name == "visit_type":
                return {"terms": {"visit_types": value}}

        if category == "unboundedListFilter":
            value_array = value

            if name == "keyword":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": " ".join(value_array),
                                "fields": [
                                    get_expansion("keywords", ".") + "^2",
                                    get_expansion("descriptions", "."),
                                    # "^2" boosts an origin's score by 2x
                                    # if the queried keywords are
                                    # found in its intrinsic_metadata.keywords
                                ],
                            }
                        },
                    }
                }
            elif name in ["language", "license"]:
                name_mapping = {
                    "language": "programming_languages",
                    "license": "licenses",
                }
                name = name_mapping[name]

                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                "should": [
                                    {"match": {get_expansion(name, "."): val}}
                                    for val in value_array
                                ],
                            }
                        },
                    }
                }

        if category == "dateFilter":
            if name in ["created", "modified", "published"]:
                # These dates are looked up inside intrinsic_metadata.
                if op in ["=", "!="]:
                    return {
                        "nested": {
                            "path": "intrinsic_metadata",
                            "query": {
                                "bool": {
                                    ("must" if op == "=" else "must_not"): [
                                        {
                                            "range": {
                                                get_expansion(f"date_{name}", "."): {
                                                    "gte": value,
                                                    "lte": value,
                                                }
                                            }
                                        }
                                    ],
                                }
                            },
                        }
                    }

                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            get_expansion(f"date_{name}", "."): {
                                                self.RANGE_OPERATOR_MAP[op]: value,
                                            }
                                        }
                                    }
                                ],
                            }
                        },
                    }
                }
            else:
                # Any other date name maps to a top-level "<name>_date" field.
                if op in ["=", "!="]:
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {
                                    "range": {
                                        f"{name}_date": {"gte": value, "lte": value}
                                    }
                                }
                            ],
                        }
                    }

                # NOTE(review): the trailing "Z" is only rewritten on this
                # branch, not on the (in)equality branch above -- confirm
                # whether that asymmetry is intended.
                return {
                    "range": {
                        f"{name}_date": {
                            self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"),
                        }
                    }
                }

        if category == "sortBy":
            return value

        if category == "limit":
            return value

        raise Exception(f"Unknown filter {category}.{name}")
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 12:40 PM (2 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3236474
Attached To
rDSEA Archive search
Event Timeline
Log In to Comment