Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9336833
D6025.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
19 KB
Subscribers
None
D6025.diff
View Options
diff --git a/query_language/grammar.js b/query_language/grammar.js
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -156,7 +156,7 @@
token.immediate(/[^\\"\n]+/),
$.escape_sequence
)),
- singleWord: $ => /[^\s"'\[\]\(\)]+/,
+ singleWord: $ => /[^\s"'\[\]\(\),]+/,
escape_sequence: $ => token.immediate(seq(
'\\',
/(\"|\'|\\|\/|b|n|r|t|u)/
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
new file mode 100644
--- /dev/null
+++ b/swh/search/tests/test_translator.py
@@ -0,0 +1,309 @@
+import pytest
+
+from swh.search.translator import Translator
+from swh.search.utils import get_expansion
+
+
+ def _test_results(query, expected):
+     """Translate ``query`` and assert that the output equals ``expected``."""
+     translated = Translator().parse_query(query)
+     assert translated == expected
+
+
+ def test_empty_query():
+     """Translating an empty query string is expected to raise."""
+     with pytest.raises(Exception):
+         _test_results("", {})
+
+
+ def test_conjunction_operators():
+     """Without brackets, the ``and`` operands end up grouped under the ``or``."""
+     was_visited = {"term": {"has_visits": True}}
+     visits_between = {
+         "bool": {
+             "must": [
+                 {"range": {"nb_visits": {"gt": 2}}},
+                 {"range": {"nb_visits": {"lt": 5}}},
+             ]
+         }
+     }
+     _test_results(
+         "visited = true or visits > 2 and visits < 5",
+         {"filters": {"bool": {"should": [was_visited, visits_between]}}},
+     )
+
+
+ def test_conjunction_op_precedence_override():
+     """Brackets group the ``or`` operands before the outer ``and`` applies."""
+     bracketed_or = {
+         "bool": {
+             "should": [
+                 {"term": {"has_visits": False}},
+                 {"range": {"nb_visits": {"gt": 2}}},
+             ]
+         }
+     }
+     upper_bound = {"range": {"nb_visits": {"lt": 5}}}
+     _test_results(
+         "(visited = false or visits > 2) and visits < 5",
+         {"filters": {"bool": {"must": [bracketed_or, upper_bound]}}},
+     )
+
+
+ def test_limit_and_sortby():
+     """``sort_by`` and ``limit`` are reported next to the translated filters."""
+     _test_results(
+         "visited = true sort_by = [-visits,last_visit] limit = 15",
+         {
+             "filters": {"term": {"has_visits": True}},
+             "sortBy": ["-visits", "last_visit"],
+             "limit": 15,
+         },
+     )
+
+
+ def test_deeply_nested_filters():
+     """Redundant nested brackets leave no trace in the translated filters."""
+     both_conditions = [
+         {"term": {"has_visits": True}},
+         {"range": {"nb_visits": {"gt": 0}}},
+     ]
+     _test_results(
+         "(((visited = true and visits > 0)))",
+         {"filters": {"bool": {"must": both_conditions}}},
+     )
+
+
+ def test_origin_and_metadata_filters():
+     """``origin`` and ``metadata`` patterns each become a ``multi_match``."""
+     origin_match = {
+         "multi_match": {
+             "query": "django",
+             "type": "bool_prefix",
+             "operator": "and",
+             "fields": [
+                 "url.as_you_type",
+                 "url.as_you_type._2gram",
+                 "url.as_you_type._3gram",
+             ],
+         }
+     }
+     metadata_match = {
+         "nested": {
+             "path": "intrinsic_metadata",
+             "query": {
+                 "multi_match": {
+                     "query": "framework and web",
+                     "type": "cross_fields",
+                     "operator": "and",
+                     "fields": ["intrinsic_metadata.*"],
+                     "lenient": True,
+                 }
+             },
+         }
+     }
+     _test_results(
+         'origin = django or metadata = "framework and web"',
+         {"filters": {"bool": {"should": [origin_match, metadata_match]}}},
+     )
+
+
+ def test_visits_not_equal_to_filter():
+     """``!=`` on visits is a ``must_not`` around a closed ``[5, 5]`` range."""
+     exactly_five = {"range": {"nb_visits": {"gte": 5, "lte": 5}}}
+     _test_results(
+         "visits != 5",
+         {"filters": {"bool": {"must_not": [exactly_five]}}},
+     )
+
+
+ def test_visit_type_filter():
+     """Bare and double-quoted list items both end up as plain strings."""
+     _test_results(
+         'visit_type = [git,"pypi"]',
+         {"filters": {"terms": {"visit_types": ["git", "pypi"]}}},
+     )
+
+
+ def test_keyword_filter():
+     """Keyword list items are joined with spaces into one ``multi_match``."""
+     searched_fields = [
+         get_expansion("keywords", ".") + "^2",
+         get_expansion("descriptions", "."),
+     ]
+     keyword_match = {
+         "multi_match": {
+             # The escaped quote is expected to come through untouched.
+             "query": r"""word1 word2 \" ' word3""",
+             "fields": searched_fields,
+         }
+     }
+     _test_results(
+         r"""keyword in [word1, "word2 \" ' word3"]""",
+         {
+             "filters": {
+                 "nested": {"path": "intrinsic_metadata", "query": keyword_match}
+             }
+         },
+     )
+
+
+ def test_language_filter():
+     """Each listed language becomes one ``match`` alternative."""
+     alternatives = [
+         {"match": {get_expansion("programming_languages", "."): language}}
+         for language in ("python", "go lang", "cpp")
+     ]
+     _test_results(
+         'language in [python, "go lang", cpp]',
+         {
+             "filters": {
+                 "nested": {
+                     "path": "intrinsic_metadata",
+                     "query": {"bool": {"should": alternatives}},
+                 }
+             }
+         },
+     )
+
+
+ def test_license_filter():
+     """Each listed license becomes one ``match`` alternative."""
+     alternatives = [
+         {"match": {get_expansion("licenses", "."): license_name}}
+         for license_name in ("GPL 3", "Apache", "MIT")
+     ]
+     _test_results(
+         'license in ["GPL 3", Apache, MIT]',
+         {
+             "filters": {
+                 "nested": {
+                     "path": "intrinsic_metadata",
+                     "query": {"bool": {"should": alternatives}},
+                 }
+             }
+         },
+     )
+
+
+ def test_date_created_not_equal_to_filter():
+     """``created != date`` is a nested ``must_not`` around a one-day range."""
+     same_day = {
+         "range": {
+             get_expansion("date_created", "."): {
+                 "gte": "2020-01-01",
+                 "lte": "2020-01-01",
+             }
+         }
+     }
+     _test_results(
+         "created != 2020-01-01",
+         {
+             "filters": {
+                 "nested": {
+                     "path": "intrinsic_metadata",
+                     "query": {"bool": {"must_not": [same_day]}},
+                 }
+             }
+         },
+     )
+
+
+ def test_date_created_greater_than_filter():
+     """``created >= date`` is a nested ``must`` around a ``gte`` range."""
+     on_or_after = {
+         "range": {get_expansion("date_created", "."): {"gte": "2020-01-01"}}
+     }
+     _test_results(
+         "created >= 2020-01-01",
+         {
+             "filters": {
+                 "nested": {
+                     "path": "intrinsic_metadata",
+                     "query": {"bool": {"must": [on_or_after]}},
+                 }
+             }
+         },
+     )
+
+
+ def test_last_eventful_visit_not_equal_to_filter():
+     """``last_visit != date`` is a top-level ``must_not`` on ``last_visit_date``."""
+     same_day = {
+         "range": {
+             "last_visit_date": {
+                 "gte": "2020-01-01",
+                 "lte": "2020-01-01",
+             }
+         }
+     }
+     _test_results(
+         "last_visit != 2020-01-01",
+         {"filters": {"bool": {"must_not": [same_day]}}},
+     )
+
+
+ def test_last_eventful_visit_less_than_to_filter():
+     """``last_visit < date`` is a bare ``lt`` range on ``last_visit_date``."""
+     _test_results(
+         "last_visit < 2020-01-01",
+         {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}},
+     )
diff --git a/swh/search/translator.py b/swh/search/translator.py
new file mode 100644
--- /dev/null
+++ b/swh/search/translator.py
@@ -0,0 +1,288 @@
+import os
+
+from pkg_resources import resource_filename
+from tree_sitter import Language, Parser
+
+from swh.search.utils import get_expansion
+
+
+ class Translator:
+     """Translate search query language strings into Elasticsearch DSL.
+
+     The query is parsed with the tree-sitter grammar compiled into
+     ``swh_ql.so``. The syntax tree is then walked to build a plain ``dict``
+     keyed by the type of each top-level node of the query (``filters``,
+     ``sortBy``, ``limit``).
+     """
+
+     # Query language comparison operators -> Elasticsearch range keywords.
+     RANGE_OPERATOR_MAP = {
+         ">": "gt",
+         "<": "lt",
+         ">=": "gte",
+         "<=": "lte",
+     }
+
+     def __init__(self):
+         """Locate the compiled grammar and set up the tree-sitter parser.
+
+         Raises:
+             FileNotFoundError: if ``swh_ql.so`` is in none of the expected
+                 locations.
+         """
+         ql_rel_paths = [
+             "swh_ql.so",  # installed
+             "../../query_language/swh_ql.so",  # development
+         ]
+         for ql_rel_path in ql_rel_paths:
+             ql_path = resource_filename("swh.search", ql_rel_path)
+             if os.path.exists(ql_path):
+                 break
+         else:
+             # Raised explicitly: an ``assert`` would vanish under ``python -O``
+             # and a non-existent path would then be handed to ``Language``.
+             raise FileNotFoundError(
+                 "swh_ql.so was not found in any of the expected paths"
+             )
+
+         search_ql = Language(ql_path, "swh_search_ql")
+
+         self.parser = Parser()
+         self.parser.set_language(search_ql)
+         self.query = ""
+
+     def parse_query(self, query):
+         """Parse ``query`` and return its translation as a ``dict``.
+
+         Args:
+             query: the search query, as a ``str``.
+
+         Returns:
+             A dict with one entry per top-level node of the query.
+
+         Raises:
+             Exception: if the parser flags a syntax error in ``query``, or if
+                 the tree contains a node this translator does not know.
+         """
+         self.query = query
+         tree = self.parser.parse(query.encode("utf8"))
+         self.query_node = tree.root_node
+
+         if self.query_node.has_error:
+             raise Exception("Invalid query")
+
+         return self._traverse(self.query_node)
+
+     def _traverse(self, node):
+         """Recursively translate ``node`` and everything below it.
+
+         Raises:
+             Exception: if the node type, its number of children or its
+                 conjunction operator is not one handled here.
+         """
+         children = node.children
+
+         if len(children) == 3 and children[1].type == "filters":
+             # filters => ( filters )
+             return self._traverse(children[1])  # Go past the () brackets
+
+         if node.type == "query":
+             # query => filters sort_by limit
+             return {child.type: self._traverse(child) for child in children}
+
+         if node.type == "filters":
+             if len(children) == 1:
+                 # query => filters
+                 # filters => filters
+                 # filters => filter
+                 # Current node is just a wrapper, so go one level deep
+                 return self._traverse(children[0])
+
+             if len(children) == 3:
+                 # filters => filters conj_op filters
+                 filters1 = self._traverse(children[0])
+                 conj_op = self._get_value(children[1])
+                 filters2 = self._traverse(children[2])
+
+                 if conj_op == "and":
+                     return {"bool": {"must": [filters1, filters2]}}
+                 if conj_op == "or":
+                     return {"bool": {"should": [filters1, filters2]}}
+                 # Any other operator falls through to the error below.
+
+         if node.type == "filter":
+             return self._parse_filter(children[0])
+
+         if node.type in ("sortBy", "limit"):
+             return self._parse_filter(node)
+
+         # This must be raised: returning the exception object would embed it
+         # in the translated query as if it were a valid result.
+         raise Exception(
+             f"Unknown node type ({node.type}) "
+             f"or unexpected number of children ({node.children})"
+         )
+
+     def _get_value(self, node):
+         """Return the Python value spelled out by ``node`` in the query.
+
+         Bracketed arrays become lists of their named children's values,
+         quoted strings lose their surrounding quotes (escape sequences are
+         left as written), ``number``/``numberVal`` nodes become ``int`` and
+         everything else is returned as the raw source text.
+         """
+         children = node.children
+         if children and children[0].type == "[" and children[-1].type == "]":
+             # array
+             return [self._get_value(child) for child in children if child.is_named]
+
+         # The parser was fed UTF-8 bytes, so node positions are byte offsets.
+         # Slicing the encoded query stays correct for non-ASCII and multi-line
+         # queries, which column offsets applied to the ``str`` do not.
+         encoded_query = self.query.encode("utf8")
+         value = encoded_query[node.start_byte : node.end_byte].decode("utf8")
+
+         # Strip the quotes only when both ends carry the same quote character.
+         if len(value) > 1 and value[0] == value[-1] and value[0] in ("'", '"'):
+             return value[1:-1]
+
+         if node.type in ("number", "numberVal"):
+             return int(value)
+
+         return value
+
+     def _range_clause(self, field, op, value):
+         """Build the ``range`` clause comparing ``field`` to ``value``.
+
+         ``=`` and ``!=`` produce a closed ``[value, value]`` range (the caller
+         negates it for ``!=``); other operators are looked up in
+         ``RANGE_OPERATOR_MAP``.
+         """
+         if op in ("=", "!="):
+             bounds = {"gte": value, "lte": value}
+         else:
+             bounds = {self.RANGE_OPERATOR_MAP[op]: value}
+         return {"range": {field: bounds}}
+
+     @staticmethod
+     def _bool_occurrence(op):
+         """Return the ``bool`` occurrence key that matches ``op``."""
+         return "must_not" if op == "!=" else "must"
+
+     def _parse_filter(self, filter):
+         """Translate one ``name op value`` node into its query fragment.
+
+         Also used for ``sortBy`` and ``limit`` nodes, for which the parsed
+         value is returned as is.
+
+         Raises:
+             Exception: if the filter category/name pair is not supported.
+         """
+         if filter.type == "boundedListFilter":
+             filter = filter.children[0]
+
+         children = filter.children
+         # Invariant of the translator: every filter node is "name op value".
+         assert len(children) == 3
+
+         category = filter.type
+         name, op, value = [self._get_value(child) for child in children]
+
+         if category == "patternFilter":
+             if name == "origin":
+                 return {
+                     "multi_match": {
+                         "query": value,
+                         "type": "bool_prefix",
+                         "operator": "and",
+                         "fields": [
+                             "url.as_you_type",
+                             "url.as_you_type._2gram",
+                             "url.as_you_type._3gram",
+                         ],
+                     }
+                 }
+             elif name == "metadata":
+                 return {
+                     "nested": {
+                         "path": "intrinsic_metadata",
+                         "query": {
+                             "multi_match": {
+                                 "query": value,
+                                 "type": "cross_fields",
+                                 "operator": "and",
+                                 "fields": ["intrinsic_metadata.*"],
+                                 "lenient": True,
+                             }
+                         },
+                     }
+                 }
+
+         if category == "booleanFilter":
+             if name == "visited":
+                 return {"term": {"has_visits": value == "true"}}
+
+         if category == "numericFilter":
+             if name == "visits":
+                 visits_range = self._range_clause("nb_visits", op, value)
+                 if op in ("=", "!="):
+                     return {"bool": {self._bool_occurrence(op): [visits_range]}}
+                 return visits_range
+
+         if category == "visitTypeFilter":
+             if name == "visit_type":
+                 return {"terms": {"visit_types": value}}
+
+         if category == "unboundedListFilter":
+             value_array = value
+
+             if name == "keyword":
+                 return {
+                     "nested": {
+                         "path": "intrinsic_metadata",
+                         "query": {
+                             "multi_match": {
+                                 "query": " ".join(value_array),
+                                 "fields": [
+                                     get_expansion("keywords", ".") + "^2",
+                                     get_expansion("descriptions", "."),
+                                 ],
+                             }
+                         },
+                     }
+                 }
+             elif name in ("language", "license"):
+                 name_mapping = {
+                     "language": "programming_languages",
+                     "license": "licenses",
+                 }
+                 name = name_mapping[name]
+
+                 return {
+                     "nested": {
+                         "path": "intrinsic_metadata",
+                         "query": {
+                             "bool": {
+                                 "should": [
+                                     {"match": {get_expansion(name, "."): val}}
+                                     for val in value_array
+                                 ],
+                             }
+                         },
+                     }
+                 }
+
+         if category == "dateFilter":
+             occurrence = self._bool_occurrence(op)
+
+             if name in ("created", "modified", "published"):
+                 # Dates stored in the intrinsic metadata need a nested query,
+                 # and are always wrapped in a bool clause.
+                 field = get_expansion(f"date_{name}", ".")
+                 return {
+                     "nested": {
+                         "path": "intrinsic_metadata",
+                         "query": {
+                             "bool": {
+                                 occurrence: [self._range_clause(field, op, value)],
+                             }
+                         },
+                     }
+                 }
+
+             field = f"{name}_date"
+             if op in ("=", "!="):
+                 return {
+                     "bool": {occurrence: [self._range_clause(field, op, value)]}
+                 }
+             # NOTE(review): only this branch rewrites a trailing "Z" into an
+             # explicit UTC offset; the other date branches pass the value
+             # through untouched. Kept as is -- confirm whether that is wanted.
+             return self._range_clause(field, op, value.replace("Z", "+00:00"))
+
+         if category == "sortBy":
+             return value
+
+         if category == "limit":
+             return value
+
+         raise Exception(f"Unknown filter {category}.{name}")
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Jul 3 2025, 7:45 AM (10 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228412
Attached To
D6025: translator.py: Translate search query language to ES DSL
Event Timeline
Log In to Comment