Page MenuHomeSoftware Heritage

D6025.diff
No OneTemporary

D6025.diff

diff --git a/query_language/grammar.js b/query_language/grammar.js
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -156,7 +156,7 @@
token.immediate(/[^\\"\n]+/),
$.escape_sequence
)),
- singleWord: $ => /[^\s"'\[\]\(\)]+/,
+ singleWord: $ => /[^\s"'\[\]\(\),]+/,
escape_sequence: $ => token.immediate(seq(
'\\',
/(\"|\'|\\|\/|b|n|r|t|u)/
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
new file mode 100644
--- /dev/null
+++ b/swh/search/tests/test_translator.py
@@ -0,0 +1,309 @@
+import pytest
+
+from swh.search.translator import Translator
+from swh.search.utils import get_expansion
+
+
def _test_results(query, expected):
    """Translate ``query`` with a fresh ``Translator`` and compare to ``expected``."""
    translator = Translator()
    assert translator.parse_query(query) == expected
+
+
def test_empty_query():
    """An empty query string must be rejected by the translator itself.

    ``parse_query`` is called directly inside ``pytest.raises`` rather than
    through ``_test_results``: that helper ends with an ``assert``, and an
    ``AssertionError`` caused by a mere result mismatch would otherwise be
    accepted as "the expected exception" and hide a translator that does
    not raise at all.
    """
    query = ""
    with pytest.raises(Exception):
        Translator().parse_query(query)
+
+
def test_conjunction_operators():
    """Without parentheses, ``and`` groups before ``or``."""
    was_visited = {"term": {"has_visits": True}}
    visits_between = {
        "bool": {
            "must": [
                {"range": {"nb_visits": {"gt": 2}}},
                {"range": {"nb_visits": {"lt": 5}}},
            ]
        }
    }

    _test_results(
        "visited = true or visits > 2 and visits < 5",
        {"filters": {"bool": {"should": [was_visited, visits_between]}}},
    )
+
+
def test_conjunction_op_precedence_override():
    """Parentheses force the ``or`` group to be evaluated before ``and``."""
    parenthesized = {
        "bool": {
            "should": [
                {"term": {"has_visits": False}},
                {"range": {"nb_visits": {"gt": 2}}},
            ]
        }
    }
    fewer_than_five = {"range": {"nb_visits": {"lt": 5}}}

    _test_results(
        "(visited = false or visits > 2) and visits < 5",
        {"filters": {"bool": {"must": [parenthesized, fewer_than_five]}}},
    )
+
+
def test_limit_and_sortby():
    """``sort_by`` and ``limit`` are emitted next to the translated filters."""
    expected = dict(
        filters={"term": {"has_visits": True}},
        sortBy=["-visits", "last_visit"],
        limit=15,
    )
    _test_results(
        "visited = true sort_by = [-visits,last_visit] limit = 15", expected
    )
+
+
def test_deeply_nested_filters():
    """Redundant layers of parentheses do not change the translation."""
    both_conditions = [
        {"term": {"has_visits": True}},
        {"range": {"nb_visits": {"gt": 0}}},
    ]

    _test_results(
        "(((visited = true and visits > 0)))",
        {"filters": {"bool": {"must": both_conditions}}},
    )
+
+
def test_origin_and_metadata_filters():
    """Pattern filters on ``origin`` and ``metadata`` become multi_match clauses."""
    origin_clause = {
        "multi_match": {
            "query": "django",
            "type": "bool_prefix",
            "operator": "and",
            "fields": [
                "url.as_you_type",
                "url.as_you_type._2gram",
                "url.as_you_type._3gram",
            ],
        }
    }
    metadata_clause = {
        "nested": {
            "path": "intrinsic_metadata",
            "query": {
                "multi_match": {
                    "query": "framework and web",
                    "type": "cross_fields",
                    "operator": "and",
                    "fields": ["intrinsic_metadata.*"],
                    "lenient": True,
                }
            },
        }
    }

    _test_results(
        'origin = django or metadata = "framework and web"',
        {"filters": {"bool": {"should": [origin_clause, metadata_clause]}}},
    )
+
+
def test_visits_not_equal_to_filter():
    """``visits != n`` is a negated closed range ``[n, n]`` on ``nb_visits``."""
    exactly_five = {"range": {"nb_visits": {"gte": 5, "lte": 5}}}

    _test_results(
        "visits != 5",
        {"filters": {"bool": {"must_not": [exactly_five]}}},
    )
+
+
def test_visit_type_filter():
    """Bare and double-quoted list items both end up as plain strings."""
    wanted_types = ["git", "pypi"]

    _test_results(
        'visit_type = [git,"pypi"]',
        {"filters": {"terms": {"visit_types": wanted_types}}},
    )
+
+
def test_keyword_filter():
    """Keywords are joined with spaces into a single multi_match query."""
    searched_fields = [
        get_expansion("keywords", ".") + "^2",
        get_expansion("descriptions", "."),
    ]
    keyword_match = {
        "multi_match": {
            "query": r"""word1 word2 \" ' word3""",
            "fields": searched_fields,
        }
    }

    _test_results(
        r"""keyword in [word1, "word2 \" ' word3"]""",
        {
            "filters": {
                "nested": {"path": "intrinsic_metadata", "query": keyword_match}
            }
        },
    )
+
+
def test_language_filter():
    """Each listed language yields one ``match`` clause under ``should``."""
    languages = ["python", "go lang", "cpp"]
    any_language = [
        {"match": {get_expansion("programming_languages", "."): language}}
        for language in languages
    ]

    _test_results(
        'language in [python, "go lang", cpp]',
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"should": any_language}},
                }
            }
        },
    )
+
+
def test_license_filter():
    """Each listed license yields one ``match`` clause under ``should``."""
    licenses = ["GPL 3", "Apache", "MIT"]
    any_license = [
        {"match": {get_expansion("licenses", "."): license_name}}
        for license_name in licenses
    ]

    _test_results(
        'license in ["GPL 3", Apache, MIT]',
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"should": any_license}},
                }
            }
        },
    )
+
+
def test_date_created_not_equal_to_filter():
    """``created != d`` is a negated closed range nested in the metadata."""
    same_day = {
        "range": {
            get_expansion("date_created", "."): {
                "gte": "2020-01-01",
                "lte": "2020-01-01",
            }
        }
    }

    _test_results(
        "created != 2020-01-01",
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"must_not": [same_day]}},
                }
            }
        },
    )
+
+
def test_date_created_greater_than_filter():
    """``created >= d`` is a ``gte`` range nested in the metadata."""
    on_or_after = {
        "range": {get_expansion("date_created", "."): {"gte": "2020-01-01"}}
    }

    _test_results(
        "created >= 2020-01-01",
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"must": [on_or_after]}},
                }
            }
        },
    )
+
+
def test_last_eventful_visit_not_equal_to_filter():
    """``last_visit != d`` is a negated closed range on ``last_visit_date``."""
    same_day = {
        "range": {"last_visit_date": {"gte": "2020-01-01", "lte": "2020-01-01"}}
    }

    _test_results(
        "last_visit != 2020-01-01",
        {"filters": {"bool": {"must_not": [same_day]}}},
    )
+
+
def test_last_eventful_visit_less_than_to_filter():
    """``last_visit < d`` is a bare ``lt`` range on ``last_visit_date``."""
    before = {"range": {"last_visit_date": {"lt": "2020-01-01"}}}

    _test_results("last_visit < 2020-01-01", {"filters": before})
diff --git a/swh/search/translator.py b/swh/search/translator.py
new file mode 100644
--- /dev/null
+++ b/swh/search/translator.py
@@ -0,0 +1,288 @@
+import os
+
+from pkg_resources import resource_filename
+from tree_sitter import Language, Parser
+
+from swh.search.utils import get_expansion
+
+
class Translator:
    """Translate search query-language strings into Elasticsearch-style dicts.

    The query is parsed with the ``swh_search_ql`` tree-sitter grammar and the
    resulting syntax tree is walked to build nested ``bool`` / ``range`` /
    ``multi_match`` / ``nested`` clauses.
    """

    # Query-language comparison operators -> range-clause keys.
    RANGE_OPERATOR_MAP = {
        ">": "gt",
        "<": "lt",
        ">=": "gte",
        "<=": "lte",
    }

    def __init__(self):
        """Load the compiled grammar and prepare a parser for it.

        Raises:
            AssertionError: if ``swh_ql.so`` is found in none of the
                candidate locations.
        """
        ql_rel_paths = [
            "swh_ql.so",  # installed
            "../../query_language/swh_ql.so",  # development
        ]
        for ql_rel_path in ql_rel_paths:
            ql_path = resource_filename("swh.search", ql_rel_path)
            if os.path.exists(ql_path):
                break
        else:
            # Raised explicitly (instead of ``assert False``) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(
                "swh_ql.so was not found in any of the expected paths"
            )

        search_ql = Language(ql_path, "swh_search_ql")

        self.parser = Parser()
        self.parser.set_language(search_ql)
        self.query = ""
        # UTF-8 form of ``self.query``; syntax-tree byte offsets index into it.
        self._query_bytes = b""

    def parse_query(self, query):
        """Parse ``query`` and return its translation as a dict.

        The result has one key per top-level child of the parsed query node
        (the tests show ``filters``, ``sortBy`` and ``limit``).

        Raises:
            Exception: "Invalid query" if the syntax tree contains an error.
        """
        self.query = query
        self._query_bytes = query.encode("utf8")
        tree = self.parser.parse(self._query_bytes)
        self.query_node = tree.root_node

        if self.query_node.has_error:
            raise Exception("Invalid query")

        return self._traverse(self.query_node)

    def _traverse(self, node):
        """Recursively translate a syntax-tree ``node``.

        Raises:
            Exception: if the node type / child layout is not one of the
                shapes handled below.
        """
        if len(node.children) == 3 and node.children[1].type == "filters":
            # filters => ( filters )
            return self._traverse(node.children[1])  # Go past the () brackets

        if node.type == "query":
            result = {}
            for child in node.children:
                # query => filters sort_by limit
                result[child.type] = self._traverse(child)

            return result

        if node.type == "filters":
            if len(node.children) == 1:
                # query => filters
                # filters => filters
                # filters => filter
                # Current node is just a wrapper, so go one level deep
                return self._traverse(node.children[0])

            if len(node.children) == 3:
                # filters => filters conj_op filters
                filters1 = self._traverse(node.children[0])
                conj_op = self._get_value(node.children[1])
                filters2 = self._traverse(node.children[2])

                if conj_op == "and":
                    return {"bool": {"must": [filters1, filters2]}}
                if conj_op == "or":
                    return {"bool": {"should": [filters1, filters2]}}

        if node.type == "filter":
            filter_category = node.children[0]
            return self._parse_filter(filter_category)

        if node.type == "sortBy":
            return self._parse_filter(node)

        if node.type == "limit":
            return self._parse_filter(node)

        # Nothing matched: fail loudly. Returning the exception object would
        # silently embed it in the translated query.
        raise Exception(
            f"Unknown node type ({node.type}) "
            f"or unexpected number of children ({node.children})"
        )

    def _get_value(self, node):
        """Return the Python value of ``node``'s source text.

        * a bracketed node -> list of the values of its named children;
        * text wrapped in matching single or double quotes -> the text
          without the surrounding quotes (inner escapes are left untouched);
        * ``number`` / ``numberVal`` nodes -> ``int``;
        * anything else -> the raw text.
        """
        if (
            len(node.children) > 0
            and node.children[0].type == "["
            and node.children[-1].type == "]"
        ):
            # array
            return [
                self._get_value(child) for child in node.children if child.is_named
            ]

        # Slice the encoded query by absolute byte offsets. Columns from
        # ``start_point`` are relative to the node's row, so they are wrong
        # for multi-line queries, and they do not index a ``str`` correctly
        # once the query contains non-ASCII characters.
        value = self._query_bytes[node.start_byte : node.end_byte].decode("utf8")

        if len(value) > 1 and (
            (value[0] == "'" and value[-1] == "'")
            or (value[0] == '"' and value[-1] == '"')
        ):
            return value[1:-1]

        if node.type in ["number", "numberVal"]:
            return int(value)

        return value

    def _parse_filter(self, filter):
        """Translate one ``<name> <op> <value>`` node into a query clause.

        ``filter`` is a syntax-tree node with exactly three children; its
        ``type`` selects the translation. ``sortBy`` and ``limit`` nodes go
        through here as well and simply yield their value.

        Raises:
            Exception: if the category / field name combination is unknown.
        """
        if filter.type == "boundedListFilter":
            # Wrapper node: the concrete filter is its first child.
            filter = filter.children[0]

        children = filter.children
        assert len(children) == 3

        category = filter.type
        name, op, value = [self._get_value(child) for child in children]

        if category == "patternFilter":
            if name == "origin":
                return {
                    "multi_match": {
                        "query": value,
                        "type": "bool_prefix",
                        "operator": "and",
                        "fields": [
                            "url.as_you_type",
                            "url.as_you_type._2gram",
                            "url.as_you_type._3gram",
                        ],
                    }
                }
            elif name == "metadata":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": value,
                                "type": "cross_fields",
                                "operator": "and",
                                "fields": ["intrinsic_metadata.*"],
                                "lenient": True,
                            }
                        },
                    }
                }

        if category == "booleanFilter":
            if name == "visited":
                return {"term": {"has_visits": value == "true"}}

        if category == "numericFilter":
            if name == "visits":
                if op in ["=", "!="]:
                    # (In)equality is expressed as a closed range [value, value],
                    # negated with must_not for "!=".
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {"range": {"nb_visits": {"gte": value, "lte": value}}}
                            ]
                        }
                    }
                else:
                    return {
                        "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}}
                    }

        if category == "visitTypeFilter":
            if name == "visit_type":
                return {"terms": {"visit_types": value}}

        if category == "unboundedListFilter":
            value_array = value

            if name == "keyword":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": " ".join(value_array),
                                "fields": [
                                    get_expansion("keywords", ".") + "^2",
                                    get_expansion("descriptions", "."),
                                ],
                            }
                        },
                    }
                }
            elif name in ["language", "license"]:
                name_mapping = {
                    "language": "programming_languages",
                    "license": "licenses",
                }
                name = name_mapping[name]

                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                "should": [
                                    {"match": {get_expansion(name, "."): val}}
                                    for val in value_array
                                ],
                            }
                        },
                    }
                }

        if category == "dateFilter":

            if name in ["created", "modified", "published"]:
                # These dates are queried under the intrinsic_metadata path.
                if op in ["=", "!="]:
                    return {
                        "nested": {
                            "path": "intrinsic_metadata",
                            "query": {
                                "bool": {
                                    ("must" if op == "=" else "must_not"): [
                                        {
                                            "range": {
                                                get_expansion(f"date_{name}", "."): {
                                                    "gte": value,
                                                    "lte": value,
                                                }
                                            }
                                        }
                                    ],
                                }
                            },
                        }
                    }

                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            get_expansion(f"date_{name}", "."): {
                                                self.RANGE_OPERATOR_MAP[op]: value,
                                            }
                                        }
                                    }
                                ],
                            }
                        },
                    }
                }
            else:
                # Any other date field is queried as "<name>_date".
                if op in ["=", "!="]:
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {
                                    "range": {
                                        f"{name}_date": {"gte": value, "lte": value}
                                    }
                                }
                            ],
                        }
                    }
                # NOTE(review): the "Z" -> "+00:00" rewrite is applied only on
                # this branch, not on the equality branch above -- confirm
                # whether that asymmetry is intended.
                return {
                    "range": {
                        f"{name}_date": {
                            self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"),
                        }
                    }
                }

        if category == "sortBy":
            return value

        if category == "limit":
            return value

        raise Exception(f"Unknown filter {category}.{name}")

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 7:45 AM (10 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228412

Event Timeline