Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345563
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
25 KB
Subscribers
None
View Options
diff --git a/query_language/grammar.js b/query_language/grammar.js
index aa94745..1d397c6 100644
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -1,200 +1,200 @@
// Copyright (C) 2019-2021 The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information
const PRECEDENCE = {
or: 2,
and: 3,
bracket: 4,
}
module.exports = grammar({
name: 'swh_search_ql',
rules: {
query: $ => seq($.filters, optional($.sortBy) ,optional($.limit)),
filters: $ => choice(
prec.left(PRECEDENCE.and,
seq(
field('left', $.filters),
field('operator', $.and),
field('right', $.filters),
)
),
prec.left(PRECEDENCE.or,
seq(
field('left', $.filters),
field('operator', $.or),
field('right', $.filters),
)
),
prec.left(PRECEDENCE.bracket,
seq("(", $.filters, ")"),
),
$.filter
),
sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
sortByField: $ => token('sort_by'),
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
sortByOptions: $ => seq(optional(token.immediate('-')) ,choice(
'visits',
'last_visit',
'last_eventful_visit',
'last_revision',
'last_release',
'created',
'modified',
'published'
)),
limit: $ => seq('limit', $.equalOp, $.number),
filter: $ => choice(
$.patternFilter,
$.booleanFilter,
$.numericFilter,
$.boundedListFilter,
$.unboundedListFilter,
$.dateFilter
),
patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal),
patternField: $ => token(choice('origin', 'metadata')),
patternOp: $ => $.equalOp,
patternVal: $ => $.string,
booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal),
booleanField: $ => token(choice('visited')),
booleanOp: $ => $.equalOp,
booleanVal: $ => choice($.booleanTrue, $.booleanFalse),
numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal),
numericField: $ => token(choice('visits')),
numericOp: $ => $.rangeOp,
numberVal: $ => $.number,
boundedListFilter: $ => choice($.visitTypeFilter),
visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal),
visitTypeField: $ => token(choice('visit_type')),
visitTypeOp: $ => $.equalOp,
visitTypeVal: $ => createArray(optionalWrapWith($.visitTypeOptions, ["'", '"'])),
visitTypeOptions: $ => choice(
"any",
"cran",
"deb",
"deposit",
"ftp",
"hg",
"git",
"nixguix",
"npm",
"pypi",
"svn",
"tar"
), // TODO: fetch this list dynamically from other swh services?
sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
sortByField: $ => token(choice('sort_by')),
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
sortByOptions: $ => seq(
optional(token.immediate('-')),
choice(
'visits',
'last_visit',
'last_eventful_visit',
'last_revision',
'last_release',
'created',
'modified',
'published'
)),
unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
listField: $ => token(choice('language', 'license', 'keyword')),
listOp: $ => $.choiceOp,
listVal: $ => createArray($.string),
dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal),
dateField: $ => token(choice(
'last_visit',
'last_eventful_visit',
'last_revision',
'last_release',
'created',
'modified',
'published'
)),
dateOp: $ => $.rangeOp,
dateVal: $ => $.isoDateTime,
limit: $ => seq('limit', $.equalOp, $.number),
rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')),
equalOp: $ => token('='),
choiceOp: $ => token(choice('in', 'not in')),
isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/,
string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord),
number: $ => /\d+/,
booleanTrue: $ => "true",
booleanFalse: $ => "false",
or: $ => "or",
and: $ => "and",
stringContent: $ => repeat1(choice(
token.immediate(/[^\\"\n]+/),
$.escape_sequence
)),
- singleWord: $ => /[^\s"'\[\]\(\)]+/,
+ singleWord: $ => /[^\s"'\[\]\(\),]+/,
escape_sequence: $ => token.immediate(seq(
'\\',
/(\"|\'|\\|\/|b|n|r|t|u)/
)),
}
});
/**
 * Build a rule matching `rule` followed by any number of `sep` tokens, each
 * of which may (but need not) be followed by another `rule`.
 *
 * Because the element after a separator is optional, trailing and repeated
 * separators are accepted as well.
 *
 * @param rule - grammar rule for one list element
 * @param sep - grammar rule (or literal) for the separator
 * @returns the combined sequence rule
 */
function joinBySep1(rule, sep) {
  const separatorThenMaybeRule = seq(sep, optional(rule));
  const tail = repeat(separatorThenMaybeRule);
  return seq(rule, tail);
}
/**
 * Build a rule matching zero or more occurrences of `rule` separated by `sep`.
 *
 * This is the `joinBySep1` rule made optional, so an empty list also matches.
 *
 * @param rule - grammar rule for one list element
 * @param sep - separator, `","` unless given
 * @returns the optional list rule
 */
function joinBySep(rule, sep = ",") {
  const nonEmptyList = joinBySep1(rule, sep);
  return optional(nonEmptyList);
}
/**
 * Build a rule for a bracketed, comma-separated array whose members match
 * `rule`. Each member is exposed under the `array_member` field.
 *
 * @param rule - grammar rule for one array member
 * @returns the sequence rule `[` members `]`
 */
function createArray(rule) {
  const member = field('array_member', rule);
  const members = joinBySep(member, ",");
  return seq("[", members, "]");
}
/**
 * Build a rule requiring `rule` to be enclosed by one of the given wrappers,
 * the same wrapper being used on both sides.
 *
 * @param rule - grammar rule to enclose
 * @param wrappers - candidate delimiters, single and double quote by default
 * @returns a choice between one wrapped sequence per wrapper
 */
function wrapWith(rule, wrappers = ["'", '"']) {
  const alternatives = [];
  for (const wrapper of wrappers) {
    alternatives.push(seq(wrapper, rule, wrapper));
  }
  return choice(...alternatives);
}
/**
 * Build a rule accepting `rule` either enclosed by one of the wrappers or bare.
 *
 * @param rule - grammar rule to accept
 * @param wrappers - candidate delimiters, single and double quote by default
 * @returns a choice between the wrapped form and the bare rule
 */
function optionalWrapWith(rule, wrappers = ["'", '"']) {
  const wrapped = wrapWith(rule, wrappers);
  return choice(wrapped, rule);
}
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
new file mode 100644
index 0000000..2c85d1c
--- /dev/null
+++ b/swh/search/tests/test_translator.py
@@ -0,0 +1,309 @@
+import pytest
+
+from swh.search.translator import Translator
+from swh.search.utils import get_expansion
+
+
def _test_results(query, expected):
    """Translate ``query`` with a fresh Translator and compare with ``expected``."""
    translated = Translator().parse_query(query)
    assert translated == expected
+
+
def test_empty_query():
    """An empty query string must be rejected by the translator."""
    # Call the translator directly: going through ``_test_results`` would let
    # the AssertionError of a failed comparison satisfy ``pytest.raises`` too,
    # so the test could never fail when the query is wrongly accepted.
    with pytest.raises(Exception):
        Translator().parse_query("")
+
+
def test_conjunction_operators():
    """Without brackets, the ``and`` pair is grouped under the ``or``."""
    was_visited = {"term": {"has_visits": True}}
    visits_between = {
        "bool": {
            "must": [
                {"range": {"nb_visits": {"gt": 2}}},
                {"range": {"nb_visits": {"lt": 5}}},
            ]
        }
    }

    _test_results(
        "visited = true or visits > 2 and visits < 5",
        {"filters": {"bool": {"should": [was_visited, visits_between]}}},
    )
+
+
def test_conjunction_op_precedence_override():
    """Brackets make the ``or`` pair one operand of the surrounding ``and``."""
    bracketed = {
        "bool": {
            "should": [
                {"term": {"has_visits": False}},
                {"range": {"nb_visits": {"gt": 2}}},
            ]
        }
    }
    upper_bound = {"range": {"nb_visits": {"lt": 5}}}

    _test_results(
        "(visited = false or visits > 2) and visits < 5",
        {"filters": {"bool": {"must": [bracketed, upper_bound]}}},
    )
+
+
def test_limit_and_sortby():
    """``sort_by`` and ``limit`` show up next to the filters in the output."""
    expected = dict(
        filters={"term": {"has_visits": True}},
        sortBy=["-visits", "last_visit"],
        limit=15,
    )

    _test_results("visited = true sort_by = [-visits,last_visit] limit = 15", expected)
+
+
def test_deeply_nested_filters():
    """Redundant nested brackets leave no trace in the translated filters."""
    conjunction = {
        "bool": {
            "must": [
                {"term": {"has_visits": True}},
                {"range": {"nb_visits": {"gt": 0}}},
            ]
        }
    }

    _test_results("(((visited = true and visits > 0)))", {"filters": conjunction})
+
+
def test_origin_and_metadata_filters():
    """``origin`` and ``metadata`` patterns each become a ``multi_match``."""
    origin_match = {
        "multi_match": {
            "query": "django",
            "type": "bool_prefix",
            "operator": "and",
            "fields": [
                "url.as_you_type",
                "url.as_you_type._2gram",
                "url.as_you_type._3gram",
            ],
        }
    }
    metadata_match = {
        "nested": {
            "path": "intrinsic_metadata",
            "query": {
                "multi_match": {
                    "query": "framework and web",
                    "type": "cross_fields",
                    "operator": "and",
                    "fields": ["intrinsic_metadata.*"],
                    "lenient": True,
                }
            },
        }
    }

    _test_results(
        'origin = django or metadata = "framework and web"',
        {"filters": {"bool": {"should": [origin_match, metadata_match]}}},
    )
+
+
def test_visits_not_equal_to_filter():
    """``!=`` on visits is a negated closed range around the value."""
    exactly_five = {"range": {"nb_visits": {"gte": 5, "lte": 5}}}

    _test_results(
        "visits != 5",
        {"filters": {"bool": {"must_not": [exactly_five]}}},
    )
+
+
def test_visit_type_filter():
    """Quoted and bare visit types end up in the same ``terms`` list."""
    _test_results(
        'visit_type = [git,"pypi"]',
        {"filters": {"terms": {"visit_types": ["git", "pypi"]}}},
    )
+
+
def test_keyword_filter():
    """Keywords are joined into one query string, escapes kept verbatim."""
    searched_fields = [
        get_expansion("keywords", ".") + "^2",
        get_expansion("descriptions", "."),
    ]
    keyword_match = {
        "multi_match": {
            "query": r"""word1 word2 \" ' word3""",
            "fields": searched_fields,
        }
    }

    _test_results(
        r"""keyword in [word1, "word2 \" ' word3"]""",
        {"filters": {"nested": {"path": "intrinsic_metadata", "query": keyword_match}}},
    )
+
+
def test_language_filter():
    """Each listed language becomes one ``match`` alternative."""
    alternatives = [
        {"match": {get_expansion("programming_languages", "."): language}}
        for language in ("python", "go lang", "cpp")
    ]

    _test_results(
        'language in [python, "go lang", cpp]',
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"should": alternatives}},
                }
            }
        },
    )
+
+
def test_license_filter():
    """Each listed license becomes one ``match`` alternative."""
    alternatives = [
        {"match": {get_expansion("licenses", "."): license_name}}
        for license_name in ("GPL 3", "Apache", "MIT")
    ]

    _test_results(
        'license in ["GPL 3", Apache, MIT]',
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"should": alternatives}},
                }
            }
        },
    )
+
+
def test_date_created_not_equal_to_filter():
    """``created != <date>`` is a negated closed range inside a nested query."""
    same_day = {
        "range": {
            get_expansion("date_created", "."): {
                "gte": "2020-01-01",
                "lte": "2020-01-01",
            }
        }
    }

    _test_results(
        "created != 2020-01-01",
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"must_not": [same_day]}},
                }
            }
        },
    )
+
+
def test_date_created_greater_than_filter():
    """``created >= <date>`` is a ``gte`` range inside a nested query."""
    from_day = {"range": {get_expansion("date_created", "."): {"gte": "2020-01-01"}}}

    _test_results(
        "created >= 2020-01-01",
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"must": [from_day]}},
                }
            }
        },
    )
+
+
def test_last_eventful_visit_not_equal_to_filter():
    """``last_visit != <date>`` is a negated closed range on ``last_visit_date``."""
    # NOTE(review): the test name says "last_eventful_visit" but the query
    # exercises ``last_visit`` -- confirm which field was meant.
    same_day = {
        "range": {
            "last_visit_date": {
                "gte": "2020-01-01",
                "lte": "2020-01-01",
            }
        }
    }

    _test_results(
        "last_visit != 2020-01-01",
        {"filters": {"bool": {"must_not": [same_day]}}},
    )
+
+
def test_last_eventful_visit_less_than_to_filter():
    """``last_visit < <date>`` is a plain ``lt`` range on ``last_visit_date``."""
    # NOTE(review): the test name says "last_eventful_visit" but the query
    # exercises ``last_visit`` -- confirm which field was meant.
    before_day = {"range": {"last_visit_date": {"lt": "2020-01-01"}}}

    _test_results("last_visit < 2020-01-01", {"filters": before_day})
diff --git a/swh/search/translator.py b/swh/search/translator.py
new file mode 100644
index 0000000..5c2b9d7
--- /dev/null
+++ b/swh/search/translator.py
@@ -0,0 +1,288 @@
+import os
+
+from pkg_resources import resource_filename
+from tree_sitter import Language, Parser
+
+from swh.search.utils import get_expansion
+
+
class Translator:
    """Turn swh-search query-language strings into nested query dictionaries.

    A query is parsed with the tree-sitter grammar named ``swh_search_ql`` and
    the resulting syntax tree is walked to build the output dictionary.

    NOTE(review): the produced shapes ("bool", "range", "multi_match",
    "nested") look like Elasticsearch query DSL -- confirm against the consumer.
    """

    # Query-language comparison operators and the range keyword each maps to.
    RANGE_OPERATOR_MAP = {
        ">": "gt",
        "<": "lt",
        ">=": "gte",
        "<=": "lte",
    }

    def __init__(self):
        """Locate the compiled grammar library and set up a parser for it.

        Raises:
            AssertionError: if ``swh_ql.so`` is in none of the expected paths.
        """
        ql_rel_paths = [
            "swh_ql.so",  # installed
            "../../query_language/swh_ql.so",  # development
        ]
        for ql_rel_path in ql_rel_paths:
            ql_path = resource_filename("swh.search", ql_rel_path)
            if os.path.exists(ql_path):
                break
        else:
            # Raised explicitly (not via ``assert``) so the check is still
            # performed when Python runs with assertions disabled (-O).
            raise AssertionError(
                "swh_ql.so was not found in any of the expected paths"
            )

        search_ql = Language(ql_path, "swh_search_ql")

        self.parser = Parser()
        self.parser.set_language(search_ql)
        self.query = ""

    def parse_query(self, query):
        """Parse ``query`` and return its translation.

        Args:
            query: the query string.

        Returns:
            A dict with one entry per child of the root node, keyed by the
            child's node type (e.g. "filters", "sortBy", "limit").

        Raises:
            Exception: if the parse tree contains an error.
        """
        self.query = query
        tree = self.parser.parse(query.encode("utf8"))
        self.query_node = tree.root_node

        if self.query_node.has_error:
            raise Exception("Invalid query")

        return self._traverse(self.query_node)

    def _traverse(self, node):
        """Recursively translate a ``query``/``filters``/``filter`` subtree.

        Raises:
            Exception: if the node has an unknown type or unexpected children.
        """
        if len(node.children) == 3 and node.children[1].type == "filters":
            # filters => ( filters ): go past the brackets
            return self._traverse(node.children[1])

        if node.type == "query":
            # query => filters sortBy limit
            return {child.type: self._traverse(child) for child in node.children}

        if node.type == "filters":
            if len(node.children) == 1:
                # filters => filters | filter: the node is only a wrapper,
                # so go one level deeper
                return self._traverse(node.children[0])

            if len(node.children) == 3:
                # filters => filters conj_op filters
                left = self._traverse(node.children[0])
                conj_op = self._get_value(node.children[1])
                right = self._traverse(node.children[2])

                if conj_op == "and":
                    return {"bool": {"must": [left, right]}}
                if conj_op == "or":
                    return {"bool": {"should": [left, right]}}

        if node.type == "filter":
            return self._parse_filter(node.children[0])

        if node.type in ("sortBy", "limit"):
            return self._parse_filter(node)

        # The exception must be raised, not returned: a returned exception
        # object would be embedded in the result as if it were a valid clause.
        raise Exception(
            f"Unknown node type ({node.type}) "
            f"or unexpected number of children ({node.children})"
        )

    def _get_value(self, node):
        """Return the Python value of ``node``.

        Bracketed arrays become lists of their named children's values,
        quoted text loses its surrounding quotes, ``number``/``numberVal``
        nodes become ints, and everything else is returned as its source text.
        """
        if (
            len(node.children) > 0
            and node.children[0].type == "["
            and node.children[-1].type == "]"
        ):
            # array: only named children are members (brackets, commas and
            # quotes are anonymous tokens)
            return [self._get_value(child) for child in node.children if child.is_named]

        # Slice the encoded query by byte offsets: the parser was fed UTF-8
        # bytes, and column-based offsets are wrong for multi-line queries and
        # for text containing non-ASCII characters.
        raw_query = self.query.encode("utf8")
        value = raw_query[node.start_byte : node.end_byte].decode("utf8")

        # Strip the quotes only when the same quote character opens and
        # closes the value.
        if len(value) > 1 and value[0] == value[-1] and value[0] in ("'", '"'):
            return value[1:-1]

        if node.type in ["number", "numberVal"]:
            return int(value)

        return value

    @staticmethod
    def _equality_clause(field_name, op, value):
        """Express ``field_name = value`` (or ``!=``) as a closed range."""
        occurrence = "must" if op == "=" else "must_not"
        return {
            "bool": {
                occurrence: [{"range": {field_name: {"gte": value, "lte": value}}}]
            }
        }

    def _parse_filter(self, filter):
        """Translate one (name, operator, value) node into its query clause.

        ``filter`` is a filter-category node, or a ``sortBy``/``limit`` node,
        for which the bare value is returned.

        Raises:
            Exception: if the category/name combination is not handled.
        """
        if filter.type == "boundedListFilter":
            filter = filter.children[0]

        children = filter.children
        assert len(children) == 3

        category = filter.type
        name, op, value = [self._get_value(child) for child in children]

        if category == "patternFilter":
            if name == "origin":
                return {
                    "multi_match": {
                        "query": value,
                        "type": "bool_prefix",
                        "operator": "and",
                        "fields": [
                            "url.as_you_type",
                            "url.as_you_type._2gram",
                            "url.as_you_type._3gram",
                        ],
                    }
                }
            elif name == "metadata":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": value,
                                "type": "cross_fields",
                                "operator": "and",
                                "fields": ["intrinsic_metadata.*"],
                                "lenient": True,
                            }
                        },
                    }
                }

        if category == "booleanFilter":
            if name == "visited":
                return {"term": {"has_visits": value == "true"}}

        if category == "numericFilter":
            if name == "visits":
                if op in ["=", "!="]:
                    return self._equality_clause("nb_visits", op, value)
                return {"range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}}}

        if category == "visitTypeFilter":
            if name == "visit_type":
                return {"terms": {"visit_types": value}}

        if category == "unboundedListFilter":
            value_array = value

            if name == "keyword":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": " ".join(value_array),
                                "fields": [
                                    get_expansion("keywords", ".") + "^2",
                                    get_expansion("descriptions", "."),
                                ],
                            }
                        },
                    }
                }
            elif name in ["language", "license"]:
                name_mapping = {
                    "language": "programming_languages",
                    "license": "licenses",
                }
                name = name_mapping[name]

                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                "should": [
                                    {"match": {get_expansion(name, "."): val}}
                                    for val in value_array
                                ],
                            }
                        },
                    }
                }

        if category == "dateFilter":
            if name in ["created", "modified", "published"]:
                # These dates are looked up under "intrinsic_metadata", hence
                # the nested query.
                field_name = get_expansion(f"date_{name}", ".")
                if op in ["=", "!="]:
                    date_query = self._equality_clause(field_name, op, value)
                else:
                    date_query = {
                        "bool": {
                            "must": [
                                {
                                    "range": {
                                        field_name: {
                                            self.RANGE_OPERATOR_MAP[op]: value,
                                        }
                                    }
                                }
                            ],
                        }
                    }
                return {
                    "nested": {"path": "intrinsic_metadata", "query": date_query}
                }

            field_name = f"{name}_date"
            if op in ["=", "!="]:
                return self._equality_clause(field_name, op, value)
            # NOTE(review): "Z" is rewritten to "+00:00" only in this branch,
            # not in the equality branches above -- confirm whether intended.
            return {
                "range": {
                    field_name: {
                        self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"),
                    }
                }
            }

        if category == "sortBy":
            return value

        if category == "limit":
            return value

        raise Exception(f"Unknown filter {category}.{name}")
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:25 PM (6 d, 7 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3246703
Attached To
rDSEA Archive search
Event Timeline
Log In to Comment