Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/query_language/grammar.js b/query_language/grammar.js
index aa94745..1d397c6 100644
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -1,200 +1,200 @@
// Copyright (C) 2019-2021 The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information
const PRECEDENCE = {
or: 2,
and: 3,
bracket: 4,
}
module.exports = grammar({
name: 'swh_search_ql',
rules: {
query: $ => seq($.filters, optional($.sortBy) ,optional($.limit)),
filters: $ => choice(
prec.left(PRECEDENCE.and,
seq(
field('left', $.filters),
field('operator', $.and),
field('right', $.filters),
)
),
prec.left(PRECEDENCE.or,
seq(
field('left', $.filters),
field('operator', $.or),
field('right', $.filters),
)
),
prec.left(PRECEDENCE.bracket,
seq("(", $.filters, ")"),
),
$.filter
),
sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
sortByField: $ => token('sort_by'),
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
sortByOptions: $ => seq(optional(token.immediate('-')) ,choice(
'visits',
'last_visit',
'last_eventful_visit',
'last_revision',
'last_release',
'created',
'modified',
'published'
)),
limit: $ => seq('limit', $.equalOp, $.number),
filter: $ => choice(
$.patternFilter,
$.booleanFilter,
$.numericFilter,
$.boundedListFilter,
$.unboundedListFilter,
$.dateFilter
),
patternFilter: $ => seq($.patternField, $.patternOp, $.patternVal),
patternField: $ => token(choice('origin', 'metadata')),
patternOp: $ => $.equalOp,
patternVal: $ => $.string,
booleanFilter: $ => seq($.booleanField, $.booleanOp, $.booleanVal),
booleanField: $ => token(choice('visited')),
booleanOp: $ => $.equalOp,
booleanVal: $ => choice($.booleanTrue, $.booleanFalse),
numericFilter: $ => seq($.numericField, $.numericOp, $.numberVal),
numericField: $ => token(choice('visits')),
numericOp: $ => $.rangeOp,
numberVal: $ => $.number,
boundedListFilter: $ => choice($.visitTypeFilter),
visitTypeFilter: $ => seq($.visitTypeField, $.visitTypeOp, $.visitTypeVal),
visitTypeField: $ => token(choice('visit_type')),
visitTypeOp: $ => $.equalOp,
visitTypeVal: $ => createArray(optionalWrapWith($.visitTypeOptions, ["'", '"'])),
visitTypeOptions: $ => choice(
"any",
"cran",
"deb",
"deposit",
"ftp",
"hg",
"git",
"nixguix",
"npm",
"pypi",
"svn",
"tar"
), // TODO: fetch this list dynamically from other swh services?
sortBy: $ => seq($.sortByField, $.sortByOp, $.sortByVal),
sortByField: $ => token(choice('sort_by')),
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
sortByOptions: $ => seq(
optional(token.immediate('-')),
choice(
'visits',
'last_visit',
'last_eventful_visit',
'last_revision',
'last_release',
'created',
'modified',
'published'
)),
unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
listField: $ => token(choice('language', 'license', 'keyword')),
listOp: $ => $.choiceOp,
listVal: $ => createArray($.string),
dateFilter: $ => seq($.dateField, $.dateOp, $.dateVal),
dateField: $ => token(choice(
'last_visit',
'last_eventful_visit',
'last_revision',
'last_release',
'created',
'modified',
'published'
)),
dateOp: $ => $.rangeOp,
dateVal: $ => $.isoDateTime,
limit: $ => seq('limit', $.equalOp, $.number),
rangeOp: $ => token(choice('<', '<=', '=', '!=', '>=', '>')),
equalOp: $ => token('='),
choiceOp: $ => token(choice('in', 'not in')),
isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/,
string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord),
number: $ => /\d+/,
booleanTrue: $ => "true",
booleanFalse: $ => "false",
or: $ => "or",
and: $ => "and",
stringContent: $ => repeat1(choice(
token.immediate(/[^\\"\n]+/),
$.escape_sequence
)),
- singleWord: $ => /[^\s"'\[\]\(\)]+/,
+ singleWord: $ => /[^\s"'\[\]\(\),]+/,
escape_sequence: $ => token.immediate(seq(
'\\',
/(\"|\'|\\|\/|b|n|r|t|u)/
)),
}
});
function joinBySep1(rule, sep) {
    // One `rule`, then any number of `sep` each optionally followed by
    // another `rule` (so a separator may appear without a following member).
    const separatedTail = seq(sep, optional(rule));
    return seq(rule, repeat(separatedTail));
}
function joinBySep(rule, sep = ",") {
    // Zero or more `rule`s separated by `sep`: the non-empty list built by
    // `joinBySep1`, made optional.
    const nonEmptyList = joinBySep1(rule, sep);
    return optional(nonEmptyList);
}
function createArray(rule) {
    // A bracketed, comma-separated list; every member is exposed under the
    // `array_member` field.
    const member = field('array_member', rule);
    const members = joinBySep(member, ",");
    return seq("[", members, "]");
}
function wrapWith(rule, wrappers = ["'", '"']) {
    // `rule` enclosed by the same wrapper on both sides; one alternative is
    // produced per entry of `wrappers`.
    const alternatives = [];
    for (const wrapper of wrappers) {
        alternatives.push(seq(wrapper, rule, wrapper));
    }
    return choice(...alternatives);
}
function optionalWrapWith(rule, wrappers = ["'", '"']) {
    // Either the wrapped form of `rule` (tried first) or the bare `rule`.
    const wrapped = wrapWith(rule, wrappers);
    return choice(wrapped, rule);
}
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
new file mode 100644
index 0000000..2c85d1c
--- /dev/null
+++ b/swh/search/tests/test_translator.py
@@ -0,0 +1,309 @@
+import pytest
+
+from swh.search.translator import Translator
+from swh.search.utils import get_expansion
+
+
def _test_results(query, expected):
    """Parse `query` with a fresh Translator and assert it equals `expected`."""
    assert Translator().parse_query(query) == expected
+
+
def test_empty_query():
    """Translating an empty query string must raise."""
    with pytest.raises(Exception):
        _test_results("", {})
+
+
def test_conjunction_operators():
    """Without brackets, `and` groups before `or`."""
    visited = {"term": {"has_visits": True}}
    visits_between = {
        "bool": {
            "must": [
                {"range": {"nb_visits": {"gt": 2}}},
                {"range": {"nb_visits": {"lt": 5}}},
            ]
        }
    }

    _test_results(
        "visited = true or visits > 2 and visits < 5",
        {"filters": {"bool": {"should": [visited, visits_between]}}},
    )
+
+
def test_conjunction_op_precedence_override():
    """Brackets make the `or` group before the trailing `and`."""
    bracketed_or = {
        "bool": {
            "should": [
                {"term": {"has_visits": False}},
                {"range": {"nb_visits": {"gt": 2}}},
            ]
        }
    }
    upper_bound = {"range": {"nb_visits": {"lt": 5}}}

    _test_results(
        "(visited = false or visits > 2) and visits < 5",
        {"filters": {"bool": {"must": [bracketed_or, upper_bound]}}},
    )
+
+
def test_limit_and_sortby():
    """`sort_by` and `limit` are reported next to the filters."""
    _test_results(
        "visited = true sort_by = [-visits,last_visit] limit = 15",
        {
            "filters": {"term": {"has_visits": True}},
            "sortBy": ["-visits", "last_visit"],
            "limit": 15,
        },
    )
+
+
def test_deeply_nested_filters():
    """Redundant nested brackets do not change the translation."""
    both_conditions = [
        {"term": {"has_visits": True}},
        {"range": {"nb_visits": {"gt": 0}}},
    ]

    _test_results(
        "(((visited = true and visits > 0)))",
        {"filters": {"bool": {"must": both_conditions}}},
    )
+
+
def test_origin_and_metadata_filters():
    """`origin` and `metadata` pattern filters combined with `or`."""
    origin_clause = {
        "multi_match": {
            "query": "django",
            "type": "bool_prefix",
            "operator": "and",
            "fields": [
                "url.as_you_type",
                "url.as_you_type._2gram",
                "url.as_you_type._3gram",
            ],
        }
    }
    metadata_clause = {
        "nested": {
            "path": "intrinsic_metadata",
            "query": {
                "multi_match": {
                    "query": "framework and web",
                    "type": "cross_fields",
                    "operator": "and",
                    "fields": ["intrinsic_metadata.*"],
                    "lenient": True,
                }
            },
        }
    }

    _test_results(
        'origin = django or metadata = "framework and web"',
        {"filters": {"bool": {"should": [origin_clause, metadata_clause]}}},
    )
+
+
def test_visits_not_equal_to_filter():
    """`!=` on `visits` becomes a `must_not` around a closed range."""
    exactly_five = {"range": {"nb_visits": {"gte": 5, "lte": 5}}}

    _test_results(
        "visits != 5",
        {"filters": {"bool": {"must_not": [exactly_five]}}},
    )
+
+
def test_visit_type_filter():
    """`visit_type` accepts bare and quoted members in the same array."""
    _test_results(
        'visit_type = [git,"pypi"]',
        {"filters": {"terms": {"visit_types": ["git", "pypi"]}}},
    )
+
+
def test_keyword_filter():
    """Keyword members are joined with spaces; escape sequences are kept as typed."""
    keyword_match = {
        "multi_match": {
            "query": r"""word1 word2 \" ' word3""",
            "fields": [
                get_expansion("keywords", ".") + "^2",
                get_expansion("descriptions", "."),
            ],
        }
    }

    _test_results(
        r"""keyword in [word1, "word2 \" ' word3"]""",
        {
            "filters": {
                "nested": {"path": "intrinsic_metadata", "query": keyword_match}
            }
        },
    )
+
+
def test_language_filter():
    """Each listed language becomes one `match` alternative."""
    alternatives = [
        {"match": {get_expansion("programming_languages", "."): language}}
        for language in ["python", "go lang", "cpp"]
    ]

    _test_results(
        'language in [python, "go lang", cpp]',
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"should": alternatives}},
                }
            }
        },
    )
+
+
def test_license_filter():
    """Each listed license becomes one `match` alternative."""
    alternatives = [
        {"match": {get_expansion("licenses", "."): license_name}}
        for license_name in ["GPL 3", "Apache", "MIT"]
    ]

    _test_results(
        'license in ["GPL 3", Apache, MIT]',
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"should": alternatives}},
                }
            }
        },
    )
+
+
def test_date_created_not_equal_to_filter():
    """`created != <date>` becomes a nested `must_not` around a closed range."""
    same_day = {
        "range": {
            get_expansion("date_created", "."): {
                "gte": "2020-01-01",
                "lte": "2020-01-01",
            }
        }
    }

    _test_results(
        "created != 2020-01-01",
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"must_not": [same_day]}},
                }
            }
        },
    )
+
+
def test_date_created_greater_than_filter():
    """`created >= <date>` becomes a nested `must` around a `gte` range."""
    from_day = {
        "range": {get_expansion("date_created", "."): {"gte": "2020-01-01"}}
    }

    _test_results(
        "created >= 2020-01-01",
        {
            "filters": {
                "nested": {
                    "path": "intrinsic_metadata",
                    "query": {"bool": {"must": [from_day]}},
                }
            }
        },
    )
+
+
def test_last_eventful_visit_not_equal_to_filter():
    """`last_visit != <date>` becomes a `must_not` around a closed range.

    NOTE(review): the test name mentions `last_eventful_visit` but the query
    exercises `last_visit` -- confirm which field was intended.
    """
    same_day = {
        "range": {
            "last_visit_date": {"gte": "2020-01-01", "lte": "2020-01-01"}
        }
    }

    _test_results(
        "last_visit != 2020-01-01",
        {"filters": {"bool": {"must_not": [same_day]}}},
    )
+
+
def test_last_eventful_visit_less_than_to_filter():
    """`last_visit < <date>` becomes a plain `lt` range.

    NOTE(review): the test name mentions `last_eventful_visit` but the query
    exercises `last_visit` -- confirm which field was intended.
    """
    _test_results(
        "last_visit < 2020-01-01",
        {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}},
    )
diff --git a/swh/search/translator.py b/swh/search/translator.py
new file mode 100644
index 0000000..5c2b9d7
--- /dev/null
+++ b/swh/search/translator.py
@@ -0,0 +1,288 @@
+import os
+
+from pkg_resources import resource_filename
+from tree_sitter import Language, Parser
+
+from swh.search.utils import get_expansion
+
+
class Translator:
    """Translate a search query-language string into a nested dict of clauses.

    The query is parsed with the tree-sitter grammar ``swh_search_ql`` loaded
    from ``swh_ql.so``; the syntax tree is then walked to build the result.
    The clause shapes (``bool``/``must``/``should``, ``multi_match``,
    ``nested``, ...) look like Elasticsearch query DSL -- NOTE(review):
    confirm against the consumer of :meth:`parse_query`.
    """

    # Comparison operators of the query language -> keys of a range clause.
    # "=" and "!=" are absent on purpose: they are translated to a closed
    # range wrapped in "must" / "must_not".
    RANGE_OPERATOR_MAP = {
        ">": "gt",
        "<": "lt",
        ">=": "gte",
        "<=": "lte",
    }

    def __init__(self):
        """Locate ``swh_ql.so`` and set up a parser for the query language.

        Raises:
            FileNotFoundError: if the library is in none of the known places.
        """
        ql_rel_paths = [
            "swh_ql.so",  # installed
            "../../query_language/swh_ql.so",  # development
        ]
        for ql_rel_path in ql_rel_paths:
            ql_path = resource_filename("swh.search", ql_rel_path)
            if os.path.exists(ql_path):
                break
        else:
            # An explicit raise (not `assert False`) so the check survives
            # `python -O`, which strips assert statements.
            raise FileNotFoundError(
                "swh_ql.so was not found in any of the expected paths"
            )

        search_ql = Language(ql_path, "swh_search_ql")

        self.parser = Parser()
        self.parser.set_language(search_ql)
        self.query = ""

    def parse_query(self, query):
        """Parse `query` and return its translation.

        Returns:
            A dict keyed by the type of each child of the root ``query`` node
            (``filters`` and, when present, ``sortBy`` and ``limit``).

        Raises:
            Exception: if the syntax tree contains an error ("Invalid query")
                or a node cannot be translated.
        """
        self.query = query
        tree = self.parser.parse(query.encode("utf8"))
        self.query_node = tree.root_node

        if self.query_node.has_error:
            raise Exception("Invalid query")

        return self._traverse(self.query_node)

    def _traverse(self, node):
        """Recursively translate `node` (query / filters / filter / sortBy / limit).

        Raises:
            Exception: if the node type or its number of children is not one
                of the handled shapes.
        """
        if len(node.children) == 3 and node.children[1].type == "filters":
            # filters => ( filters )
            return self._traverse(node.children[1])  # Go past the () brackets

        if node.type == "query":
            # query => filters sort_by limit
            return {child.type: self._traverse(child) for child in node.children}

        if node.type == "filters":
            if len(node.children) == 1:
                # query => filters
                # filters => filters
                # filters => filter
                # Current node is just a wrapper, so go one level deep
                return self._traverse(node.children[0])

            if len(node.children) == 3:
                # filters => filters conj_op filters
                filters1 = self._traverse(node.children[0])
                conj_op = self._get_value(node.children[1])
                filters2 = self._traverse(node.children[2])

                if conj_op == "and":
                    return {"bool": {"must": [filters1, filters2]}}
                if conj_op == "or":
                    return {"bool": {"should": [filters1, filters2]}}

        if node.type == "filter":
            filter_category = node.children[0]
            return self._parse_filter(filter_category)

        if node.type in ("sortBy", "limit"):
            return self._parse_filter(node)

        # Raise (rather than return) so that an untranslatable node can never
        # leak into the result as an exception object.
        raise Exception(
            f"Unknown node type ({node.type}) "
            f"or unexpected number of children ({node.children})"
        )

    def _get_value(self, node):
        """Return the value spelled by `node` in the current query.

        * a node delimited by ``[`` and ``]`` yields the list of values of its
          named children;
        * text enclosed in a matching pair of single or double quotes is
          returned without the quotes (escape sequences are left untouched);
        * ``number`` / ``numberVal`` nodes yield an ``int``;
        * anything else yields the raw text.
        """
        children = node.children
        if children and children[0].type == "[" and children[-1].type == "]":
            # array
            return [self._get_value(child) for child in children if child.is_named]

        # The parser was fed UTF-8 bytes, so node offsets are byte offsets.
        # Slice the encoded query (not the str) to stay correct for non-ASCII
        # text and for queries spanning several lines.
        raw_query = self.query.encode("utf8")
        value = raw_query[node.start_byte : node.end_byte].decode("utf8")

        if len(value) > 1 and value[0] == value[-1] and value[0] in ("'", '"'):
            return value[1:-1]

        if node.type in ["number", "numberVal"]:
            return int(value)

        return value

    @staticmethod
    def _nested_metadata(query):
        """Wrap `query` in a ``nested`` clause on the ``intrinsic_metadata`` path."""
        return {"nested": {"path": "intrinsic_metadata", "query": query}}

    def _parse_filter(self, filter):
        """Translate one ``<name> <op> <value>`` node into a clause.

        `filter` must have exactly three children. Besides the filter
        categories, ``sortBy`` and ``limit`` nodes are accepted and yield
        their bare value.

        Raises:
            Exception: if the category / field name pair is not handled.
        """
        if filter.type == "boundedListFilter":
            # boundedListFilter only wraps the concrete filter (visitTypeFilter)
            filter = filter.children[0]

        children = filter.children
        assert len(children) == 3

        category = filter.type
        name, op, value = [self._get_value(child) for child in children]

        if category == "patternFilter":
            if name == "origin":
                return {
                    "multi_match": {
                        "query": value,
                        "type": "bool_prefix",
                        "operator": "and",
                        "fields": [
                            "url.as_you_type",
                            "url.as_you_type._2gram",
                            "url.as_you_type._3gram",
                        ],
                    }
                }
            elif name == "metadata":
                return self._nested_metadata(
                    {
                        "multi_match": {
                            "query": value,
                            "type": "cross_fields",
                            "operator": "and",
                            "fields": ["intrinsic_metadata.*"],
                            "lenient": True,
                        }
                    }
                )

        if category == "booleanFilter":
            if name == "visited":
                return {"term": {"has_visits": value == "true"}}

        if category == "numericFilter":
            if name == "visits":
                if op in ["=", "!="]:
                    # Equality is expressed as the closed range [value, value]
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {"range": {"nb_visits": {"gte": value, "lte": value}}}
                            ]
                        }
                    }
                return {"range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}}}

        if category == "visitTypeFilter":
            if name == "visit_type":
                return {"terms": {"visit_types": value}}

        if category == "unboundedListFilter":
            value_array = value

            if name == "keyword":
                return self._nested_metadata(
                    {
                        "multi_match": {
                            "query": " ".join(value_array),
                            "fields": [
                                get_expansion("keywords", ".") + "^2",
                                get_expansion("descriptions", "."),
                            ],
                        }
                    }
                )
            elif name in ["language", "license"]:
                name_mapping = {
                    "language": "programming_languages",
                    "license": "licenses",
                }
                name = name_mapping[name]

                return self._nested_metadata(
                    {
                        "bool": {
                            "should": [
                                {"match": {get_expansion(name, "."): val}}
                                for val in value_array
                            ],
                        }
                    }
                )

        if category == "dateFilter":
            if name in ["created", "modified", "published"]:
                # These dates are looked up inside the nested metadata document
                date_field = get_expansion(f"date_{name}", ".")

                if op in ["=", "!="]:
                    return self._nested_metadata(
                        {
                            "bool": {
                                ("must" if op == "=" else "must_not"): [
                                    {"range": {date_field: {"gte": value, "lte": value}}}
                                ],
                            }
                        }
                    )

                return self._nested_metadata(
                    {
                        "bool": {
                            "must": [
                                {
                                    "range": {
                                        date_field: {self.RANGE_OPERATOR_MAP[op]: value}
                                    }
                                }
                            ],
                        }
                    }
                )
            else:
                if op in ["=", "!="]:
                    # NOTE(review): unlike the range case below, "Z" is not
                    # rewritten to "+00:00" here -- confirm this is intended.
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {"range": {f"{name}_date": {"gte": value, "lte": value}}}
                            ],
                        }
                    }
                return {
                    "range": {
                        f"{name}_date": {
                            self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"),
                        }
                    }
                }

        if category == "sortBy":
            return value

        if category == "limit":
            return value

        raise Exception(f"Unknown filter {category}.{name}")

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:25 PM (6 d, 7 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3246703

Event Timeline