diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
--- a/swh/search/tests/test_translator.py
+++ b/swh/search/tests/test_translator.py
@@ -330,7 +330,7 @@
     _test_results(query, expected)
 
 
-def test_escaped_punctutation_parsing():
+def test_escaped_punctuation_parsing():
     query = r"""keyword in ["foo \'\" bar"]"""
     expected = {
         "filters": {
@@ -349,3 +349,52 @@
         }
     }
     _test_results(query, expected)
+
+
+def test_nonascii():
+    query = r"""keyword in ["café"]"""
+    expected = {
+        "filters": {
+            "nested": {
+                "path": "intrinsic_metadata",
+                "query": {
+                    "multi_match": {
+                        "query": r"""café""",
+                        "fields": [
+                            get_expansion("keywords", ".") + "^2",
+                            get_expansion("descriptions", "."),
+                        ],
+                    }
+                },
+            }
+        }
+    }
+    _test_results(query, expected)
+
+
+def test_nonascii_before_operator():
+    query = r"""keyword in ["🐍"] and visited = true"""
+    expected = {
+        "filters": {
+            "bool": {
+                "must": [
+                    {
+                        "nested": {
+                            "path": "intrinsic_metadata",
+                            "query": {
+                                "multi_match": {
+                                    "query": r"""🐍""",
+                                    "fields": [
+                                        get_expansion("keywords", ".") + "^2",
+                                        get_expansion("descriptions", "."),
+                                    ],
+                                }
+                            },
+                        },
+                    },
+                    {"term": {"has_visits": True,},},
+                ],
+            }
+        }
+    }
+    _test_results(query, expected)
diff --git a/swh/search/translator.py b/swh/search/translator.py
--- a/swh/search/translator.py
+++ b/swh/search/translator.py
@@ -40,8 +40,8 @@
         self.query = ""
 
     def parse_query(self, query):
-        self.query = query
-        tree = self.parser.parse(query.encode("utf8"))
+        self.query = query.encode()
+        tree = self.parser.parse(self.query)
         self.query_node = tree.root_node
 
         if self.query_node.has_error:
@@ -109,7 +109,7 @@
             start = node.start_point[1]
             end = node.end_point[1]
 
-            value = self.query[start:end]
+            value = self.query[start:end].decode()
             if len(value) > 1 and (
                 (value[0] == "'" and value[-1] == "'")
                 or (value[0] and value[-1] == '"')