D6046.id21958.diff
diff --git a/query_language/grammar.js b/query_language/grammar.js
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -14,7 +14,16 @@
name: 'swh_search_ql',
rules: {
- query: $ => seq($.filters, optional($.sortBy) ,optional($.limit)),
+ query: $ => seq(
+ $.filters,
+ optional($.and),
+ choice(
+ seq(optional($.sortBy), optional($.and), optional($.limit)),
+ seq(optional($.limit), optional($.and), optional($.sortBy)),
+ ),
+ ),
+
+
filters: $ => choice(
prec.left(PRECEDENCE.and,
@@ -41,7 +50,7 @@
sortByField: $ => token('sort_by'),
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
- sortByOptions: $ => seq(optional(token.immediate('-')) ,choice(
+ sortByOptions: $ => seq(optional(token.immediate('-')), choice(
'visits',
'last_visit',
'last_eventful_visit',
@@ -104,17 +113,18 @@
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
sortByOptions: $ => seq(
- optional(token.immediate('-')),
+ optional('-'),
choice(
- 'visits',
- 'last_visit',
- 'last_eventful_visit',
- 'last_revision',
- 'last_release',
- 'created',
- 'modified',
- 'published'
- )),
+ 'visits',
+ 'last_visit',
+ 'last_eventful_visit',
+ 'last_revision',
+ 'last_release',
+ 'created',
+ 'modified',
+ 'published'
+ )
+ ),
unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
listField: $ => token(choice('language', 'license', 'keyword')),
@@ -142,7 +152,13 @@
equalOp: $ => token('='),
choiceOp: $ => token(choice('in', 'not in')),
- isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/,
+ isoDateTime: $ => {
+ const dateRegex = (/\d{4}[-]\d{2}[-]\d{2}/).source
+ const dateTimeSepRegex = (/(\s|T)*/).source
+ const timeRegex = (/(\d{2}:\d{2}(:\d{2}(\.\d{6})?)?)?/).source
+ const timezoneRegex = (/(\+\d{2}:\d{2}|Z)?/).source
+ return new RegExp(dateRegex + dateTimeSepRegex + timeRegex + timezoneRegex)
+ },
string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord),
number: $ => /\d+/,
@@ -153,7 +169,7 @@
and: $ => "and",
stringContent: $ => repeat1(choice(
- token.immediate(/[^\\"\n]+/),
+ token.immediate(/[^\\'"\n]+/),
$.escape_sequence
)),
singleWord: $ => /[^\s"'\[\]\(\),]+/,
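The grammar changes above make the sort_by and limit clauses order-independent (optionally separated by "and") and widen isoDateTime to accept microseconds and numeric UTC offsets. A minimal sketch of exercising this through the translator, assuming swh-search is installed and the tree-sitter grammar has been built:

    from swh.search.translator import Translator

    translator = Translator()
    # Both clause orders now parse, with or without an explicit "and":
    translator.parse_query("visits >= 5 sort_by = [-visits] limit = 10")
    translator.parse_query("visits >= 5 and limit = 10 and sort_by = [-visits]")
    # The widened isoDateTime also accepts microseconds and an offset:
    translator.parse_query("last_visit >= 2021-01-01T12:30:45.123456+05:30")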
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -22,7 +22,8 @@
PagedResult,
)
from swh.search.metrics import send_metric, timed
-from swh.search.utils import get_expansion, is_date_parsable
+from swh.search.translator import Translator
+from swh.search.utils import escape, get_expansion, is_date_parsable
logger = logging.getLogger(__name__)
@@ -99,6 +100,7 @@
class ElasticSearch:
def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}):
self._backend = Elasticsearch(hosts=hosts)
+ self._translator = Translator()
# Merge current configuration with default values
origin_config = indexes.get("origin", {})
@@ -348,6 +350,7 @@
def origin_search(
self,
*,
+ query: str = "",
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
@@ -369,185 +372,89 @@
) -> PagedResult[MinimalOriginDict]:
query_clauses: List[Dict[str, Any]] = []
+ query_filters = []
if url_pattern:
- query_clauses.append(
- {
- "multi_match": {
- "query": url_pattern,
- "type": "bool_prefix",
- "operator": "and",
- "fields": [
- "url.as_you_type",
- "url.as_you_type._2gram",
- "url.as_you_type._3gram",
- ],
- }
- }
- )
+ query_filters.append(f"origin = {escape(url_pattern)}")
if metadata_pattern:
- query_clauses.append(
- {
- "nested": {
- "path": "intrinsic_metadata",
- "query": {
- "multi_match": {
- "query": metadata_pattern,
- # Makes it so that the "foo bar" query returns
- # documents which contain "foo" in a field and "bar"
- # in a different field
- "type": "cross_fields",
- # All keywords must be found in a document for it to
- # be considered a match.
- # TODO: allow missing keywords?
- "operator": "and",
- # Searches on all fields of the intrinsic_metadata dict,
- # recursively.
- "fields": ["intrinsic_metadata.*"],
- # date{Created,Modified,Published} are of type date
- "lenient": True,
- }
- },
- }
- }
- )
+ query_filters.append(f"metadata = {escape(metadata_pattern)}")
- if not query_clauses:
- raise ValueError(
- "At least one of url_pattern and metadata_pattern must be provided."
- )
+ # if not query_clauses:
+ # raise ValueError(
+ # "At least one of url_pattern and metadata_pattern must be provided."
+ # )
if with_visit:
- query_clauses.append({"term": {"has_visits": True,}})
+ query_filters.append(f"visited = {'true' if with_visit else 'false'}")
if min_nb_visits:
- query_clauses.append({"range": {"nb_visits": {"gte": min_nb_visits,},}})
+ query_filters.append(f"visits >= {min_nb_visits}")
if min_last_visit_date:
- query_clauses.append(
- {
- "range": {
- "last_visit_date": {
- "gte": min_last_visit_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ f"last_visit >= {min_last_visit_date.replace('Z', '+00:00')}"
)
if min_last_eventful_visit_date:
- query_clauses.append(
- {
- "range": {
- "last_eventful_visit_date": {
- "gte": min_last_eventful_visit_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ "last_eventful_visit >= "
+ f"{min_last_eventful_visit_date.replace('Z', '+00:00')}"
)
if min_last_revision_date:
- query_clauses.append(
- {
- "range": {
- "last_revision_date": {
- "gte": min_last_revision_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ f"last_revision >= {min_last_revision_date.replace('Z', '+00:00')}"
)
if min_last_release_date:
- query_clauses.append(
- {
- "range": {
- "last_release_date": {
- "gte": min_last_release_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ f"last_release >= {min_last_release_date.replace('Z', '+00:00')}"
)
if keywords:
- query_clauses.append(
- {
- "nested": {
- "path": "intrinsic_metadata",
- "query": {
- "multi_match": {
- "query": " ".join(keywords),
- "fields": [
- get_expansion("keywords", ".") + "^2",
- get_expansion("descriptions", "."),
- # "^2" boosts an origin's score by 2x
- # if it the queried keywords are
- # found in its intrinsic_metadata.keywords
- ],
- }
- },
- }
- }
- )
-
- intrinsic_metadata_filters: List[Dict[str, Dict]] = []
-
+ query_filters.append(f"keyword in {escape(keywords)}")
if licenses:
- license_filters: List[Dict[str, Any]] = []
- for license in licenses:
- license_filters.append(
- {"match": {get_expansion("licenses", "."): license}}
- )
- intrinsic_metadata_filters.append({"bool": {"should": license_filters}})
+ query_filters.append(f"license in {escape(licenses)}")
if programming_languages:
- language_filters: List[Dict[str, Any]] = []
- for language in programming_languages:
- language_filters.append(
- {"match": {get_expansion("programming_languages", "."): language}}
- )
- intrinsic_metadata_filters.append({"bool": {"should": language_filters}})
+ query_filters.append(f"language in {escape(programming_languages)}")
if min_date_created:
- intrinsic_metadata_filters.append(
- {
- "range": {
- get_expansion("date_created", "."): {"gte": min_date_created,}
- }
- }
+ query_filters.append(
+ f"created >= {min_date_created.replace('Z', '+00:00')}"
)
if min_date_modified:
- intrinsic_metadata_filters.append(
- {
- "range": {
- get_expansion("date_modified", "."): {"gte": min_date_modified,}
- }
- }
+ query_filters.append(
+ f"modified >= {min_date_modified.replace('Z', '+00:00')}"
)
if min_date_published:
- intrinsic_metadata_filters.append(
- {
- "range": {
- get_expansion("date_published", "."): {
- "gte": min_date_published,
- }
- }
- }
- )
-
- if intrinsic_metadata_filters:
- query_clauses.append(
- {
- "nested": {
- "path": "intrinsic_metadata",
- "query": {"bool": {"must": intrinsic_metadata_filters,}},
- # "must" is equivalent to "AND"
- # "should" is equivalent to "OR"
- # Resulting origins must return true for the following:
- # (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..)
- # This is equivalent to {"must": [
- # {"should": [license_1,license_2] },
- # {"should": [lang_1,lang_2]}] }
- # ]}
- # Note: Usage of "bool" has been omitted for readability
- }
- }
+ query_filters.append(
+ f"published >= {min_date_published.replace('Z', '+00:00')}"
)
if visit_types is not None:
- query_clauses.append({"terms": {"visit_types": visit_types}})
+ query_filters.append(f"visit_type = {escape(visit_types)}")
+
+ combined_filters = f"({' and '.join(query_filters)})"
+ query = f"{combined_filters}{' and ' if query != '' else ' '}{query}"
+ parsed_query = self._translator.parse_query(query)
+ query_clauses.append(parsed_query["filters"])
+
+ field_map = {
+ "visits": "nb_visits",
+ "last_visit": "last_visit_date",
+ "last_eventful_visit": "last_eventful_visit_date",
+ "last_revision": "last_revision_date",
+ "last_release": "last_release_date",
+ "created": "date_created",
+ "modified": "date_modified",
+ "published": "date_published",
+ }
+
+ if "sortBy" in parsed_query:
+ if sort_by is None:
+ sort_by = []
+ for sort_by_option in parsed_query["sortBy"]:
+ if sort_by_option[0] == "-":
+ sort_by.append("-" + field_map[sort_by_option[1:]])
+ else:
+ sort_by.append(field_map[sort_by_option])
+ if parsed_query.get("limit", 0):
+ limit = parsed_query["limit"]
sorting_params: List[Dict[str, Any]] = []
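Instead of building an Elasticsearch clause per keyword argument, origin_search now renders each argument as a query-language filter and parses the combined string with the translator. A rough illustration of the assembly, with names taken from the diff and illustrative values:

    from swh.search.utils import escape

    query_filters = [f"origin = {escape('foobar')}", "visits >= 5"]
    combined_filters = f"({' and '.join(query_filters)})"
    # combined_filters == '(origin = "foobar" and visits >= 5)'
    # A caller-supplied query string is then appended with " and ", e.g.
    # '(origin = "foobar" and visits >= 5) and sort_by = [-visits]',
    # and the result is handed to Translator.parse_query().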
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -268,6 +268,7 @@
def origin_search(
self,
*,
+ query: str = "",
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -65,6 +65,7 @@
def origin_search(
self,
*,
+ query: str = "",
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
@@ -87,11 +88,12 @@
"""Searches for origins matching the `url_pattern`.
Args:
+ query: Filter string written in the swh-search query language
+ syntax; only origins matching it are returned.
url_pattern: Part of the URL to search for
metadata_pattern: Keywords to look for
(across all the fields of intrinsic_metadata)
- with_visit: Whether origins with no visit are to be
- filtered out
+ with_visit: Whether origins with no visits are to be filtered out
visit_types: Only origins having any of the provided visit types
(e.g. git, svn, pypi) will be returned
min_nb_visits: Filter origins that have number of visits >=
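A hedged usage sketch of the new parameter at the interface level; `search` stands for any already-constructed SearchInterface implementation:

    # `search` is assumed to implement SearchInterface (e.g. ElasticSearch).
    page = search.origin_search(
        url_pattern="foobar",
        query="last_visit >= 2021-01-01 and sort_by = [-visits] limit = 10",
    )
    urls = [origin["url"] for origin in page.results]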
diff --git a/swh/search/tests/test_elasticsearch.py b/swh/search/tests/test_elasticsearch.py
--- a/swh/search/tests/test_elasticsearch.py
+++ b/swh/search/tests/test_elasticsearch.py
@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from datetime import datetime, timedelta, timezone
from textwrap import dedent
import types
import unittest
@@ -117,3 +118,50 @@
self.search.origin_search(url_pattern="foobar.baz")
assert mock.call_args[1]["index"] == "test-read"
+
+ def test_sort_by_and_limit_query(self):
+ now = datetime.now(tz=timezone.utc).isoformat()
+ now_minus_5_hours = (
+ datetime.now(tz=timezone.utc) - timedelta(hours=5)
+ ).isoformat()
+ now_plus_5_hours = (
+ datetime.now(tz=timezone.utc) + timedelta(hours=5)
+ ).isoformat()
+
+ ORIGINS = [
+ {
+ "url": "http://foobar.1.com",
+ "nb_visits": 1,
+ "last_visit_date": now_minus_5_hours,
+ "last_eventful_visit_date": now_minus_5_hours,
+ },
+ {
+ "url": "http://foobar.2.com",
+ "nb_visits": 2,
+ "last_visit_date": now,
+ "last_eventful_visit_date": now,
+ },
+ {
+ "url": "http://foobar.3.com",
+ "nb_visits": 3,
+ "last_visit_date": now_plus_5_hours,
+ "last_eventful_visit_date": now_minus_5_hours,
+ },
+ ]
+
+ self.search.origin_update(ORIGINS)
+ self.search.flush()
+
+ def _check_results(query, origin_indices):
+ page = self.search.origin_search(url_pattern="foobar", query=query)
+ results = [r["url"] for r in page.results]
+ assert results == [ORIGINS[index]["url"] for index in origin_indices]
+
+ _check_results("sort_by = [-visits]", [2, 1, 0])
+ _check_results("sort_by = [last_visit]", [0, 1, 2])
+ _check_results("sort_by = [-last_eventful_visit, visits]", [1, 0, 2])
+ _check_results("sort_by = [last_eventful_visit,-last_visit]", [2, 0, 1])
+
+ _check_results("sort_by = [-visits] limit = 1", [2])
+ _check_results("sort_by = [last_visit] and limit = 2", [0, 1])
+ _check_results("sort_by = [-last_eventful_visit, visits] limit = 3", [1, 0, 2])
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1151,3 +1151,18 @@
result_page = self.search.origin_search(url_pattern="origin")
assert result_page.next_page_token is None
assert result_page.results == []
+
+ def test_filter_keyword_in_filter(self):
+ origin1 = {
+ "url": "foo language in ['foo baz'] bar",
+ }
+ self.search.origin_update([origin1])
+ self.search.flush()
+
+ result_page = self.search.origin_search(url_pattern="language in ['foo bar']")
+ assert result_page.next_page_token is None
+ assert result_page.results == [origin1]
+
+ result_page = self.search.origin_search(url_pattern="baaz")
+ assert result_page.next_page_token is None
+ assert result_page.results == []
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
--- a/swh/search/tests/test_translator.py
+++ b/swh/search/tests/test_translator.py
@@ -146,14 +146,14 @@
def test_keyword_filter():
- query = r"""keyword in [word1, "word2 \" ' word3"]"""
+ query = r"""keyword in [word1, "word2 \" \' word3"]"""
expected = {
"filters": {
"nested": {
"path": "intrinsic_metadata",
"query": {
"multi_match": {
- "query": r"""word1 word2 \" ' word3""",
+ "query": r"""word1 word2 " ' word3""",
"fields": [
get_expansion("keywords", ".") + "^2",
get_expansion("descriptions", "."),
@@ -307,3 +307,45 @@
expected = {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}}
_test_results(query, expected)
+
+
+def test_keyword_no_escape_inside_filter():
+ # any keyword (filter name/operator/value) inside a filter
+ # must be considered a string.
+ query = r'''origin = "language in [\'go lang\', python]"'''
+ expected = {
+ "filters": {
+ "multi_match": {
+ "query": r"""language in ['go lang', python]""",
+ "type": "bool_prefix",
+ "operator": "and",
+ "fields": [
+ "url.as_you_type",
+ "url.as_you_type._2gram",
+ "url.as_you_type._3gram",
+ ],
+ }
+ }
+ }
+ _test_results(query, expected)
+
+
+def test_escaped_punctuation_parsing():
+ query = r"""keyword in ["foo \'\" bar"]"""
+ expected = {
+ "filters": {
+ "nested": {
+ "path": "intrinsic_metadata",
+ "query": {
+ "multi_match": {
+ "query": r"""foo '" bar""",
+ "fields": [
+ get_expansion("keywords", ".") + "^2",
+ get_expansion("descriptions", "."),
+ ],
+ }
+ },
+ }
+ }
+ }
+ _test_results(query, expected)
diff --git a/swh/search/translator.py b/swh/search/translator.py
--- a/swh/search/translator.py
+++ b/swh/search/translator.py
@@ -3,7 +3,7 @@
from pkg_resources import resource_filename
from tree_sitter import Language, Parser
-from swh.search.utils import get_expansion
+from swh.search.utils import get_expansion, unescape
class Translator:
@@ -70,8 +70,10 @@
filters2 = self._traverse(node.children[2])
if conj_op == "and":
+ # "must" is equivalent to "AND"
return {"bool": {"must": [filters1, filters2]}}
if conj_op == "or":
+ # "should" is equivalent to "OR"
return {"bool": {"should": [filters1, filters2]}}
if node.type == "filter":
@@ -104,14 +106,13 @@
value = self.query[start:end]
if len(value) > 1 and (
- (value[0] == "'" and value[1] == "'") or (value[0] and value[-1] == '"')
+ (value[0] == "'" and value[-1] == "'") or (value[0] and value[-1] == '"')
):
- return value[1:-1]
+ return unescape(value[1:-1])
if node.type in ["number", "numberVal"]:
return int(value)
-
- return value
+ return unescape(value)
def _parse_filter(self, filter):
@@ -145,9 +146,18 @@
"query": {
"multi_match": {
"query": value,
+ # Makes it so that the "foo bar" query returns
+ # documents which contain "foo" in a field and "bar"
+ # in a different field
"type": "cross_fields",
+ # All keywords must be found in a document for it to
+ # be considered a match.
+ # TODO: allow missing keywords?
"operator": "and",
+ # Searches on all fields of the intrinsic_metadata dict,
+ # recursively.
"fields": ["intrinsic_metadata.*"],
+ # date{Created,Modified,Published} are of type date
"lenient": True,
}
},
@@ -190,6 +200,9 @@
"fields": [
get_expansion("keywords", ".") + "^2",
get_expansion("descriptions", "."),
+ # "^2" boosts an origin's score by 2x
+ # if the queried keywords are
+ # found in its intrinsic_metadata.keywords
],
}
},
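The comments restored above spell out the Elasticsearch bool semantics the translator relies on: "must" behaves as AND and "should" as OR. A small sketch of the resulting shape, assuming the grammar is built:

    from swh.search.translator import Translator

    translator = Translator()
    parsed = translator.parse_query("visits >= 5 and license in [gpl]")
    # "and" conjunctions become {"bool": {"must": [...]}},
    # "or" conjunctions become {"bool": {"should": [...]}}.
    assert "must" in parsed["filters"]["bool"]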
diff --git a/swh/search/utils.py b/swh/search/utils.py
--- a/swh/search/utils.py
+++ b/swh/search/utils.py
@@ -55,3 +55,51 @@
return True
except Exception:
return False
+
+
+def escape(obj):
+ r"""Makes the object directly injectable into the
+ query language by converting the escapable parts of
+ the object into escape sequences.
+
+ For strings, inserts a \ before the special characters ', ", and \
+
+ For arrays, applies the same transformation on each element, joins the
+ elements and returns a string-like representation of the list.
+
+ >>> print(escape("foo ' bar"))
+ "foo \' bar"
+
+ >>> print(escape([r"foo ' bar", r"bar \\\' baz", r'foo " baz']))
+ ["foo \' bar", "bar \\\\\\\' baz", "foo \" baz"]
+
+ """
+ if type(obj) == list:
+ items = [escape(item) for item in obj]
+ return "[" + ", ".join(items) + "]"
+ elif type(obj) == str:
+ return (
+ '"'
+ + obj.translate({ord("'"): r"\'", ord('"'): r"\"", ord("\\"): r"\\",})
+ + '"'
+ )
+ else:
+ raise Exception(f"Unexpected item type {type(obj)}")
+
+
+def unescape(string):
+ r"""Processes the escaped special characters
+
+ >>> unescape(r'''foo " bar''') == r'''foo " bar'''
+ True
+ >>> unescape(r'''foo \" bar''') == r'''foo " bar'''
+ True
+ >>> unescape(r'''foo \\" bar''') == r'''foo \" bar'''
+ True
+ >>> unescape(r'''foo \\\" bar''') == r'''foo \" bar'''
+ True
+ >>> unescape(r'''foo \\\\" bar''') == r'''foo \\" bar'''
+ True
+ """
+
+ return bytes(string, "utf-8").decode("unicode_escape")
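escape and unescape are meant to be inverses across the quoting boundary: escape wraps a raw value in double quotes and backslash-escapes quotes and backslashes, and unescape decodes those sequences when the translator reads the value back. A quick round-trip sketch:

    from swh.search.utils import escape, unescape

    raw = "foo ' bar"
    quoted = escape(raw)  # yields "foo \' bar", surrounding double quotes included
    assert unescape(quoted[1:-1]) == raw  # stripping the quotes round-trips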
Attached to: D6046: elasticsearch.py: Integrate query language translator