D6046.id21958.diff
diff --git a/query_language/grammar.js b/query_language/grammar.js
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -14,7 +14,16 @@
name: 'swh_search_ql',
rules: {
- query: $ => seq($.filters, optional($.sortBy) ,optional($.limit)),
+ query: $ => seq(
+ $.filters,
+ optional($.and),
+ choice(
+ seq(optional($.sortBy), optional($.and), optional($.limit)),
+ seq(optional($.limit), optional($.and), optional($.sortBy)),
+ ),
+ ),
+
+
filters: $ => choice(
prec.left(PRECEDENCE.and,
@@ -41,7 +50,7 @@
sortByField: $ => token('sort_by'),
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
- sortByOptions: $ => seq(optional(token.immediate('-')) ,choice(
+ sortByOptions: $ => seq(optional(token.immediate('-')), choice(
'visits',
'last_visit',
'last_eventful_visit',
@@ -104,17 +113,18 @@
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
sortByOptions: $ => seq(
- optional(token.immediate('-')),
+ optional('-'),
choice(
- 'visits',
- 'last_visit',
- 'last_eventful_visit',
- 'last_revision',
- 'last_release',
- 'created',
- 'modified',
- 'published'
- )),
+ 'visits',
+ 'last_visit',
+ 'last_eventful_visit',
+ 'last_revision',
+ 'last_release',
+ 'created',
+ 'modified',
+ 'published'
+ )
+ ),
unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
listField: $ => token(choice('language', 'license', 'keyword')),
@@ -142,7 +152,13 @@
equalOp: $ => token('='),
choiceOp: $ => token(choice('in', 'not in')),
- isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/,
+ isoDateTime: $ => {
+ const dateRegex = (/\d{4}[-]\d{2}[-]\d{2}/).source
+ const dateTimeSepRegex = (/(\s|T)*/).source
+ const timeRegex = (/(\d{2}:\d{2}(:\d{2}(\.\d{6})?)?)?/).source
+ const timezoneRegex = (/(\+\d{2}:\d{2}|Z)?/).source
+ return new RegExp(dateRegex + dateTimeSepRegex + timeRegex + timezoneRegex)
+ },
string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord),
number: $ => /\d+/,
@@ -153,7 +169,7 @@
and: $ => "and",
stringContent: $ => repeat1(choice(
- token.immediate(/[^\\"\n]+/),
+ token.immediate(/[^\\'"\n]+/),
$.escape_sequence
)),
singleWord: $ => /[^\s"'\[\]\(\),]+/,
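
The reworked query rule accepts sort_by and limit in either order, optionally joined by and. A minimal sketch of the intended behaviour, assuming the Translator.parse_query() helper that the Python side of this diff wires in (its output keys are the ones consumed in elasticsearch.py below):

    from swh.search.translator import Translator

    translator = Translator()
    for q in [
        "language in [python] sort_by = [-visits] limit = 10",
        "language in [python] limit = 10 and sort_by = [-visits]",
    ]:
        parsed = translator.parse_query(q)
        # Both orderings should yield the same parse:
        # parsed["sortBy"] == ["-visits"] and parsed["limit"] == 10
        print(parsed["sortBy"], parsed["limit"])
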
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -22,7 +22,8 @@
PagedResult,
)
from swh.search.metrics import send_metric, timed
-from swh.search.utils import get_expansion, is_date_parsable
+from swh.search.translator import Translator
+from swh.search.utils import escape, get_expansion, is_date_parsable
logger = logging.getLogger(__name__)
@@ -99,6 +100,7 @@
class ElasticSearch:
def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}):
self._backend = Elasticsearch(hosts=hosts)
+ self._translator = Translator()
# Merge current configuration with default values
origin_config = indexes.get("origin", {})
@@ -348,6 +350,7 @@
def origin_search(
self,
*,
+ query: str = "",
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
@@ -369,185 +372,89 @@
) -> PagedResult[MinimalOriginDict]:
query_clauses: List[Dict[str, Any]] = []
+ query_filters = []
if url_pattern:
- query_clauses.append(
- {
- "multi_match": {
- "query": url_pattern,
- "type": "bool_prefix",
- "operator": "and",
- "fields": [
- "url.as_you_type",
- "url.as_you_type._2gram",
- "url.as_you_type._3gram",
- ],
- }
- }
- )
+ query_filters.append(f"origin = {escape(url_pattern)}")
if metadata_pattern:
- query_clauses.append(
- {
- "nested": {
- "path": "intrinsic_metadata",
- "query": {
- "multi_match": {
- "query": metadata_pattern,
- # Makes it so that the "foo bar" query returns
- # documents which contain "foo" in a field and "bar"
- # in a different field
- "type": "cross_fields",
- # All keywords must be found in a document for it to
- # be considered a match.
- # TODO: allow missing keywords?
- "operator": "and",
- # Searches on all fields of the intrinsic_metadata dict,
- # recursively.
- "fields": ["intrinsic_metadata.*"],
- # date{Created,Modified,Published} are of type date
- "lenient": True,
- }
- },
- }
- }
- )
+ query_filters.append(f"metadata = {escape(metadata_pattern)}")
- if not query_clauses:
- raise ValueError(
- "At least one of url_pattern and metadata_pattern must be provided."
- )
+ # if not query_clauses:
+ # raise ValueError(
+ # "At least one of url_pattern and metadata_pattern must be provided."
+ # )
if with_visit:
- query_clauses.append({"term": {"has_visits": True,}})
+ query_filters.append(f"visited = {'true' if with_visit else 'false'}")
if min_nb_visits:
- query_clauses.append({"range": {"nb_visits": {"gte": min_nb_visits,},}})
+ query_filters.append(f"visits >= {min_nb_visits}")
if min_last_visit_date:
- query_clauses.append(
- {
- "range": {
- "last_visit_date": {
- "gte": min_last_visit_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ f"last_visit >= {min_last_visit_date.replace('Z', '+00:00')}"
)
if min_last_eventful_visit_date:
- query_clauses.append(
- {
- "range": {
- "last_eventful_visit_date": {
- "gte": min_last_eventful_visit_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ "last_eventful_visit >= "
+ f"{min_last_eventful_visit_date.replace('Z', '+00:00')}"
)
if min_last_revision_date:
- query_clauses.append(
- {
- "range": {
- "last_revision_date": {
- "gte": min_last_revision_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ f"last_revision >= {min_last_revision_date.replace('Z', '+00:00')}"
)
if min_last_release_date:
- query_clauses.append(
- {
- "range": {
- "last_release_date": {
- "gte": min_last_release_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ f"last_release >= {min_last_release_date.replace('Z', '+00:00')}"
)
if keywords:
- query_clauses.append(
- {
- "nested": {
- "path": "intrinsic_metadata",
- "query": {
- "multi_match": {
- "query": " ".join(keywords),
- "fields": [
- get_expansion("keywords", ".") + "^2",
- get_expansion("descriptions", "."),
- # "^2" boosts an origin's score by 2x
- # if it the queried keywords are
- # found in its intrinsic_metadata.keywords
- ],
- }
- },
- }
- }
- )
-
- intrinsic_metadata_filters: List[Dict[str, Dict]] = []
-
+ query_filters.append(f"keyword in {escape(keywords)}")
if licenses:
- license_filters: List[Dict[str, Any]] = []
- for license in licenses:
- license_filters.append(
- {"match": {get_expansion("licenses", "."): license}}
- )
- intrinsic_metadata_filters.append({"bool": {"should": license_filters}})
+ query_filters.append(f"license in {escape(licenses)}")
if programming_languages:
- language_filters: List[Dict[str, Any]] = []
- for language in programming_languages:
- language_filters.append(
- {"match": {get_expansion("programming_languages", "."): language}}
- )
- intrinsic_metadata_filters.append({"bool": {"should": language_filters}})
+ query_filters.append(f"language in {escape(programming_languages)}")
if min_date_created:
- intrinsic_metadata_filters.append(
- {
- "range": {
- get_expansion("date_created", "."): {"gte": min_date_created,}
- }
- }
+ query_filters.append(
+ f"created >= {min_date_created.replace('Z', '+00:00')}"
)
if min_date_modified:
- intrinsic_metadata_filters.append(
- {
- "range": {
- get_expansion("date_modified", "."): {"gte": min_date_modified,}
- }
- }
+ query_filters.append(
+ f"modified >= {min_date_modified.replace('Z', '+00:00')}"
)
if min_date_published:
- intrinsic_metadata_filters.append(
- {
- "range": {
- get_expansion("date_published", "."): {
- "gte": min_date_published,
- }
- }
- }
- )
-
- if intrinsic_metadata_filters:
- query_clauses.append(
- {
- "nested": {
- "path": "intrinsic_metadata",
- "query": {"bool": {"must": intrinsic_metadata_filters,}},
- # "must" is equivalent to "AND"
- # "should" is equivalent to "OR"
- # Resulting origins must return true for the following:
- # (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..)
- # This is equivalent to {"must": [
- # {"should": [license_1,license_2] },
- # {"should": [lang_1,lang_2]}] }
- # ]}
- # Note: Usage of "bool" has been omitted for readability
- }
- }
+ query_filters.append(
+ f"published >= {min_date_published.replace('Z', '+00:00')}"
)
if visit_types is not None:
- query_clauses.append({"terms": {"visit_types": visit_types}})
+ query_filters.append(f"visit_type = {escape(visit_types)}")
+
+ combined_filters = f"({' and '.join(query_filters)})"
+ query = f"{combined_filters}{' and ' if query != '' else ' '}{query}"
+ parsed_query = self._translator.parse_query(query)
+ query_clauses.append(parsed_query["filters"])
+
+ field_map = {
+ "visits": "nb_visits",
+ "last_visit": "last_visit_date",
+ "last_eventful_visit": "last_eventful_visit_date",
+ "last_revision": "last_revision_date",
+ "last_release": "last_release_date",
+ "created": "date_created",
+ "modified": "date_modified",
+ "published": "date_published",
+ }
+
+ if "sortBy" in parsed_query:
+ if sort_by is None:
+ sort_by = []
+ for sort_by_option in parsed_query["sortBy"]:
+ if sort_by_option[0] == "-":
+ sort_by.append("-" + field_map[sort_by_option[1:]])
+ else:
+ sort_by.append(field_map[sort_by_option])
+ if parsed_query.get("limit", 0):
+ limit = parsed_query["limit"]
sorting_params: List[Dict[str, Any]] = []
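
Condensed, the new flow folds each legacy keyword argument into a query-language fragment, joins the fragments with and, prepends them to any caller-supplied query, and hands the whole string to the translator. A sketch with illustrative values (only escape() is from this diff; the fragment strings mirror the f-strings above):

    from swh.search.utils import escape

    query_filters = [
        f"origin = {escape('foobar')}",             # from url_pattern
        "visits >= 2",                              # from min_nb_visits
        f"language in {escape(['python', 'go'])}",  # from programming_languages
    ]
    combined_filters = f"({' and '.join(query_filters)})"
    # combined_filters is then ANDed with the caller-supplied query string
    # and parsed by Translator.parse_query() into Elasticsearch clauses.
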
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -268,6 +268,7 @@
def origin_search(
self,
*,
+ query: str = "",
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -65,6 +65,7 @@
def origin_search(
self,
*,
+ query: str = "",
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
@@ -87,11 +88,12 @@
"""Searches for origins matching the `url_pattern`.
Args:
+ query: Find origins matching a query written in the
+ swh-search query language syntax.
url_pattern: Part of the URL to search for
metadata_pattern: Keywords to look for
(across all the fields of intrinsic_metadata)
- with_visit: Whether origins with no visit are to be
- filtered out
+ with_visit: Whether origins with no visits are to be filtered out
visit_types: Only origins having any of the provided visit types
(e.g. git, svn, pypi) will be returned
min_nb_visits: Filter origins that have number of visits >=
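
A hedged usage sketch of the extended interface; here search stands for any instantiated backend (Elasticsearch or in-memory), and the new query argument is combined with the legacy keyword filters using and:

    page = search.origin_search(
        url_pattern="foobar",
        query="visits >= 2 and sort_by = [-visits] and limit = 5",
    )
    urls = [result["url"] for result in page.results]
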
diff --git a/swh/search/tests/test_elasticsearch.py b/swh/search/tests/test_elasticsearch.py
--- a/swh/search/tests/test_elasticsearch.py
+++ b/swh/search/tests/test_elasticsearch.py
@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from datetime import datetime, timedelta, timezone
from textwrap import dedent
import types
import unittest
@@ -117,3 +118,50 @@
self.search.origin_search(url_pattern="foobar.baz")
assert mock.call_args[1]["index"] == "test-read"
+
+ def test_sort_by_and_limit_query(self):
+ now = datetime.now(tz=timezone.utc).isoformat()
+ now_minus_5_hours = (
+ datetime.now(tz=timezone.utc) - timedelta(hours=5)
+ ).isoformat()
+ now_plus_5_hours = (
+ datetime.now(tz=timezone.utc) + timedelta(hours=5)
+ ).isoformat()
+
+ ORIGINS = [
+ {
+ "url": "http://foobar.1.com",
+ "nb_visits": 1,
+ "last_visit_date": now_minus_5_hours,
+ "last_eventful_visit_date": now_minus_5_hours,
+ },
+ {
+ "url": "http://foobar.2.com",
+ "nb_visits": 2,
+ "last_visit_date": now,
+ "last_eventful_visit_date": now,
+ },
+ {
+ "url": "http://foobar.3.com",
+ "nb_visits": 3,
+ "last_visit_date": now_plus_5_hours,
+ "last_eventful_visit_date": now_minus_5_hours,
+ },
+ ]
+
+ self.search.origin_update(ORIGINS)
+ self.search.flush()
+
+ def _check_results(query, origin_indices):
+ page = self.search.origin_search(url_pattern="foobar", query=query)
+ results = [r["url"] for r in page.results]
+ assert results == [ORIGINS[index]["url"] for index in origin_indices]
+
+ _check_results("sort_by = [-visits]", [2, 1, 0])
+ _check_results("sort_by = [last_visit]", [0, 1, 2])
+ _check_results("sort_by = [-last_eventful_visit, visits]", [1, 0, 2])
+ _check_results("sort_by = [last_eventful_visit,-last_visit]", [2, 0, 1])
+
+ _check_results("sort_by = [-visits] limit = 1", [2])
+ _check_results("sort_by = [last_visit] and limit = 2", [0, 1])
+ _check_results("sort_by = [-last_eventful_visit, visits] limit = 3", [1, 0, 2])
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1151,3 +1151,18 @@
result_page = self.search.origin_search(url_pattern="origin")
assert result_page.next_page_token is None
assert result_page.results == []
+
+ def test_filter_keyword_in_filter(self):
+ origin1 = {
+ "url": "foo language in ['foo baz'] bar",
+ }
+ self.search.origin_update([origin1])
+ self.search.flush()
+
+ result_page = self.search.origin_search(url_pattern="language in ['foo bar']")
+ assert result_page.next_page_token is None
+ assert result_page.results == [origin1]
+
+ result_page = self.search.origin_search(url_pattern="baaz")
+ assert result_page.next_page_token is None
+ assert result_page.results == []
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
--- a/swh/search/tests/test_translator.py
+++ b/swh/search/tests/test_translator.py
@@ -146,14 +146,14 @@
def test_keyword_filter():
- query = r"""keyword in [word1, "word2 \" ' word3"]"""
+ query = r"""keyword in [word1, "word2 \" \' word3"]"""
expected = {
"filters": {
"nested": {
"path": "intrinsic_metadata",
"query": {
"multi_match": {
- "query": r"""word1 word2 \" ' word3""",
+ "query": r"""word1 word2 " ' word3""",
"fields": [
get_expansion("keywords", ".") + "^2",
get_expansion("descriptions", "."),
@@ -307,3 +307,45 @@
expected = {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}}
_test_results(query, expected)
+
+
+def test_keyword_no_escape_inside_filter():
+ # any keyword (filter name/operator/value) inside a filter
+ # must be considered a string.
+ query = r'''origin = "language in [\'go lang\', python]"'''
+ expected = {
+ "filters": {
+ "multi_match": {
+ "query": r"""language in ['go lang', python]""",
+ "type": "bool_prefix",
+ "operator": "and",
+ "fields": [
+ "url.as_you_type",
+ "url.as_you_type._2gram",
+ "url.as_you_type._3gram",
+ ],
+ }
+ }
+ }
+ _test_results(query, expected)
+
+
+def test_escaped_punctuation_parsing():
+ query = r"""keyword in ["foo \'\" bar"]"""
+ expected = {
+ "filters": {
+ "nested": {
+ "path": "intrinsic_metadata",
+ "query": {
+ "multi_match": {
+ "query": r"""foo '" bar""",
+ "fields": [
+ get_expansion("keywords", ".") + "^2",
+ get_expansion("descriptions", "."),
+ ],
+ }
+ },
+ }
+ }
+ }
+ _test_results(query, expected)
diff --git a/swh/search/translator.py b/swh/search/translator.py
--- a/swh/search/translator.py
+++ b/swh/search/translator.py
@@ -3,7 +3,7 @@
from pkg_resources import resource_filename
from tree_sitter import Language, Parser
-from swh.search.utils import get_expansion
+from swh.search.utils import get_expansion, unescape
class Translator:
@@ -70,8 +70,10 @@
filters2 = self._traverse(node.children[2])
if conj_op == "and":
+ # "must" is equivalent to "AND"
return {"bool": {"must": [filters1, filters2]}}
if conj_op == "or":
+ # "should" is equivalent to "OR"
return {"bool": {"should": [filters1, filters2]}}
if node.type == "filter":
@@ -104,14 +106,13 @@
value = self.query[start:end]
if len(value) > 1 and (
- (value[0] == "'" and value[1] == "'") or (value[0] and value[-1] == '"')
+ (value[0] == "'" and value[-1] == "'") or (value[0] == '"' and value[-1] == '"')
):
- return value[1:-1]
+ return unescape(value[1:-1])
if node.type in ["number", "numberVal"]:
return int(value)
-
- return value
+ return unescape(value)
def _parse_filter(self, filter):
@@ -145,9 +146,18 @@
"query": {
"multi_match": {
"query": value,
+ # Makes it so that the "foo bar" query returns
+ # documents which contain "foo" in a field and "bar"
+ # in a different field
"type": "cross_fields",
+ # All keywords must be found in a document for it to
+ # be considered a match.
+ # TODO: allow missing keywords?
"operator": "and",
+ # Searches on all fields of the intrinsic_metadata dict,
+ # recursively.
"fields": ["intrinsic_metadata.*"],
+ # date{Created,Modified,Published} are of type date
"lenient": True,
}
},
@@ -190,6 +200,9 @@
"fields": [
get_expansion("keywords", ".") + "^2",
get_expansion("descriptions", "."),
+ # "^2" boosts an origin's score by 2x
+ # if the queried keywords are
+ # found in its intrinsic_metadata.keywords
],
}
},
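
The conjunction branches above map straight onto Elasticsearch bool clauses. A sketch of the expected shape, assuming the visits-to-nb_visits field mapping used elsewhere in this diff (filters abridged):

    from swh.search.translator import Translator

    translator = Translator()
    parsed = translator.parse_query("visits >= 2 or visits < 1")
    # Expected shape, per the "should" branch above:
    # {"filters": {"bool": {"should": [
    #     {"range": {"nb_visits": {"gte": 2}}},
    #     {"range": {"nb_visits": {"lt": 1}}},
    # ]}}}
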
diff --git a/swh/search/utils.py b/swh/search/utils.py
--- a/swh/search/utils.py
+++ b/swh/search/utils.py
@@ -55,3 +55,51 @@
return True
except Exception:
return False
+
+
+def escape(obj):
+ r"""Makes the object directly injectable into the
+ query language by converting the escapable parts of
+ the object into escape sequences.
+
+ For strings, prefixes special characters such as ', ", and \ with a backslash
+
+ For arrays, applies the same transformation on each element, joins the
+ elements and returns a string-like representation of the list.
+
+ >>> print(escape("foo ' bar"))
+ "foo \' bar"
+
+ >>> print(escape([r"foo ' bar", r"bar \\\' baz", r'foo " baz']))
+ ["foo \' bar", "bar \\\\\\\' baz", "foo \" baz"]
+
+ """
+ if type(obj) == list:
+ items = [escape(item) for item in obj]
+ return "[" + ", ".join(items) + "]"
+ elif type(obj) == str:
+ return (
+ '"'
+ + obj.translate({ord("'"): r"\'", ord('"'): r"\"", ord("\\"): r"\\",})
+ + '"'
+ )
+ else:
+ raise Exception(f"Unexpected item type {type(obj)}")
+
+
+def unescape(string):
+ r"""Processes the escaped special characters
+
+ >>> unescape(r'''foo " bar''') == r'''foo " bar'''
+ True
+ >>> unescape(r'''foo \" bar''') == r'''foo " bar'''
+ True
+ >>> unescape(r'''foo \\" bar''') == r'''foo \" bar'''
+ True
+ >>> unescape(r'''foo \\\" bar''') == r'''foo \" bar'''
+ True
+ >>> unescape(r'''foo \\\\" bar''') == r'''foo \\" bar'''
+ True
+ """
+
+ return bytes(string, "utf-8").decode("unicode_escape")
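
Taken together, escape() and unescape() are meant to round-trip: escape() backslash-escapes quotes and backslashes and wraps the value in double quotes so it can be spliced into a query, while unescape() (applied when the translator reads values back) restores the original text. A quick sketch:

    from swh.search.utils import escape, unescape

    original = "foo ' bar \\ baz"
    quoted = escape(original)  # double-quoted, with quotes and backslashes escaped
    # Stripping the surrounding quotes and unescaping recovers the input:
    assert unescape(quoted[1:-1]) == original
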
