D6046.id21958.diff
diff --git a/query_language/grammar.js b/query_language/grammar.js
--- a/query_language/grammar.js
+++ b/query_language/grammar.js
@@ -14,7 +14,16 @@
name: 'swh_search_ql',
rules: {
- query: $ => seq($.filters, optional($.sortBy) ,optional($.limit)),
+ query: $ => seq(
+ $.filters,
+ optional($.and),
+ choice(
+ seq(optional($.sortBy), optional($.and), optional($.limit)),
+ seq(optional($.limit), optional($.and), optional($.sortBy)),
+ ),
+ ),
+
+
filters: $ => choice(
prec.left(PRECEDENCE.and,
@@ -41,7 +50,7 @@
sortByField: $ => token('sort_by'),
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
- sortByOptions: $ => seq(optional(token.immediate('-')) ,choice(
+ sortByOptions: $ => seq(optional(token.immediate('-')), choice(
'visits',
'last_visit',
'last_eventful_visit',
@@ -104,17 +113,18 @@
sortByOp: $ => $.equalOp,
sortByVal: $ => createArray(optionalWrapWith($.sortByOptions, ["'", '"'])),
sortByOptions: $ => seq(
- optional(token.immediate('-')),
+ optional('-'),
choice(
- 'visits',
- 'last_visit',
- 'last_eventful_visit',
- 'last_revision',
- 'last_release',
- 'created',
- 'modified',
- 'published'
- )),
+ 'visits',
+ 'last_visit',
+ 'last_eventful_visit',
+ 'last_revision',
+ 'last_release',
+ 'created',
+ 'modified',
+ 'published'
+ )
+ ),
unboundedListFilter: $ => seq($.listField, $.listOp, $.listVal),
listField: $ => token(choice('language', 'license', 'keyword')),
@@ -142,7 +152,13 @@
equalOp: $ => token('='),
choiceOp: $ => token(choice('in', 'not in')),
- isoDateTime: $ => /\d{4}[-]\d{2}[-]\d{2}(\s|T)*(\d{2}:\d{2}(:\d{2})?)?(Z)?/,
+ isoDateTime: $ => {
+ const dateRegex = (/\d{4}[-]\d{2}[-]\d{2}/).source
+ const dateTimeSepRegex = (/(\s|T)*/).source
+ const timeRegex = (/(\d{2}:\d{2}(:\d{2}(\.\d{6})?)?)?/).source
+ const timezoneRegex = (/(\+\d{2}:\d{2}|Z)?/).source
+ return new RegExp(dateRegex + dateTimeSepRegex + timeRegex + timezoneRegex)
+ },
string: $ => choice(wrapWith($.stringContent, ["'", '"']), $.singleWord),
number: $ => /\d+/,
@@ -153,7 +169,7 @@
and: $ => "and",
stringContent: $ => repeat1(choice(
- token.immediate(/[^\\"\n]+/),
+ token.immediate(/[^\\'"\n]+/),
$.escape_sequence
)),
singleWord: $ => /[^\s"'\[\]\(\),]+/,
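The grammar changes above make the sort_by and limit clauses order-independent (optionally separated by "and") and widen isoDateTime to accept microseconds and numeric UTC offsets. A minimal sketch of exercising this through the translator, assuming swh-search is installed and the tree-sitter grammar has been built:

    from swh.search.translator import Translator

    translator = Translator()
    # Both clause orders now parse, with or without an explicit "and":
    translator.parse_query("visits >= 5 sort_by = [-visits] limit = 10")
    translator.parse_query("visits >= 5 and limit = 10 and sort_by = [-visits]")
    # The widened isoDateTime also accepts microseconds and an offset:
    translator.parse_query("last_visit >= 2021-01-01T12:30:45.123456+05:30")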
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -22,7 +22,8 @@
PagedResult,
)
from swh.search.metrics import send_metric, timed
-from swh.search.utils import get_expansion, is_date_parsable
+from swh.search.translator import Translator
+from swh.search.utils import escape, get_expansion, is_date_parsable
logger = logging.getLogger(__name__)
@@ -99,6 +100,7 @@
class ElasticSearch:
def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}):
self._backend = Elasticsearch(hosts=hosts)
+ self._translator = Translator()
# Merge current configuration with default values
origin_config = indexes.get("origin", {})
@@ -348,6 +350,7 @@
def origin_search(
self,
*,
+ query: str = "",
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
@@ -369,185 +372,89 @@
) -> PagedResult[MinimalOriginDict]:
query_clauses: List[Dict[str, Any]] = []
+ query_filters = []
if url_pattern:
- query_clauses.append(
- {
- "multi_match": {
- "query": url_pattern,
- "type": "bool_prefix",
- "operator": "and",
- "fields": [
- "url.as_you_type",
- "url.as_you_type._2gram",
- "url.as_you_type._3gram",
- ],
- }
- }
- )
+ query_filters.append(f"origin = {escape(url_pattern)}")
if metadata_pattern:
- query_clauses.append(
- {
- "nested": {
- "path": "intrinsic_metadata",
- "query": {
- "multi_match": {
- "query": metadata_pattern,
- # Makes it so that the "foo bar" query returns
- # documents which contain "foo" in a field and "bar"
- # in a different field
- "type": "cross_fields",
- # All keywords must be found in a document for it to
- # be considered a match.
- # TODO: allow missing keywords?
- "operator": "and",
- # Searches on all fields of the intrinsic_metadata dict,
- # recursively.
- "fields": ["intrinsic_metadata.*"],
- # date{Created,Modified,Published} are of type date
- "lenient": True,
- }
- },
- }
- }
- )
+ query_filters.append(f"metadata = {escape(metadata_pattern)}")
- if not query_clauses:
- raise ValueError(
- "At least one of url_pattern and metadata_pattern must be provided."
- )
+ # if not query_clauses:
+ # raise ValueError(
+ # "At least one of url_pattern and metadata_pattern must be provided."
+ # )
if with_visit:
- query_clauses.append({"term": {"has_visits": True,}})
+ query_filters.append(f"visited = {'true' if with_visit else 'false'}")
if min_nb_visits:
- query_clauses.append({"range": {"nb_visits": {"gte": min_nb_visits,},}})
+ query_filters.append(f"visits >= {min_nb_visits}")
if min_last_visit_date:
- query_clauses.append(
- {
- "range": {
- "last_visit_date": {
- "gte": min_last_visit_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ f"last_visit >= {min_last_visit_date.replace('Z', '+00:00')}"
)
if min_last_eventful_visit_date:
- query_clauses.append(
- {
- "range": {
- "last_eventful_visit_date": {
- "gte": min_last_eventful_visit_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ "last_eventful_visit >= "
+ f"{min_last_eventful_visit_date.replace('Z', '+00:00')}"
)
if min_last_revision_date:
- query_clauses.append(
- {
- "range": {
- "last_revision_date": {
- "gte": min_last_revision_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ f"last_revision >= {min_last_revision_date.replace('Z', '+00:00')}"
)
if min_last_release_date:
- query_clauses.append(
- {
- "range": {
- "last_release_date": {
- "gte": min_last_release_date.replace("Z", "+00:00"),
- }
- }
- }
+ query_filters.append(
+ f"last_release >= {min_last_release_date.replace('Z', '+00:00')}"
)
if keywords:
- query_clauses.append(
- {
- "nested": {
- "path": "intrinsic_metadata",
- "query": {
- "multi_match": {
- "query": " ".join(keywords),
- "fields": [
- get_expansion("keywords", ".") + "^2",
- get_expansion("descriptions", "."),
- # "^2" boosts an origin's score by 2x
- # if it the queried keywords are
- # found in its intrinsic_metadata.keywords
- ],
- }
- },
- }
- }
- )
-
- intrinsic_metadata_filters: List[Dict[str, Dict]] = []
-
+ query_filters.append(f"keyword in {escape(keywords)}")
if licenses:
- license_filters: List[Dict[str, Any]] = []
- for license in licenses:
- license_filters.append(
- {"match": {get_expansion("licenses", "."): license}}
- )
- intrinsic_metadata_filters.append({"bool": {"should": license_filters}})
+ query_filters.append(f"license in {escape(licenses)}")
if programming_languages:
- language_filters: List[Dict[str, Any]] = []
- for language in programming_languages:
- language_filters.append(
- {"match": {get_expansion("programming_languages", "."): language}}
- )
- intrinsic_metadata_filters.append({"bool": {"should": language_filters}})
+ query_filters.append(f"language in {escape(programming_languages)}")
if min_date_created:
- intrinsic_metadata_filters.append(
- {
- "range": {
- get_expansion("date_created", "."): {"gte": min_date_created,}
- }
- }
+ query_filters.append(
+ f"created >= {min_date_created.replace('Z', '+00:00')}"
)
if min_date_modified:
- intrinsic_metadata_filters.append(
- {
- "range": {
- get_expansion("date_modified", "."): {"gte": min_date_modified,}
- }
- }
+ query_filters.append(
+ f"modified >= {min_date_modified.replace('Z', '+00:00')}"
)
if min_date_published:
- intrinsic_metadata_filters.append(
- {
- "range": {
- get_expansion("date_published", "."): {
- "gte": min_date_published,
- }
- }
- }
- )
-
- if intrinsic_metadata_filters:
- query_clauses.append(
- {
- "nested": {
- "path": "intrinsic_metadata",
- "query": {"bool": {"must": intrinsic_metadata_filters,}},
- # "must" is equivalent to "AND"
- # "should" is equivalent to "OR"
- # Resulting origins must return true for the following:
- # (license_1 OR license_2 ..) AND (lang_1 OR lang_2 ..)
- # This is equivalent to {"must": [
- # {"should": [license_1,license_2] },
- # {"should": [lang_1,lang_2]}] }
- # ]}
- # Note: Usage of "bool" has been omitted for readability
- }
- }
+ query_filters.append(
+ f"published >= {min_date_published.replace('Z', '+00:00')}"
)
if visit_types is not None:
- query_clauses.append({"terms": {"visit_types": visit_types}})
+ query_filters.append(f"visit_type = {escape(visit_types)}")
+
+ combined_filters = f"({' and '.join(query_filters)})"
+ query = f"{combined_filters}{' and ' if query != '' else ' '}{query}"
+ parsed_query = self._translator.parse_query(query)
+ query_clauses.append(parsed_query["filters"])
+
+ field_map = {
+ "visits": "nb_visits",
+ "last_visit": "last_visit_date",
+ "last_eventful_visit": "last_eventful_visit_date",
+ "last_revision": "last_revision_date",
+ "last_release": "last_release_date",
+ "created": "date_created",
+ "modified": "date_modified",
+ "published": "date_published",
+ }
+
+ if "sortBy" in parsed_query:
+ if sort_by is None:
+ sort_by = []
+ for sort_by_option in parsed_query["sortBy"]:
+ if sort_by_option[0] == "-":
+ sort_by.append("-" + field_map[sort_by_option[1:]])
+ else:
+ sort_by.append(field_map[sort_by_option])
+ if parsed_query.get("limit", 0):
+ limit = parsed_query["limit"]
sorting_params: List[Dict[str, Any]] = []
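Instead of building an Elasticsearch clause per keyword argument, origin_search now renders each argument as a query-language filter and parses the combined string with the translator. A rough illustration of the assembly, with names taken from the diff and illustrative values:

    from swh.search.utils import escape

    query_filters = [f"origin = {escape('foobar')}", "visits >= 5"]
    combined_filters = f"({' and '.join(query_filters)})"
    # combined_filters == '(origin = "foobar" and visits >= 5)'
    # A caller-supplied query string is then appended with " and ", e.g.
    # '(origin = "foobar" and visits >= 5) and sort_by = [-visits]',
    # and the result is handed to Translator.parse_query().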
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -268,6 +268,7 @@
def origin_search(
self,
*,
+ query: str = "",
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -65,6 +65,7 @@
def origin_search(
self,
*,
+ query: str = "",
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
@@ -87,11 +88,12 @@
"""Searches for origins matching the `url_pattern`.
Args:
+ query: Filter string written in the swh-search query language
+ syntax; only origins matching it are returned.
url_pattern: Part of the URL to search for
metadata_pattern: Keywords to look for
(across all the fields of intrinsic_metadata)
- with_visit: Whether origins with no visit are to be
- filtered out
+ with_visit: Whether origins with no visits are to be filtered out
visit_types: Only origins having any of the provided visit types
(e.g. git, svn, pypi) will be returned
min_nb_visits: Filter origins that have number of visits >=
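A hedged usage sketch of the new parameter at the interface level; `search` stands for any already-constructed SearchInterface implementation:

    # `search` is assumed to implement SearchInterface (e.g. ElasticSearch).
    page = search.origin_search(
        url_pattern="foobar",
        query="last_visit >= 2021-01-01 and sort_by = [-visits] limit = 10",
    )
    urls = [origin["url"] for origin in page.results]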
diff --git a/swh/search/tests/test_elasticsearch.py b/swh/search/tests/test_elasticsearch.py
--- a/swh/search/tests/test_elasticsearch.py
+++ b/swh/search/tests/test_elasticsearch.py
@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from datetime import datetime, timedelta, timezone
from textwrap import dedent
import types
import unittest
@@ -117,3 +118,50 @@
self.search.origin_search(url_pattern="foobar.baz")
assert mock.call_args[1]["index"] == "test-read"
+
+ def test_sort_by_and_limit_query(self):
+ now = datetime.now(tz=timezone.utc).isoformat()
+ now_minus_5_hours = (
+ datetime.now(tz=timezone.utc) - timedelta(hours=5)
+ ).isoformat()
+ now_plus_5_hours = (
+ datetime.now(tz=timezone.utc) + timedelta(hours=5)
+ ).isoformat()
+
+ ORIGINS = [
+ {
+ "url": "http://foobar.1.com",
+ "nb_visits": 1,
+ "last_visit_date": now_minus_5_hours,
+ "last_eventful_visit_date": now_minus_5_hours,
+ },
+ {
+ "url": "http://foobar.2.com",
+ "nb_visits": 2,
+ "last_visit_date": now,
+ "last_eventful_visit_date": now,
+ },
+ {
+ "url": "http://foobar.3.com",
+ "nb_visits": 3,
+ "last_visit_date": now_plus_5_hours,
+ "last_eventful_visit_date": now_minus_5_hours,
+ },
+ ]
+
+ self.search.origin_update(ORIGINS)
+ self.search.flush()
+
+ def _check_results(query, origin_indices):
+ page = self.search.origin_search(url_pattern="foobar", query=query)
+ results = [r["url"] for r in page.results]
+ assert results == [ORIGINS[index]["url"] for index in origin_indices]
+
+ _check_results("sort_by = [-visits]", [2, 1, 0])
+ _check_results("sort_by = [last_visit]", [0, 1, 2])
+ _check_results("sort_by = [-last_eventful_visit, visits]", [1, 0, 2])
+ _check_results("sort_by = [last_eventful_visit,-last_visit]", [2, 0, 1])
+
+ _check_results("sort_by = [-visits] limit = 1", [2])
+ _check_results("sort_by = [last_visit] and limit = 2", [0, 1])
+ _check_results("sort_by = [-last_eventful_visit, visits] limit = 3", [1, 0, 2])
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1151,3 +1151,18 @@
result_page = self.search.origin_search(url_pattern="origin")
assert result_page.next_page_token is None
assert result_page.results == []
+
+ def test_filter_keyword_in_filter(self):
+ origin1 = {
+ "url": "foo language in ['foo baz'] bar",
+ }
+ self.search.origin_update([origin1])
+ self.search.flush()
+
+ result_page = self.search.origin_search(url_pattern="language in ['foo bar']")
+ assert result_page.next_page_token is None
+ assert result_page.results == [origin1]
+
+ result_page = self.search.origin_search(url_pattern="baaz")
+ assert result_page.next_page_token is None
+ assert result_page.results == []
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
--- a/swh/search/tests/test_translator.py
+++ b/swh/search/tests/test_translator.py
@@ -146,14 +146,14 @@
def test_keyword_filter():
- query = r"""keyword in [word1, "word2 \" ' word3"]"""
+ query = r"""keyword in [word1, "word2 \" \' word3"]"""
expected = {
"filters": {
"nested": {
"path": "intrinsic_metadata",
"query": {
"multi_match": {
- "query": r"""word1 word2 \" ' word3""",
+ "query": r"""word1 word2 " ' word3""",
"fields": [
get_expansion("keywords", ".") + "^2",
get_expansion("descriptions", "."),
@@ -307,3 +307,45 @@
expected = {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}}
_test_results(query, expected)
+
+
+def test_keyword_no_escape_inside_filter():
+ # any keyword (filter name/operator/value) inside a filter
+ # must be considered a string.
+ query = r'''origin = "language in [\'go lang\', python]"'''
+ expected = {
+ "filters": {
+ "multi_match": {
+ "query": r"""language in ['go lang', python]""",
+ "type": "bool_prefix",
+ "operator": "and",
+ "fields": [
+ "url.as_you_type",
+ "url.as_you_type._2gram",
+ "url.as_you_type._3gram",
+ ],
+ }
+ }
+ }
+ _test_results(query, expected)
+
+
+def test_escaped_punctuation_parsing():
+ query = r"""keyword in ["foo \'\" bar"]"""
+ expected = {
+ "filters": {
+ "nested": {
+ "path": "intrinsic_metadata",
+ "query": {
+ "multi_match": {
+ "query": r"""foo '" bar""",
+ "fields": [
+ get_expansion("keywords", ".") + "^2",
+ get_expansion("descriptions", "."),
+ ],
+ }
+ },
+ }
+ }
+ }
+ _test_results(query, expected)
diff --git a/swh/search/translator.py b/swh/search/translator.py
--- a/swh/search/translator.py
+++ b/swh/search/translator.py
@@ -3,7 +3,7 @@
from pkg_resources import resource_filename
from tree_sitter import Language, Parser
-from swh.search.utils import get_expansion
+from swh.search.utils import get_expansion, unescape
class Translator:
@@ -70,8 +70,10 @@
filters2 = self._traverse(node.children[2])
if conj_op == "and":
+ # "must" is equivalent to "AND"
return {"bool": {"must": [filters1, filters2]}}
if conj_op == "or":
+ # "should" is equivalent to "OR"
return {"bool": {"should": [filters1, filters2]}}
if node.type == "filter":
@@ -104,14 +106,13 @@
value = self.query[start:end]
if len(value) > 1 and (
- (value[0] == "'" and value[1] == "'") or (value[0] and value[-1] == '"')
+ (value[0] == "'" and value[-1] == "'") or (value[0] and value[-1] == '"')
):
- return value[1:-1]
+ return unescape(value[1:-1])
if node.type in ["number", "numberVal"]:
return int(value)
-
- return value
+ return unescape(value)
def _parse_filter(self, filter):
@@ -145,9 +146,18 @@
"query": {
"multi_match": {
"query": value,
+ # Makes it so that the "foo bar" query returns
+ # documents which contain "foo" in a field and "bar"
+ # in a different field
"type": "cross_fields",
+ # All keywords must be found in a document for it to
+ # be considered a match.
+ # TODO: allow missing keywords?
"operator": "and",
+ # Searches on all fields of the intrinsic_metadata dict,
+ # recursively.
"fields": ["intrinsic_metadata.*"],
+ # date{Created,Modified,Published} are of type date
"lenient": True,
}
},
@@ -190,6 +200,9 @@
"fields": [
get_expansion("keywords", ".") + "^2",
get_expansion("descriptions", "."),
+ # "^2" boosts an origin's score by 2x
+ # if the queried keywords are
+ # found in its intrinsic_metadata.keywords
],
}
},
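The comments restored above spell out the Elasticsearch bool semantics the translator relies on: "must" behaves as AND and "should" as OR. A small sketch of the resulting shape, assuming the grammar is built:

    from swh.search.translator import Translator

    translator = Translator()
    parsed = translator.parse_query("visits >= 5 and license in [gpl]")
    # "and" conjunctions become {"bool": {"must": [...]}},
    # "or" conjunctions become {"bool": {"should": [...]}}.
    assert "must" in parsed["filters"]["bool"]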
diff --git a/swh/search/utils.py b/swh/search/utils.py
--- a/swh/search/utils.py
+++ b/swh/search/utils.py
@@ -55,3 +55,51 @@
return True
except Exception:
return False
+
+
+def escape(obj):
+ r"""Makes the object directly injectable into the
+ query language by converting the escapable parts of
+ the object into escape sequences.
+
+ For strings, inserts a \ before the special characters ', ", and \
+
+ For arrays, applies the same transformation on each element, joins the
+ elements and returns a string-like representation of the list.
+
+ >>> print(escape("foo ' bar"))
+ "foo \' bar"
+
+ >>> print(escape([r"foo ' bar", r"bar \\\' baz", r'foo " baz']))
+ ["foo \' bar", "bar \\\\\\\' baz", "foo \" baz"]
+
+ """
+ if type(obj) == list:
+ items = [escape(item) for item in obj]
+ return "[" + ", ".join(items) + "]"
+ elif type(obj) == str:
+ return (
+ '"'
+ + obj.translate({ord("'"): r"\'", ord('"'): r"\"", ord("\\"): r"\\",})
+ + '"'
+ )
+ else:
+ raise Exception(f"Unexpected item type {type(obj)}")
+
+
+def unescape(string):
+ r"""Processes the escaped special characters
+
+ >>> unescape(r'''foo " bar''') == r'''foo " bar'''
+ True
+ >>> unescape(r'''foo \" bar''') == r'''foo " bar'''
+ True
+ >>> unescape(r'''foo \\" bar''') == r'''foo \" bar'''
+ True
+ >>> unescape(r'''foo \\\" bar''') == r'''foo \" bar'''
+ True
+ >>> unescape(r'''foo \\\\" bar''') == r'''foo \\" bar'''
+ True
+ """
+
+ return bytes(string, "utf-8").decode("unicode_escape")
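escape and unescape are meant to be inverses across the quoting boundary: escape wraps a raw value in double quotes and backslash-escapes quotes and backslashes, and unescape decodes those sequences when the translator reads the value back. A quick round-trip sketch:

    from swh.search.utils import escape, unescape

    raw = "foo ' bar"
    quoted = escape(raw)  # yields "foo \' bar", surrounding double quotes included
    assert unescape(quoted[1:-1]) == raw  # stripping the quotes round-trips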
Attached to: D6046: elasticsearch.py: Integrate query language translator