diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -309,6 +309,7 @@ min_last_revision_date: str = "", min_last_release_date: str = "", page_token: Optional[str] = None, + sort_by: str = "", limit: int = 50, ) -> PagedResult[MinimalOriginDict]: query_clauses: List[Dict[str, Any]] = [] @@ -407,6 +408,26 @@ if visit_types is not None: query_clauses.append({"terms": {"visit_types": visit_types}}) + sorting_params = [] + order = "asc" + + if sort_by and sort_by[0] == "-": + sort_by = sort_by[1:] + order = "desc" + if sort_by in [ + "nb_visits", + "last_visit_date", + "last_eventful_visit_date", + "last_revision_date", + "last_release_date", + ]: + sorting_params = [{sort_by: order}] + + sorting_params += [ + {"_score": "desc"}, + {"sha1": "asc"}, + ] + body = { "query": { "bool": { @@ -414,7 +435,7 @@ "must_not": [{"term": {"blocklisted": True}}], } }, - "sort": [{"_score": "desc"}, {"sha1": "asc"},], + "sort": sorting_params, } if page_token: diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -5,7 +5,6 @@ from collections import defaultdict from datetime import datetime -import itertools import re from typing import Any, Dict, Iterable, Iterator, List, Optional @@ -136,6 +135,7 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + sort_by: str = "", limit: int = 50, ) -> PagedResult[MinimalOriginDict]: hits: Iterator[Dict[str, Any]] = ( @@ -239,11 +239,35 @@ hits, ) + order = "asc" + if sort_by and sort_by[0] == "-": + sort_by = sort_by[1:] + order = "desc" + + hits_list = list(hits) + if sort_by in [ + "last_visit_date", + "last_eventful_visit_date", + "last_revision_date", + "last_release_date", + ]: # ISO 8601 date fields; a missing value falls back to 0001-01-01 + hits_list = sorted( + hits_list, + key=lambda o: datetime.fromisoformat( + o.get(sort_by, "0001-01-01T00:00:00Z").replace("Z", 
"+00:00") + ), + reverse=(order == "desc"), + ) + elif sort_by in ["nb_visits"]: # integer fields; a missing value counts as 0 + hits_list = sorted( + hits_list, key=lambda o: o.get(sort_by, 0), reverse=(order == "desc") + ) + start_at_index = int(page_token) if page_token else 0 origins = [ {"url": hit["url"]} - for hit in itertools.islice(hits, start_at_index, start_at_index + limit) + for hit in hits_list[start_at_index : start_at_index + limit] ] if len(origins) == limit: diff --git a/swh/search/interface.py b/swh/search/interface.py --- a/swh/search/interface.py +++ b/swh/search/interface.py @@ -64,6 +64,7 @@ min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", + sort_by: str = "", limit: int = 50, ) -> PagedResult[MinimalOriginDict]: """Searches for origins matching the `url_pattern`. @@ -86,6 +87,9 @@ last_revision_date on or after the provided date(ISO format) min_last_release_date: Filter origins that have last_release_date on or after the provided date(ISO format) + sort_by: Field to sort results by: one of nb_visits, last_visit_date, + last_eventful_visit_date, last_revision_date or + last_release_date (ascending). 
Prefix with "-" for descending order limit: number of results to return Returns: diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -409,6 +409,52 @@ date_type="last_revision_date" ) + def test_origin_sort_by_search(self): + + now = datetime.now(tz=timezone.utc).isoformat() + now_minus_5_hours = ( + datetime.now(tz=timezone.utc) - timedelta(hours=5) + ).isoformat() + now_plus_5_hours = ( + datetime.now(tz=timezone.utc) + timedelta(hours=5) + ).isoformat() + + ORIGINS = [ + { + "url": "http://foobar.1.com", + "nb_visits": 1, + "last_visit_date": now_minus_5_hours, + }, + {"url": "http://foobar.2.com", "nb_visits": 2, "last_visit_date": now}, + { + "url": "http://foobar.3.com", + "nb_visits": 3, + "last_visit_date": now_plus_5_hours, + }, + ] + self.search.origin_update(ORIGINS) + self.search.flush() + + page = self.search.origin_search(url_pattern="foobar", sort_by="nb_visits") + results = [r["url"] for r in page.results] + assert results == [origin["url"] for origin in ORIGINS] + + page = self.search.origin_search(url_pattern="foobar", sort_by="-nb_visits") + results = [r["url"] for r in page.results] + assert results == [origin["url"] for origin in ORIGINS][::-1] + + page = self.search.origin_search( + url_pattern="foobar", sort_by="last_visit_date" + ) + results = [r["url"] for r in page.results] + assert results == [origin["url"] for origin in ORIGINS] + + page = self.search.origin_search( + url_pattern="foobar", sort_by="-last_visit_date" + ) + results = [r["url"] for r in page.results] + assert results == [origin["url"] for origin in ORIGINS][::-1] + def test_origin_update_with_no_visit_types(self): """ Update an origin with visit types first then with no visit types,