diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -37,6 +37,8 @@
         "has_visits",
         "intrinsic_metadata",
         "visit_types",
+        "nb_visits",
+        "last_visit_date",
     ):
         if field_name in origin:
             res[field_name] = origin.pop(field_name)
@@ -141,6 +143,8 @@
                     "visit_types": {"type": "keyword"},
                     # used to filter out origins that were never visited
                     "has_visits": {"type": "boolean",},
+                    "nb_visits": {"type": "integer"},
+                    "last_visit_date": {"type": "date"},
                     "intrinsic_metadata": {
                         "type": "nested",
                         "properties": {
@@ -171,21 +175,40 @@
         # painless script that will be executed when updating an origin document
         update_script = dedent(
             """
-        // backup current visit_types field value
-        List visit_types = ctx._source.getOrDefault("visit_types", []);
-
-        // update origin document with new field values
-        ctx._source.putAll(params);
-
-        // restore previous visit types after visit_types field overriding
-        if (ctx._source.containsKey("visit_types")) {
-            for (int i = 0; i < visit_types.length; ++i) {
-                if (!ctx._source.visit_types.contains(visit_types[i])) {
-                    ctx._source.visit_types.add(visit_types[i]);
-                }
-            }
-        }
-        """
+            // backup current visit_types field value
+            List visit_types = ctx._source.getOrDefault("visit_types", []);
+            int nb_visits = ctx._source.getOrDefault("nb_visits", 0);
+            ZonedDateTime last_visit_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_visit_date", "0001-01-01T00:00:00Z"));
+
+            // update origin document with new field values
+            ctx._source.putAll(params);
+
+            // restore previous visit types after visit_types field overriding
+            if (ctx._source.containsKey("visit_types")) {
+                for (int i = 0; i < visit_types.length; ++i) {
+                    if (!ctx._source.visit_types.contains(visit_types[i])) {
+                        ctx._source.visit_types.add(visit_types[i]);
+                    }
+                }
+            }
+
+            // Undo overwrite if incoming nb_visits is smaller
+            if (ctx._source.containsKey("nb_visits")) {
+                int incoming_nb_visits = ctx._source.getOrDefault("nb_visits", 0);
+                if(incoming_nb_visits < nb_visits){
+                    ctx._source.nb_visits = nb_visits;
+                }
+            }
+
+            // Undo overwrite if incoming last_visit_date is older
+            if (ctx._source.containsKey("last_visit_date")) {
+                ZonedDateTime incoming_last_visit_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_visit_date", "0001-01-01T00:00:00Z"));
+                int difference = incoming_last_visit_date.compareTo(last_visit_date); // returns -1, 0 or 1
+                if(difference < 0){
+                    ctx._source.last_visit_date = last_visit_date;
+                }
+            }
+            """  # noqa
         )
 
         actions = [
@@ -227,6 +250,8 @@
         metadata_pattern: Optional[str] = None,
         with_visit: bool = False,
         visit_types: Optional[List[str]] = None,
+        min_nb_visits: int = 0,
+        min_last_visit_date: str = "",
         page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
@@ -280,6 +305,18 @@
 
         if with_visit:
             query_clauses.append({"term": {"has_visits": True,}})
+        if min_nb_visits:
+            query_clauses.append({"range": {"nb_visits": {"gte": min_nb_visits,},}})
+        if min_last_visit_date:
+            query_clauses.append(
+                {
+                    "range": {
+                        "last_visit_date": {
+                            "gte": min_last_visit_date.replace("Z", "+00:00"),
+                        }
+                    }
+                }
+            )
         if visit_types is not None:
             query_clauses.append({"terms": {"visit_types": visit_types}})
@@ -293,6 +330,7 @@
             },
             "sort": [{"_score": "desc"}, {"sha1": "asc"},],
         }
+
         if page_token:
             # TODO: use ElasticSearch's scroll API?
             page_token_content = token_decode(page_token)
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -4,6 +4,7 @@
 # See top-level LICENSE file for more information
 
 from collections import defaultdict
+from datetime import datetime
 import itertools
 import re
 from typing import Any, Dict, Iterable, Iterator, List, Optional
@@ -65,6 +66,19 @@
                 document["visit_types"] = set(source_document["visit_types"])
                 if "visit_types" in self._origins[id_]:
                     document["visit_types"].update(self._origins[id_]["visit_types"])
+            if "nb_visits" in document:
+                document["nb_visits"] = max(
+                    document["nb_visits"], self._origins[id_].get("nb_visits", 0)
+                )
+            if "last_visit_date" in document:
+                document["last_visit_date"] = max(
+                    datetime.fromisoformat(document["last_visit_date"]),
+                    datetime.fromisoformat(
+                        self._origins[id_]
+                        .get("last_visit_date", "0001-01-01T00:00:00.000000Z",)
+                        .replace("Z", "+00:00")
+                    ),
+                ).isoformat()
             self._origins[id_].update(document)
 
             if id_ not in self._origin_ids:
@@ -78,6 +92,8 @@
         with_visit: bool = False,
         visit_types: Optional[List[str]] = None,
+        min_nb_visits: int = 0,
+        min_last_visit_date: str = "",
         page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
         hits: Iterator[Dict[str, Any]] = (
@@ -129,6 +145,15 @@
 
         if with_visit:
             hits = filter(lambda o: o.get("has_visits"), hits)
+        if min_nb_visits:
+            hits = filter(lambda o: o.get("nb_visits", 0) >= min_nb_visits, hits)
+        if min_last_visit_date:
+            hits = filter(
+                lambda o: "last_visit_date" in o
+                and datetime.fromisoformat(o["last_visit_date"])
+                >= datetime.fromisoformat(min_last_visit_date),
+                hits,
+            )
 
         if visit_types is not None:
             visit_types_set = set(visit_types)
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -59,6 +59,8 @@
         with_visit: bool = False,
         visit_types: Optional[List[str]] = None,
+        min_nb_visits: int = 0,
+        min_last_visit_date: str = "",
         page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
         """Searches for origins matching the `url_pattern`.
@@ -70,6 +72,10 @@
             visit_types: Only origins having any of the provided visit types
                 (e.g. git, svn, pypi) will be returned
+            min_nb_visits: Filter origins that have number of visits >=
+                the provided value
+            min_last_visit_date: Filter origins that have
+                last_visit_date on or after the provided date (ISO format)
             page_token: Opaque value used for pagination
             limit: number of results to return
 
         Returns:
diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py
--- a/swh/search/journal_client.py
+++ b/swh/search/journal_client.py
@@ -59,7 +59,12 @@
     logging.debug("processing origin visit statuses %r", visit_statuses)
 
     full_visit_status = [
-        {"url": (visit_status["origin"]), "has_visits": True,}
+        {
+            "url": (visit_status["origin"]),
+            "has_visits": True,
+            "nb_visits": visit_status["visit"],
+            "last_visit_date": visit_status["date"].isoformat(),
+        }
         for visit_status in visit_statuses
        if visit_status["status"] == "full"
     ]
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -4,6 +4,7 @@
 # See top-level LICENSE file for more information
 
 import copy
+from datetime import datetime, timezone
 import tempfile
 
 from click.testing import CliRunner
@@ -179,6 +180,7 @@
                 {
                     "origin": origin_foobar["url"],
                     "visit": 1,
+                    "date": datetime.now(tz=timezone.utc),
                     "snapshot": None,
                     "status": "full",
                 }
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -3,6 +3,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from datetime import datetime, timezone
 import functools
 from unittest.mock import MagicMock
 
@@ -42,23 +43,45 @@
     search_mock = MagicMock()
     worker_fn = functools.partial(process_journal_objects, search=search_mock,)
 
+    current_datetime = datetime.now(tz=timezone.utc)
     worker_fn(
         {
             "origin_visit_status": [
-                {"origin": "http://foobar.baz", "status": "full"}  # full visits ok
+                {
+                    "origin": "http://foobar.baz",
+                    "status": "full",
+                    "visit": 5,
+                    "date": current_datetime,
+                }  # full visits ok
             ]
         }
     )
     search_mock.origin_update.assert_called_once_with(
-        [{"url": "http://foobar.baz", "has_visits": True},]
+        [
+            {
+                "url": "http://foobar.baz",
+                "has_visits": True,
+                "nb_visits": 5,
+                "last_visit_date": current_datetime.isoformat(),
+            },
+        ]
    )
 
     search_mock.reset_mock()
 
     # non-full visits are filtered out
     worker_fn(
-        {"origin_visit_status": [{"origin": "http://foobar.baz", "status": "partial"}]}
+        {
+            "origin_visit_status": [
+                {
+                    "origin": "http://foobar.baz",
+                    "status": "partial",
+                    "visit": 5,
+                    "date": current_datetime,
+                }
+            ]
+        }
     )
     search_mock.origin_update.assert_not_called()
 
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -3,7 +3,10 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from datetime import datetime, timedelta, timezone
+
 from hypothesis import given, settings, strategies
+import pytest
 
 from swh.core.api.classes import stream_results
 
@@ -184,6 +187,77 @@
             ]
         )
 
+    def test_origin_nb_visits_update_search(self):
+        origin_url = "http://foobar.baz"
+        self.search.origin_update([{"url": origin_url}])
+        self.search.flush()
+
+        def _update_nb_visits(nb_visits):
+            self.search.origin_update([{"url": origin_url, "nb_visits": nb_visits}])
+            self.search.flush()
+
+        def _check_min_nb_visits(min_nb_visits):
+            actual_page = self.search.origin_search(
+                url_pattern=origin_url, min_nb_visits=min_nb_visits,
+            )
+            assert actual_page.next_page_token is None
+            results = [r["url"] for r in actual_page.results]
+            expected_results = [origin_url]
+            assert sorted(results) == sorted(expected_results)
+
+        _update_nb_visits(2)
+        _check_min_nb_visits(2)  # Works for = 2
+        _check_min_nb_visits(1)  # Works for < 2
+
+        with pytest.raises(AssertionError):
+            _check_min_nb_visits(
+                5
+            )  # No results for nb_visits >= 5 (should throw error)
+
+        _update_nb_visits(5)
+        _check_min_nb_visits(5)  # Works for = 5
+        _check_min_nb_visits(3)  # Works for < 5
+
+    def test_origin_last_visit_date_update_search(self):
+        origin_url = "http://foobar.baz"
+        self.search.origin_update([{"url": origin_url}])
+        self.search.flush()
+
+        def _update_last_visit_date(last_visit_date):
+            self.search.origin_update(
+                [{"url": origin_url, "last_visit_date": last_visit_date}]
+            )
+            self.search.flush()
+
+        def _check_min_last_visit_date(min_last_visit_date):
+            actual_page = self.search.origin_search(
+                url_pattern=origin_url, min_last_visit_date=min_last_visit_date,
+            )
+            assert actual_page.next_page_token is None
+            results = [r["url"] for r in actual_page.results]
+            expected_results = [origin_url]
+            assert sorted(results) == sorted(expected_results)
+
+        now = datetime.now(tz=timezone.utc).isoformat()
+        now_minus_5_hours = (
+            datetime.now(tz=timezone.utc) - timedelta(hours=5)
+        ).isoformat()
+        now_plus_5_hours = (
+            datetime.now(tz=timezone.utc) + timedelta(hours=5)
+        ).isoformat()
+
+        _update_last_visit_date(now)
+
+        _check_min_last_visit_date(now)  # Works for =
+        _check_min_last_visit_date(now_minus_5_hours)  # Works for <
+        with pytest.raises(AssertionError):
+            _check_min_last_visit_date(now_plus_5_hours)  # Fails for >
+
+        _update_last_visit_date(now_plus_5_hours)
+
+        _check_min_last_visit_date(now_plus_5_hours)  # Works for =
+        _check_min_last_visit_date(now)  # Works for <
+
     def test_origin_update_with_no_visit_types(self):
         """
         Update an origin with visit types first then with no visit types,