diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -38,7 +38,9 @@ "intrinsic_metadata", "visit_types", "nb_visits", + "snapshot_id", "last_visit_date", + "last_eventful_visit_date", ): if field_name in origin: res[field_name] = origin.pop(field_name) @@ -144,7 +146,9 @@ # used to filter out origins that were never visited "has_visits": {"type": "boolean",}, "nb_visits": {"type": "integer"}, + "snapshot_id": {"type": "keyword"}, "last_visit_date": {"type": "date"}, + "last_eventful_visit_date": {"type": "date"}, "intrinsic_metadata": { "type": "nested", "properties": { @@ -179,6 +183,8 @@ List visit_types = ctx._source.getOrDefault("visit_types", []); int nb_visits = ctx._source.getOrDefault("nb_visits", 0); ZonedDateTime last_visit_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_visit_date", "0001-01-01T00:00:00Z")); + String snapshot_id = ctx._source.getOrDefault("snapshot_id", ""); + ZonedDateTime last_eventful_visit_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_eventful_visit_date", "0001-01-01T00:00:00Z")); // update origin document with new field values ctx._source.putAll(params); @@ -208,6 +214,18 @@ ctx._source.last_visit_date = last_visit_date; } } + + // Undo update of last_eventful_visit_date and snapshot_id if + // snapshot_id hasn't changed OR incoming_last_eventful_visit_date is older + if (ctx._source.containsKey("snapshot_id")) { + String incoming_snapshot_id = ctx._source.getOrDefault("snapshot_id", ""); + ZonedDateTime incoming_last_eventful_visit_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_eventful_visit_date", "0001-01-01T00:00:00Z")); + int difference = incoming_last_eventful_visit_date.compareTo(last_eventful_visit_date); // negative, zero, or positive + if(snapshot_id == incoming_snapshot_id || difference < 0){ + ctx._source.snapshot_id = snapshot_id; + ctx._source.last_eventful_visit_date = last_eventful_visit_date; + } + } """ # 
noqa ) @@ -252,6 +270,7 @@ visit_types: Optional[List[str]] = None, min_nb_visits: int = 0, min_last_visit_date: str = "", + min_last_eventful_visit_date: str = "", page_token: Optional[str] = None, limit: int = 50, ) -> PagedResult[MinimalOriginDict]: @@ -317,6 +336,16 @@ } } ) + if min_last_eventful_visit_date: + query_clauses.append( + { + "range": { + "last_eventful_visit_date": { + "gte": min_last_eventful_visit_date.replace("Z", "+00:00"), + } + } + } + ) if visit_types is not None: query_clauses.append({"terms": {"visit_types": visit_types}}) diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -79,6 +79,27 @@ .replace("Z", "+00:00") ), ).isoformat() + + if "snapshot_id" in document and "last_eventful_visit_date" in document: + incoming_date = datetime.fromisoformat( + document["last_eventful_visit_date"] + ) + current_date = datetime.fromisoformat( + self._origins[id_] + .get("last_eventful_visit_date", "0001-01-01T00:00:00Z",) + .replace("Z", "+00:00") + ) + incoming_snapshot_id = document["snapshot_id"] + current_snapshot_id = self._origins[id_].get("snapshot_id", "") + + if ( + incoming_snapshot_id == current_snapshot_id + or incoming_date < current_date + ): + # update not required so override the incoming_values + document["snapshot_id"] = current_snapshot_id + document["last_eventful_visit_date"] = current_date.isoformat() + self._origins[id_].update(document) if id_ not in self._origin_ids: @@ -94,6 +115,7 @@ page_token: Optional[str] = None, min_nb_visits: int = 0, min_last_visit_date: str = "", + min_last_eventful_visit_date: str = "", limit: int = 50, ) -> PagedResult[MinimalOriginDict]: hits: Iterator[Dict[str, Any]] = ( @@ -149,11 +171,26 @@ hits = filter(lambda o: o.get("nb_visits", 0) >= min_nb_visits, hits) if min_last_visit_date: hits = filter( - lambda o: datetime.fromisoformat(o.get("last_visit_date", "")) + lambda o: datetime.fromisoformat( + o.get("last_visit_date", 
"0001-01-01T00:00:00Z").replace( + "Z", "+00:00" + ) + ) >= datetime.fromisoformat(min_last_visit_date), hits, ) + if min_last_eventful_visit_date: + hits = filter( + lambda o: datetime.fromisoformat( + o.get("last_eventful_visit_date", "0001-01-01T00:00:00Z").replace( + "Z", "+00:00" + ) + ) + >= datetime.fromisoformat(min_last_eventful_visit_date), + hits, + ) + if visit_types is not None: visit_types_set = set(visit_types) hits = filter( diff --git a/swh/search/interface.py b/swh/search/interface.py --- a/swh/search/interface.py +++ b/swh/search/interface.py @@ -61,6 +61,7 @@ page_token: Optional[str] = None, min_nb_visits: int = 0, min_last_visit_date: str = "", + min_last_eventful_visit_date: str = "", limit: int = 50, ) -> PagedResult[MinimalOriginDict]: """Searches for origins matching the `url_pattern`. @@ -76,6 +77,9 @@ the provided value min_last_visit_date: Filter origins that have last_visit_date on or after the provided date(ISO format) + min_last_eventful_visit_date: Filter origins that have + last_eventful_visit_date (eventful = snapshot_id changed) + on or after the provided date(ISO format) limit: number of results to return Returns: diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py --- a/swh/search/journal_client.py +++ b/swh/search/journal_client.py @@ -63,7 +63,9 @@ "url": (visit_status["origin"]), "has_visits": True, "nb_visits": visit_status["visit"], + "snapshot_id": visit_status.get("snapshot"), "last_visit_date": visit_status["date"].isoformat(), + "last_eventful_visit_date": visit_status["date"].isoformat(), } for visit_status in visit_statuses if visit_status["status"] == "full" diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -53,6 +53,7 @@ "status": "full", "visit": 5, "date": current_datetime, + "snapshot": None, } # full visits ok ] } @@ -63,7 +64,9 @@ "url": 
"http://foobar.baz", "has_visits": True, "nb_visits": 5, + "snapshot_id": None, "last_visit_date": current_datetime.isoformat(), + "last_eventful_visit_date": current_datetime.isoformat(), }, ] ) diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -4,12 +4,35 @@ # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone +from itertools import permutations from hypothesis import given, settings, strategies import pytest from swh.core.api.classes import stream_results +NOW = datetime.now(tz=timezone.utc).isoformat() +NOW_MINUS_5_HOURS = (datetime.now(tz=timezone.utc) - timedelta(hours=5)).isoformat() +NOW_PLUS_5_HOURS = (datetime.now(tz=timezone.utc) + timedelta(hours=5)).isoformat() + +VISIT_STATUSES = [ + { + "url": "http://foobar.baz", + "snapshot_id": "SNAPSHOT_1", + "last_eventful_visit_date": NOW, + }, + { + "url": "http://foobar.baz", + "snapshot_id": "SNAPSHOT_1", + "last_eventful_visit_date": NOW_MINUS_5_HOURS, + }, + { + "url": "http://foobar.baz", + "snapshot_id": "SNAPSHOT_2", + "last_eventful_visit_date": NOW_PLUS_5_HOURS, + }, +] + class CommonSearchTest: def test_origin_url_unique_word_prefix(self): @@ -258,6 +281,80 @@ _check_min_last_visit_date(now_plus_5_hours) # Works for = _check_min_last_visit_date(now) # Works for < + def test_journal_client_origin_visit_status_permutation(self): + for visit_statuses in permutations(VISIT_STATUSES, len(VISIT_STATUSES)): + self.search.origin_update(visit_statuses) + self.search.flush() + origin_url = "http://foobar.baz" + actual_page = self.search.origin_search( + url_pattern=origin_url, min_last_eventful_visit_date=NOW_PLUS_5_HOURS, + ) + assert actual_page.next_page_token is None + results = [r["url"] for r in actual_page.results] + expected_results = [origin_url] + assert sorted(results) == sorted(expected_results) + + def 
test_origin_last_eventful_visit_date_update_search(self): + origin_url = "http://foobar.baz" + self.search.origin_update([{"url": origin_url}]) + self.search.flush() + + def _update_last_eventful_visit_date(snapshot_id, last_eventful_visit_date): + self.search.origin_update( + [ + { + "url": origin_url, + "snapshot_id": snapshot_id, + "last_eventful_visit_date": last_eventful_visit_date, + } + ] + ) + self.search.flush() + + def _check_min_last_eventful_visit_date(min_last_eventful_visit_date): + actual_page = self.search.origin_search( + url_pattern=origin_url, + min_last_eventful_visit_date=min_last_eventful_visit_date, + ) + assert actual_page.next_page_token is None + results = [r["url"] for r in actual_page.results] + expected_results = [origin_url] + assert sorted(results) == sorted(expected_results) + + now = datetime.now(tz=timezone.utc).isoformat() + now_minus_5_hours = ( + datetime.now(tz=timezone.utc) - timedelta(hours=5) + ).isoformat() + now_plus_5_hours = ( + datetime.now(tz=timezone.utc) + timedelta(hours=5) + ).isoformat() + + snapshot_1 = "SNAPSHOT_1" + snapshot_2 = "SNAPSHOT_2" + + _update_last_eventful_visit_date(snapshot_1, now) + + _check_min_last_eventful_visit_date(now) # Works for = + _check_min_last_eventful_visit_date(now_minus_5_hours) # Works for < + with pytest.raises(AssertionError): + _check_min_last_eventful_visit_date(now_plus_5_hours) # Fails for > + + _update_last_eventful_visit_date( + snapshot_1, now_plus_5_hours + ) # Revisit(not eventful) same origin + + _check_min_last_eventful_visit_date( + now + ) # Should remain the same because recent visit wasn't eventful + with pytest.raises(AssertionError): + _check_min_last_eventful_visit_date(now_plus_5_hours) + + _update_last_eventful_visit_date( + snapshot_2, now_plus_5_hours + ) # Revisit(eventful) same origin + _check_min_last_eventful_visit_date(now_plus_5_hours) # Works for = + _check_min_last_eventful_visit_date(now) # Works for < + def 
test_origin_update_with_no_visit_types(self): """ Update an origin with visit types first then with no visit types,