diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py --- a/swh/scheduler/backend.py +++ b/swh/scheduler/backend.py @@ -316,6 +316,7 @@ count: int, policy: str, timestamp: Optional[datetime.datetime] = None, + scheduled_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=7), db=None, cur=None, ) -> List[ListedOrigin]: @@ -335,21 +336,23 @@ where_clauses.append("visit_type = %s") query_args.append(visit_type) - # Don't re-schedule visits if they're already scheduled but we haven't - # recorded a result yet, unless they've been scheduled more than a week - # ago (it probably means we've lost them in flight somewhere). - where_clauses.append( - """origin_visit_stats.last_scheduled IS NULL - OR origin_visit_stats.last_scheduled < GREATEST( - %s - '7 day'::interval, - origin_visit_stats.last_eventful, - origin_visit_stats.last_uneventful, - origin_visit_stats.last_failed, - origin_visit_stats.last_notfound + if scheduled_cooldown: + # Don't re-schedule visits if they're already scheduled but we haven't + # recorded a result yet, unless they've been scheduled more than a week + # ago (it probably means we've lost them in flight somewhere). + where_clauses.append( + """origin_visit_stats.last_scheduled IS NULL + OR origin_visit_stats.last_scheduled < GREATEST( + %s - %s, + origin_visit_stats.last_eventful, + origin_visit_stats.last_uneventful, + origin_visit_stats.last_failed, + origin_visit_stats.last_notfound + ) + """ ) - """ - ) - query_args.append(timestamp) + query_args.append(timestamp) + query_args.append(scheduled_cooldown) if policy == "oldest_scheduled_first": order_by = "origin_visit_stats.last_scheduled NULLS FIRST" diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py --- a/swh/scheduler/interface.py +++ b/swh/scheduler/interface.py @@ -389,6 +389,7 @@ count: int, policy: str, timestamp: Optional[datetime.datetime] = None, + scheduled_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=7), ) -> List[ListedOrigin]: """Get at most the `count` next origins that need to be visited with the `visit_type` loader according to the given scheduling `policy`. @@ -402,6 +403,8 @@ policy: the scheduling policy used to select which visits to schedule timestamp: the mocked timestamp at which we're recording that the visits are being scheduled (defaults to the current time) + scheduled_cooldown: the minimal interval before which we can schedule + the same origin again """ ... diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py --- a/swh/scheduler/tests/test_scheduler.py +++ b/swh/scheduler/tests/test_scheduler.py @@ -751,11 +751,20 @@ return visit_type, recorded_origins - def _check_grab_next_visit(self, swh_scheduler, visit_type, policy, expected): - """Calls grab_next_visits with the passed policy, and check that all - the origins returned are the expected ones (in the same order), and - that no extra origins are returned. Also checks the origin visits have - been marked as scheduled, and are only re-scheduled a week later""" + def _check_grab_next_visit_basic( + self, swh_scheduler, visit_type, policy, expected, **kwargs + ): + """Calls grab_next_visits with the passed policy, and check that: + + - all the origins returned are the expected ones (in the same order) + - no extra origins are returned + - the last_scheduled field has been set properly. + + Pass the extra keyword arguments to the calls to grab_next_visits. + + Returns a timestamp greater than all `last_scheduled` values for the grabbed + visits. + """ assert len(expected) != 0 before = utcnow() @@ -764,6 +773,7 @@ # Request one more than expected to check that no extra origin is returned count=len(expected) + 1, policy=policy, + **kwargs, ) after = utcnow() @@ -778,11 +788,24 @@ # They should not be scheduled again ret = swh_scheduler.grab_next_visits( - visit_type=visit_type, count=len(expected) + 1, policy=policy, + visit_type=visit_type, count=len(expected) + 1, policy=policy, **kwargs ) assert ret == [], "grab_next_visits returned already-scheduled origins" - # But a week later, they should + return after + + def _check_grab_next_visit( + self, swh_scheduler, visit_type, policy, expected, **kwargs + ): + """Run the same check as _check_grab_next_visit_basic, but also checks the + origin visits have been marked as scheduled, and are only re-scheduled a + week later + """ + + after = self._check_grab_next_visit_basic( + swh_scheduler, visit_type, policy, expected, **kwargs + ) + # But a week, later, they should ret = swh_scheduler.grab_next_visits( visit_type=visit_type, count=len(expected) + 1, @@ -795,8 +818,8 @@ expected ), "grab_next_visits didn't reschedule visits after a week" - def test_grab_next_visits_oldest_scheduled_first( - self, swh_scheduler, listed_origins_by_type, + def _prepare_oldest_scheduled_first_origins( + self, swh_scheduler, listed_origins_by_type ): visit_type, origins = self._grab_next_visits_setup( swh_scheduler, listed_origins_by_type @@ -823,6 +846,14 @@ # as well as those with the oldest values (i.e. the last ones), in order. expected = [origins[0]] + origins[1:][::-1] + return visit_type, origins, expected + + def test_grab_next_visits_oldest_scheduled_first( + self, swh_scheduler, listed_origins_by_type, + ): + visit_type, origins, expected = self._prepare_oldest_scheduled_first_origins( + swh_scheduler, listed_origins_by_type + ) self._check_grab_next_visit( swh_scheduler, visit_type=visit_type, @@ -830,6 +861,44 @@ expected=expected, ) + @pytest.mark.parametrize("cooldown", (7, 15)) + def test_grab_next_visits_oldest_scheduled_first_scheduled_cooldown( + self, swh_scheduler, listed_origins_by_type, cooldown + ): + visit_type, origins, expected = self._prepare_oldest_scheduled_first_origins( + swh_scheduler, listed_origins_by_type + ) + after = self._check_grab_next_visit_basic( + swh_scheduler, + visit_type=visit_type, + policy="oldest_scheduled_first", + expected=expected, + ) + + cooldown_td = datetime.timedelta(days=cooldown) + + ret = swh_scheduler.grab_next_visits( + visit_type=visit_type, + count=len(expected) + 1, + policy="oldest_scheduled_first", + timestamp=after + cooldown_td - datetime.timedelta(days=1), + scheduled_cooldown=cooldown_td, + ) + + assert ret == [], "Scheduled cooldown ignored" + + ret = swh_scheduler.grab_next_visits( + visit_type=visit_type, + count=len(expected) + 1, + policy="oldest_scheduled_first", + timestamp=after + cooldown_td, + scheduled_cooldown=cooldown_td, + ) + + assert sorted(ret) == sorted( + expected + ), "grab_next_visits didn't reschedule visits after the configured cooldown" + def test_grab_next_visits_never_visited_oldest_update_first( self, swh_scheduler, listed_origins_by_type, ):