diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py --- a/swh/scheduler/backend.py +++ b/swh/scheduler/backend.py @@ -332,6 +332,7 @@ policy: str, timestamp: Optional[datetime.datetime] = None, scheduled_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=7), + failed_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=14), db=None, cur=None, ) -> List[ListedOrigin]: @@ -369,6 +370,15 @@ query_args.append(timestamp) query_args.append(scheduled_cooldown) + if failed_cooldown: + # Don't retry failed origins too often + where_clauses.append( + "origin_visit_stats.last_failed is null " + "or origin_visit_stats.last_failed < %s - %s" + ) + query_args.append(timestamp) + query_args.append(failed_cooldown) + if policy == "oldest_scheduled_first": order_by = "origin_visit_stats.last_scheduled NULLS FIRST" elif policy == "never_visited_oldest_update_first": diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py --- a/swh/scheduler/interface.py +++ b/swh/scheduler/interface.py @@ -396,6 +396,7 @@ policy: str, timestamp: Optional[datetime.datetime] = None, scheduled_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=7), + failed_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=14), ) -> List[ListedOrigin]: """Get at most the `count` next origins that need to be visited with the `visit_type` loader according to the given scheduling `policy`. @@ -411,6 +412,8 @@ being scheduled (defaults to the current time) scheduled_cooldown: the minimal interval before which we can schedule the same origin again + failed_cooldown: the minimal interval before which we can reschedule a + failed origin """ ... diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py --- a/swh/scheduler/tests/test_scheduler.py +++ b/swh/scheduler/tests/test_scheduler.py @@ -870,9 +870,10 @@ expected=expected, ) + @pytest.mark.parametrize("which_cooldown", ("scheduled", "failed")) @pytest.mark.parametrize("cooldown", (7, 15)) - def test_grab_next_visits_oldest_scheduled_first_scheduled_cooldown( - self, swh_scheduler, listed_origins_by_type, cooldown + def test_grab_next_visits_cooldowns( + self, swh_scheduler, listed_origins_by_type, which_cooldown, cooldown, ): visit_type, origins, expected = self._prepare_oldest_scheduled_first_origins( swh_scheduler, listed_origins_by_type @@ -884,24 +885,48 @@ expected=expected, ) + # Mark all the visits as `{which_cooldown}` (scheduled, failed or notfound) on + # the `after` timestamp + ovs_args = {"last_failed": None, "last_scheduled": None} + ovs_args[f"last_{which_cooldown}"] = after + + visit_stats = [ + OriginVisitStats( + url=origin.url, + visit_type=origin.visit_type, + last_snapshot=None, + last_eventful=None, + last_uneventful=None, + last_notfound=None, + **ovs_args, + ) + for i, origin in enumerate(origins) + ] + swh_scheduler.origin_visit_stats_upsert(visit_stats) + cooldown_td = datetime.timedelta(days=cooldown) + cooldown_args = { + "scheduled_cooldown": None, + "failed_cooldown": None, + } + cooldown_args[f"{which_cooldown}_cooldown"] = cooldown_td ret = swh_scheduler.grab_next_visits( visit_type=visit_type, count=len(expected) + 1, policy="oldest_scheduled_first", timestamp=after + cooldown_td - datetime.timedelta(seconds=1), - scheduled_cooldown=cooldown_td, + **cooldown_args, ) - assert ret == [], "Scheduled cooldown ignored" + assert ret == [], f"{which_cooldown}_cooldown ignored" ret = swh_scheduler.grab_next_visits( visit_type=visit_type, count=len(expected) + 1, policy="oldest_scheduled_first", timestamp=after + cooldown_td + datetime.timedelta(seconds=1), - scheduled_cooldown=cooldown_td, + **cooldown_args, ) assert sorted(ret) == sorted(