diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py --- a/swh/scheduler/backend.py +++ b/swh/scheduler/backend.py @@ -348,8 +348,21 @@ where_clauses.append("visit_type = %s") query_args.append(visit_type) - # TODO: filter on last_scheduled "too recent" to avoid always - # re-scheduling the same tasks. + # Don't re-schedule visits if they're already scheduled but we haven't + # recorded a result yet, unless they've been scheduled more than a week + # ago (it probably means we've lost them in flight somewhere). + where_clauses.append( + """origin_visit_stats.last_scheduled IS NULL + OR origin_visit_stats.last_scheduled < GREATEST( + %s - '7 day'::interval, + origin_visit_stats.last_eventful, + origin_visit_stats.last_uneventful, + origin_visit_stats.last_failed, + origin_visit_stats.last_notfound + ) + """ + ) + query_args.append(timestamp) if policy == "oldest_scheduled_first": order_by = "origin_visit_stats.last_scheduled NULLS FIRST" diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py --- a/swh/scheduler/tests/test_scheduler.py +++ b/swh/scheduler/tests/test_scheduler.py @@ -764,7 +764,7 @@ """Calls grab_next_visits with the passed policy, and check that all the origins returned are the expected ones (in the same order), and that no extra origins are returned. Also checks the origin visits have - been marked as scheduled.""" + been marked as scheduled, and are only re-scheduled a week later""" assert len(expected) != 0 before = utcnow() @@ -785,6 +785,25 @@ # Check that last_scheduled got updated assert before <= visit_stats.last_scheduled <= after + # They should not be scheduled again + ret = swh_scheduler.grab_next_visits( + visit_type=visit_type, count=len(expected) + 1, policy=policy, + ) + assert ret == [], "grab_next_visits returned already-scheduled origins" + + # But a week later, they should + ret = swh_scheduler.grab_next_visits( + visit_type=visit_type, + count=len(expected) + 1, + policy=policy, + timestamp=after + datetime.timedelta(days=7), + ) + # We need to sort them because their 'last_scheduled' field is updated to + # exactly the same value, so the order is not deterministic + assert sorted(ret) == sorted( + expected + ), "grab_next_visits didn't reschedule visits after a week" + def test_grab_next_visits_oldest_scheduled_first( self, swh_scheduler, listed_origins_by_type, ):