diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py
--- a/swh/scheduler/backend.py
+++ b/swh/scheduler/backend.py
@@ -346,6 +346,7 @@
         scheduled_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=7),
         failed_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=14),
         not_found_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=31),
+        tablesample: Optional[float] = None,
         db=None,
         cur=None,
     ) -> List[ListedOrigin]:
@@ -450,12 +451,21 @@
         else:
             raise UnknownPolicy(f"Unknown scheduling policy {policy}")
 
+        if tablesample is not None:
+            # Only None disables sampling, as documented in the interface.
+            # NOTE(review): the percentage is inserted first on the assumption
+            # that no other placeholder precedes the FROM clause -- confirm.
+            table = "listed_origins tablesample SYSTEM (%s)"
+            query_args.insert(0, tablesample)
+        else:
+            table = "listed_origins"
+
         # fmt: off
         common_table_expressions.insert(0, ("selected_origins", f"""
             SELECT
               {origin_select_cols}, next_visit_queue_position
             FROM
-              listed_origins
+              {table}
             LEFT JOIN
               origin_visit_stats USING (url, visit_type)
             WHERE
diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py
--- a/swh/scheduler/interface.py
+++ b/swh/scheduler/interface.py
@@ -398,6 +398,7 @@
         scheduled_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=7),
         failed_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=14),
         not_found_cooldown: Optional[datetime.timedelta] = datetime.timedelta(days=31),
+        tablesample: Optional[float] = None,
     ) -> List[ListedOrigin]:
         """Get at most the `count` next origins that need to be visited with
         the `visit_type` loader according to the given scheduling `policy`.
@@ -417,6 +418,8 @@
             failed origin
           not_found_cooldown: the minimal interval before which we can reschedule a
             not_found origin
+          tablesample: the percentage (between 0 and 100) of the listed_origins
+            table sampled before the query is run (None: no sampling)
 
         """
         ...
diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py
--- a/swh/scheduler/tests/test_scheduler.py
+++ b/swh/scheduler/tests/test_scheduler.py
@@ -943,6 +943,25 @@
             expected
         ), "grab_next_visits didn't reschedule visits after the configured cooldown"
 
+    def test_grab_next_visits_tablesample(
+        self, swh_scheduler, listed_origins_by_type,
+    ):
+        """Smoke-test grab_next_visits with table sampling enabled."""
+        visit_type, _, expected = self._prepare_oldest_scheduled_first_origins(
+            swh_scheduler, listed_origins_by_type
+        )
+        ret = swh_scheduler.grab_next_visits(
+            visit_type=visit_type,
+            policy="oldest_scheduled_first",
+            tablesample=50,
+            count=len(expected),
+        )
+
+        # Sampling is random, so the exact rows returned cannot be pinned down;
+        # only check what must hold for any sample.
+        assert ret is not None
+        assert len(ret) <= len(expected)
+
     def test_grab_next_visits_never_visited_oldest_update_first(
         self, swh_scheduler, listed_origins_by_type,
     ):