Changeset View
Changeset View
Standalone View
Standalone View
swh/scheduler/tests/test_scheduler.py
Show First 20 Lines • Show All 749 Lines • ▼ Show 20 Lines | class TestScheduler: | ||||
def test_get_listed_origins_all(self, swh_scheduler, listed_origins) -> None: | def test_get_listed_origins_all(self, swh_scheduler, listed_origins) -> None: | ||||
swh_scheduler.record_listed_origins(listed_origins) | swh_scheduler.record_listed_origins(listed_origins) | ||||
ret = swh_scheduler.get_listed_origins(limit=len(listed_origins) + 1) | ret = swh_scheduler.get_listed_origins(limit=len(listed_origins) + 1) | ||||
assert ret.next_page_token is None | assert ret.next_page_token is None | ||||
assert len(ret.origins) == len(listed_origins) | assert len(ret.origins) == len(listed_origins) | ||||
@pytest.mark.parametrize("policy", ["oldest_scheduled_first"]) | def _grab_next_visits_setup(self, swh_scheduler, listed_origins_by_type): | ||||
def test_grab_next_visits(self, swh_scheduler, listed_origins_by_type, policy): | """Basic origins setup for scheduling policy tests""" | ||||
NUM_RESULTS = 5 | |||||
# Strict inequality to check that grab_next_visits doesn't return more | |||||
# results than requested | |||||
visit_type = next(iter(listed_origins_by_type)) | visit_type = next(iter(listed_origins_by_type)) | ||||
assert len(listed_origins_by_type[visit_type]) > NUM_RESULTS | origins = listed_origins_by_type[visit_type][:100] | ||||
assert len(origins) > 0 | |||||
for origins in listed_origins_by_type.values(): | recorded_origins = swh_scheduler.record_listed_origins(origins) | ||||
swh_scheduler.record_listed_origins(origins) | |||||
return visit_type, recorded_origins | |||||
def _check_grab_next_visit(self, swh_scheduler, visit_type, policy, expected): | |||||
"""Calls grab_next_visits with the passed policy, and check that all | |||||
the origins returned are the expected ones (in the same order), and | |||||
that no extra origins are returned. Also checks the origin visits have | |||||
been marked as scheduled.""" | |||||
assert len(expected) != 0 | |||||
before = utcnow() | before = utcnow() | ||||
ret = swh_scheduler.grab_next_visits(visit_type, NUM_RESULTS, policy=policy) | ret = swh_scheduler.grab_next_visits( | ||||
visit_type=visit_type, | |||||
# Request one more than expected to check that no extra origin is returned | |||||
count=len(expected) + 1, | |||||
policy=policy, | |||||
) | |||||
after = utcnow() | after = utcnow() | ||||
assert len(ret) == NUM_RESULTS | assert ret == expected | ||||
for origin in ret: | visit_stats_list = swh_scheduler.origin_visit_stats_get( | ||||
pk = (origin.url, origin.visit_type) | [(origin.url, origin.visit_type) for origin in expected] | ||||
visit_stats = swh_scheduler.origin_visit_stats_get([pk])[0] | ) | ||||
assert visit_stats is not None | assert len(visit_stats_list) == len(expected) | ||||
for visit_stats in visit_stats_list: | |||||
# Check that last_scheduled got updated | |||||
assert before <= visit_stats.last_scheduled <= after | assert before <= visit_stats.last_scheduled <= after | ||||
@pytest.mark.parametrize("policy", ["oldest_scheduled_first"]) | def test_grab_next_visits_oldest_scheduled_first( | ||||
def test_grab_next_visits_underflow( | self, swh_scheduler, listed_origins_by_type, | ||||
self, swh_scheduler, listed_origins_by_type, policy | |||||
): | ): | ||||
NUM_RESULTS = 5 | visit_type, origins = self._grab_next_visits_setup( | ||||
visit_type = next(iter(listed_origins_by_type)) | swh_scheduler, listed_origins_by_type | ||||
assert len(listed_origins_by_type[visit_type]) > NUM_RESULTS | ) | ||||
# Give all origins but one a last_scheduled date | |||||
base_date = datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) | |||||
visit_stats = [ | |||||
OriginVisitStats( | |||||
url=origin.url, | |||||
visit_type=origin.visit_type, | |||||
last_snapshot=None, | |||||
last_eventful=None, | |||||
last_uneventful=None, | |||||
last_failed=None, | |||||
last_notfound=None, | |||||
last_scheduled=base_date - datetime.timedelta(seconds=i), | |||||
) | |||||
for i, origin in enumerate(origins[1:]) | |||||
] | |||||
swh_scheduler.origin_visit_stats_upsert(visit_stats) | |||||
swh_scheduler.record_listed_origins( | # We expect to retrieve the origin with a NULL last_scheduled | ||||
listed_origins_by_type[visit_type][:NUM_RESULTS] | # as well as those with the oldest values (i.e. the last ones), in order. | ||||
expected = [origins[0]] + origins[1:][::-1] | |||||
self._check_grab_next_visit( | |||||
swh_scheduler, | |||||
visit_type=visit_type, | |||||
policy="oldest_scheduled_first", | |||||
expected=expected, | |||||
) | ) | ||||
ret = swh_scheduler.grab_next_visits(visit_type, NUM_RESULTS + 2, policy=policy) | def test_grab_next_visits_underflow(self, swh_scheduler, listed_origins_by_type): | ||||
ardumont: there are* | |||||
"""Check that grab_next_visits works when there not enough origins in | |||||
the database""" | |||||
visit_type = next(iter(listed_origins_by_type)) | |||||
assert len(ret) == NUM_RESULTS | # Only add 5 origins to the database | ||||
origins = listed_origins_by_type[visit_type][:5] | |||||
assert origins | |||||
swh_scheduler.record_listed_origins(origins) | |||||
ret = swh_scheduler.grab_next_visits( | |||||
visit_type, len(origins) + 2, policy="oldest_scheduled_first" | |||||
) | |||||
assert len(ret) == 5 | |||||
def test_grab_next_visits_unknown_policy(self, swh_scheduler): | def test_grab_next_visits_unknown_policy(self, swh_scheduler): | ||||
unknown_policy = "non_existing_policy" | |||||
NUM_RESULTS = 5 | NUM_RESULTS = 5 | ||||
with pytest.raises(UnknownPolicy, match="non_existing_policy"): | with pytest.raises(UnknownPolicy, match=unknown_policy): | ||||
swh_scheduler.grab_next_visits( | swh_scheduler.grab_next_visits("type", NUM_RESULTS, policy=unknown_policy) | ||||
"type", NUM_RESULTS, policy="non_existing_policy" | |||||
) | |||||
def _create_task_types(self, scheduler): | def _create_task_types(self, scheduler): | ||||
for tt in TASK_TYPES.values(): | for tt in TASK_TYPES.values(): | ||||
scheduler.create_task_type(tt) | scheduler.create_task_type(tt) | ||||
def test_origin_visit_stats_get_empty(self, swh_scheduler) -> None: | def test_origin_visit_stats_get_empty(self, swh_scheduler) -> None: | ||||
assert swh_scheduler.origin_visit_stats_get([]) == [] | assert swh_scheduler.origin_visit_stats_get([]) == [] | ||||
▲ Show 20 Lines • Show All 350 Lines • Show Last 20 Lines |
there are*