Changeset View
Changeset View
Standalone View
Standalone View
swh/scheduler/tests/test_journal_client.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
from datetime import timedelta | from datetime import timedelta | ||||
import functools | import functools | ||||
from itertools import permutations | from itertools import permutations | ||||
import random | |||||
from unittest.mock import Mock | from unittest.mock import Mock | ||||
import attr | import attr | ||||
import pytest | import pytest | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
from swh.scheduler.journal_client import ( | from swh.scheduler.journal_client import ( | ||||
INTERESTING_EVENTS, | |||||
compute_last_event, | |||||
from_position_offset_to_days, | from_position_offset_to_days, | ||||
max_date, | max_date, | ||||
next_visit_queue_position, | next_visit_queue_position, | ||||
process_journal_objects, | process_journal_objects, | ||||
) | ) | ||||
from swh.scheduler.model import ListedOrigin, OriginVisitStats | from swh.scheduler.model import ListedOrigin, OriginVisitStats | ||||
from swh.scheduler.utils import utcnow | from swh.scheduler.utils import utcnow | ||||
▲ Show 20 Lines • Show All 100 Lines • ▼ Show 20 Lines | def test_journal_client_ignore_missing_type(swh_scheduler): | ||||
process_journal_objects( | process_journal_objects( | ||||
{"origin_visit_status": visit_statuses}, scheduler=swh_scheduler | {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler | ||||
) | ) | ||||
# The message has been ignored: no stats have been upserted | # The message has been ignored: no stats have been upserted | ||||
swh_scheduler.origin_visit_stats_upsert.assert_not_called() | swh_scheduler.origin_visit_stats_upsert.assert_not_called() | ||||
def assert_visit_stats_ok(
    actual_visit_stats, expected_visit_stats, compare_successive_visits=True
):
    """Check that visit stats read from the backend match the expected ones.

    Both collections must have the same length, and every actual entry, once
    normalised, must be present in ``expected_visit_stats``.

    ``next_visit_queue_position`` is always reset to None before comparing, as it
    is covered by dedicated tests.

    When ``compare_successive_visits`` is False, ``successive_visits`` is reset to
    0 on the actual entries as well, so the expected entries must carry 0 for that
    field (presumably the model default -- confirm against OriginVisitStats).

    """
    assert len(actual_visit_stats) == len(expected_visit_stats)

    # Fields neutralised on each actual entry before the membership check
    neutralised_fields = {"next_visit_queue_position": None}
    if not compare_successive_visits:
        neutralised_fields["successive_visits"] = 0

    for actual in actual_visit_stats:
        normalised = attr.evolve(actual, **neutralised_fields)
        assert normalised in expected_visit_stats
def test_journal_client_origin_visit_status_from_journal_last_notfound(swh_scheduler): | def test_journal_client_origin_visit_status_from_journal_last_notfound(swh_scheduler): | ||||
visit_status = { | visit_status = { | ||||
"origin": "foo", | "origin": "foo", | ||||
"visit": 1, | "visit": 1, | ||||
Show All 15 Lines | assert_visit_stats_ok( | ||||
url="foo", | url="foo", | ||||
visit_type="git", | visit_type="git", | ||||
last_eventful=None, | last_eventful=None, | ||||
last_uneventful=None, | last_uneventful=None, | ||||
last_failed=None, | last_failed=None, | ||||
last_notfound=visit_status["date"], | last_notfound=visit_status["date"], | ||||
last_snapshot=None, | last_snapshot=None, | ||||
next_position_offset=5, | next_position_offset=5, | ||||
successive_visits=1, | |||||
) | ) | ||||
], | ], | ||||
) | ) | ||||
visit_statuses = [ | visit_statuses = [ | ||||
{ | { | ||||
"origin": "foo", | "origin": "foo", | ||||
"visit": 3, | "visit": 3, | ||||
Show All 24 Lines | assert_visit_stats_ok( | ||||
url="foo", | url="foo", | ||||
visit_type="git", | visit_type="git", | ||||
last_eventful=None, | last_eventful=None, | ||||
last_uneventful=None, | last_uneventful=None, | ||||
last_failed=None, | last_failed=None, | ||||
last_notfound=DATE3, | last_notfound=DATE3, | ||||
last_snapshot=None, | last_snapshot=None, | ||||
next_position_offset=7, | next_position_offset=7, | ||||
successive_visits=3, | |||||
) | ) | ||||
], | ], | ||||
) | ) | ||||
def test_journal_client_origin_visit_status_from_journal_last_failed(swh_scheduler): | def test_journal_client_origin_visit_status_from_journal_last_failed(swh_scheduler): | ||||
visit_statuses = [ | visit_statuses = [ | ||||
{ | { | ||||
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | assert_visit_stats_ok( | ||||
url="bar", | url="bar", | ||||
visit_type="git", | visit_type="git", | ||||
last_eventful=None, | last_eventful=None, | ||||
last_uneventful=None, | last_uneventful=None, | ||||
last_failed=DATE3, | last_failed=DATE3, | ||||
last_notfound=None, | last_notfound=None, | ||||
last_snapshot=None, | last_snapshot=None, | ||||
next_position_offset=7, | next_position_offset=7, | ||||
successive_visits=3, | |||||
) | ) | ||||
], | ], | ||||
) | ) | ||||
def test_journal_client_origin_visit_status_from_journal_last_failed2(swh_scheduler): | def test_journal_client_origin_visit_status_from_journal_last_failed2(swh_scheduler): | ||||
visit_statuses = [ | visit_statuses = [ | ||||
{ | { | ||||
Show All 26 Lines | assert_visit_stats_ok( | ||||
url="bar", | url="bar", | ||||
visit_type="git", | visit_type="git", | ||||
last_eventful=None, | last_eventful=None, | ||||
last_uneventful=None, | last_uneventful=None, | ||||
last_failed=DATE2, | last_failed=DATE2, | ||||
last_notfound=None, | last_notfound=None, | ||||
last_snapshot=None, | last_snapshot=None, | ||||
next_position_offset=6, | next_position_offset=6, | ||||
successive_visits=2, | |||||
) | ) | ||||
], | ], | ||||
) | ) | ||||
def test_journal_client_origin_visit_status_from_journal_last_eventful(swh_scheduler): | def test_journal_client_origin_visit_status_from_journal_last_eventful(swh_scheduler): | ||||
visit_statuses = [ | visit_statuses = [ | ||||
{ | { | ||||
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | assert_visit_stats_ok( | ||||
url="foo", | url="foo", | ||||
visit_type="git", | visit_type="git", | ||||
last_eventful=DATE3, | last_eventful=DATE3, | ||||
last_uneventful=None, | last_uneventful=None, | ||||
last_failed=None, | last_failed=None, | ||||
last_notfound=None, | last_notfound=None, | ||||
last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), | last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), | ||||
next_position_offset=0, | next_position_offset=0, | ||||
successive_visits=3, | |||||
) | ) | ||||
], | ], | ||||
) | ) | ||||
def test_journal_client_origin_visit_status_from_journal_last_uneventful(swh_scheduler): | def test_journal_client_origin_visit_status_from_journal_last_uneventful(swh_scheduler): | ||||
visit_status = { | visit_status = { | ||||
"origin": "foo", | "origin": "foo", | ||||
Show All 12 Lines | swh_scheduler.origin_visit_stats_upsert( | ||||
visit_type=visit_status["type"], | visit_type=visit_status["type"], | ||||
last_eventful=DATE1, | last_eventful=DATE1, | ||||
last_uneventful=DATE3, | last_uneventful=DATE3, | ||||
last_failed=DATE2, | last_failed=DATE2, | ||||
last_notfound=DATE1, | last_notfound=DATE1, | ||||
last_snapshot=visit_status["snapshot"], | last_snapshot=visit_status["snapshot"], | ||||
next_visit_queue_position=None, | next_visit_queue_position=None, | ||||
next_position_offset=4, | next_position_offset=4, | ||||
successive_visits=1, | |||||
) | ) | ||||
] | ] | ||||
) | ) | ||||
process_journal_objects( | process_journal_objects( | ||||
{"origin_visit_status": [visit_status]}, scheduler=swh_scheduler | {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler | ||||
) | ) | ||||
actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( | actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( | ||||
[(visit_status["origin"], visit_status["type"])] | [(visit_status["origin"], visit_status["type"])] | ||||
) | ) | ||||
assert_visit_stats_ok( | assert_visit_stats_ok( | ||||
actual_origin_visit_stats, | actual_origin_visit_stats, | ||||
[ | [ | ||||
OriginVisitStats( | OriginVisitStats( | ||||
url=visit_status["origin"], | url=visit_status["origin"], | ||||
visit_type=visit_status["type"], | visit_type=visit_status["type"], | ||||
last_eventful=DATE1, | last_eventful=DATE1, | ||||
last_uneventful=visit_status["date"], # most recent date but uneventful | last_uneventful=visit_status["date"], # most recent date but uneventful | ||||
last_failed=DATE2, | last_failed=DATE2, | ||||
last_notfound=DATE1, | last_notfound=DATE1, | ||||
last_snapshot=visit_status["snapshot"], | last_snapshot=visit_status["snapshot"], | ||||
next_position_offset=5, # uneventful so visit less often | next_position_offset=5, # uneventful so visit less often | ||||
successive_visits=2, | |||||
) | ) | ||||
], | ], | ||||
) | ) | ||||
VISIT_STATUSES = [ | VISIT_STATUSES = [ | ||||
{**ovs, "date": DATE1 + n * ONE_DAY} | {**ovs, "date": DATE1 + n * ONE_DAY} | ||||
for n, ovs in enumerate( | for n, ovs in enumerate( | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | assert_visit_stats_ok( | ||||
url="foo", | url="foo", | ||||
visit_type="git", | visit_type="git", | ||||
last_eventful=DATE1 + ONE_DAY, | last_eventful=DATE1 + ONE_DAY, | ||||
last_uneventful=DATE1 + 3 * ONE_DAY, | last_uneventful=DATE1 + 3 * ONE_DAY, | ||||
last_failed=None, | last_failed=None, | ||||
last_notfound=None, | last_notfound=None, | ||||
last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), | last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), | ||||
next_position_offset=5, # uneventful, visit origin less often in future | next_position_offset=5, # uneventful, visit origin less often in future | ||||
successive_visits=1, | |||||
) | ) | ||||
], | ], | ||||
) | ) | ||||
VISIT_STATUSES_1 = [ | VISIT_STATUSES_1 = [ | ||||
{**ovs, "date": DATE1 + n * ONE_DAY} | {**ovs, "date": DATE1 + n * ONE_DAY} | ||||
for n, ovs in enumerate( | for n, ovs in enumerate( | ||||
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | def test_journal_client_origin_visit_status_permutation1(visit_statuses, swh_scheduler): | ||||
assert actual_visit_stats.visit_type == "hg" | assert actual_visit_stats.visit_type == "hg" | ||||
assert actual_visit_stats.last_eventful == DATE1 + 2 * ONE_DAY | assert actual_visit_stats.last_eventful == DATE1 + 2 * ONE_DAY | ||||
assert actual_visit_stats.last_uneventful == DATE1 + 3 * ONE_DAY | assert actual_visit_stats.last_uneventful == DATE1 + 3 * ONE_DAY | ||||
assert actual_visit_stats.last_failed is None | assert actual_visit_stats.last_failed is None | ||||
assert actual_visit_stats.last_notfound is None | assert actual_visit_stats.last_notfound is None | ||||
assert actual_visit_stats.last_snapshot == hash_to_bytes( | assert actual_visit_stats.last_snapshot == hash_to_bytes( | ||||
"aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd" | "aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd" | ||||
) | ) | ||||
assert actual_visit_stats.successive_visits == 1 | |||||
VISIT_STATUSES_2 = [ | VISIT_STATUSES_2 = [ | ||||
{**ovs, "date": DATE1 + n * ONE_DAY} | {**ovs, "date": DATE1 + n * ONE_DAY} | ||||
for n, ovs in enumerate( | for n, ovs in enumerate( | ||||
[ | [ | ||||
{ | { | ||||
"origin": "cavabarder", | "origin": "cavabarder", | ||||
▲ Show 20 Lines • Show All 127 Lines • ▼ Show 20 Lines | assert_visit_stats_ok( | ||||
OriginVisitStats( | OriginVisitStats( | ||||
url="foo", | url="foo", | ||||
visit_type="git", | visit_type="git", | ||||
last_eventful=DATE1, | last_eventful=DATE1, | ||||
last_uneventful=None, | last_uneventful=None, | ||||
last_failed=None, | last_failed=None, | ||||
last_notfound=None, | last_notfound=None, | ||||
last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), | last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), | ||||
next_position_offset=4, | |||||
successive_visits=1, | |||||
) | ) | ||||
], | ], | ||||
) | ) | ||||
def test_journal_client_origin_visit_status_several_upsert(swh_scheduler): | def test_journal_client_origin_visit_status_several_upsert(swh_scheduler): | ||||
"""An old message updates old information | """An old message updates old information | ||||
Show All 32 Lines | assert_visit_stats_ok( | ||||
url="foo", | url="foo", | ||||
visit_type="git", | visit_type="git", | ||||
last_eventful=DATE1, | last_eventful=DATE1, | ||||
last_uneventful=DATE2, | last_uneventful=DATE2, | ||||
last_failed=None, | last_failed=None, | ||||
last_notfound=None, | last_notfound=None, | ||||
last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), | last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), | ||||
next_position_offset=5, | next_position_offset=5, | ||||
successive_visits=1, | |||||
) | ) | ||||
], | ], | ||||
) | ) | ||||
VISIT_STATUSES_SAME_SNAPSHOT = [ | VISIT_STATUSES_SAME_SNAPSHOT = [ | ||||
{**ovs, "date": DATE1 + n * ONE_YEAR} | {**ovs, "date": DATE1 + n * ONE_YEAR} | ||||
for n, ovs in enumerate( | for n, ovs in enumerate( | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | assert_visit_stats_ok( | ||||
last_eventful=DATE1, | last_eventful=DATE1, | ||||
last_uneventful=DATE1 + 2 * ONE_YEAR, | last_uneventful=DATE1 + 2 * ONE_YEAR, | ||||
last_failed=None, | last_failed=None, | ||||
last_notfound=None, | last_notfound=None, | ||||
last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), | last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), | ||||
next_position_offset=6, # 2 uneventful visits, whatever the permutation | next_position_offset=6, # 2 uneventful visits, whatever the permutation | ||||
) | ) | ||||
], | ], | ||||
compare_successive_visits=False, # depending on permutations, result change | |||||
) | ) | ||||
@pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||
"position_offset, interval", | "position_offset, interval", | ||||
[ | [ | ||||
(0, 1), | (0, 1), | ||||
(1, 1), | (1, 1), | ||||
▲ Show 20 Lines • Show All 84 Lines • ▼ Show 20 Lines | actual_position = next_visit_queue_position( | ||||
}, | }, | ||||
) | ) | ||||
assert actual_position == date_now + timedelta( | assert actual_position == date_now + timedelta( | ||||
days=from_position_offset_to_days(next_position_offset) * (1 + fudge_factor) | days=from_position_offset_to_days(next_position_offset) * (1 + fudge_factor) | ||||
) | ) | ||||
assert mock_random.called | assert mock_random.called | ||||
@pytest.mark.parametrize(
    "last_event", INTERESTING_EVENTS,
)
def test_compute_last_event(last_event):
    """Compute last event of visit stats should always return most recent event if any

    The ``last_<last_event>`` entry holds the most recent date. Each remaining
    ``last_<event>`` entry is either unset (None) or strictly older. Every
    combination of unset/older entries is checked, so the test is exhaustive and
    gives the same result on every run.

    """
    maxdate = utcnow()
    event_key = f"last_{last_event}"
    # Follow the declaration order of INTERESTING_EVENTS so that the date
    # assigned to each event does not depend on set iteration order
    other_events = [event for event in INTERESTING_EVENTS if event != last_event]

    # For the remaining events, either have some empty dates or have some dates in
    # the past: bit i of empty_mask tells whether the i-th remaining event is empty
    for empty_mask in range(2 ** len(other_events)):
        visit_stats_d = {
            event_key: maxdate,
        }
        for i, event in enumerate(other_events):
            empty_date = bool((empty_mask >> i) & 1)
            visit_stats_d[f"last_{event}"] = (
                None if empty_date else (maxdate - timedelta(days=1 + i))
            )

        # Ensure our visit_stats_d is configured properly
        assert max_date(*visit_stats_d.values()) == maxdate

        actual_last_event = compute_last_event(visit_stats_d)
        assert actual_last_event == event_key
def test_compute_last_event_none():
    """compute_last_event returns None when no ``last_<event>`` date is set

    """
    # dict.fromkeys maps every key to None
    dateless_stats = dict.fromkeys(f"last_{event}" for event in INTERESTING_EVENTS)

    last_event = compute_last_event(dateless_stats)
    assert last_event is None