diff --git a/sql/updates/24.sql b/sql/updates/24.sql
new file mode 100644
--- /dev/null
+++ b/sql/updates/24.sql
@@ -0,0 +1,9 @@
+insert into dbversion (version, release, description)
+      values (24, now(), 'Work In Progress');
+
+alter table origin_visit_stats add column last_scheduled timestamptz;
+comment on column origin_visit_stats.last_scheduled is 'Time when this origin was scheduled to be visited last';
+
+-- no need for a proper migration script for this last_scheduled column: it
+-- has not been published or deployed yet; just drop it
+alter table listed_origins drop column last_scheduled;
diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py
--- a/swh/scheduler/backend.py
+++ b/swh/scheduler/backend.py
@@ -302,19 +302,36 @@

         if policy == "oldest_scheduled_first":
             query = f"""
-            with filtered_origins as (
-              select lister_id, url, visit_type
-              from listed_origins
-              where visit_type = %s
-              order by last_scheduled nulls first
-              limit %s
-              for update skip locked
+               WITH selected_origins AS (
+                 SELECT
+                   {origin_select_cols}
+                 FROM
+                   listed_origins
+                 LEFT JOIN
+                   origin_visit_stats USING (url, visit_type)
+                 WHERE
+                   visit_type = %s
+                 ORDER BY
+                   origin_visit_stats.last_scheduled NULLS FIRST
+                 LIMIT %s
+               ),
+               update_stats AS (
+                 INSERT INTO
+                   origin_visit_stats (
+                     url, visit_type, last_scheduled
                   )
-            update listed_origins
-            set last_scheduled = now()
-            where (lister_id, url, visit_type) in (select * from filtered_origins)
-            returning {origin_select_cols}
-            """
+                 SELECT
+                   url, visit_type, now()
+                 FROM
+                   selected_origins
+                 ON CONFLICT (url, visit_type) DO UPDATE
+                   SET last_scheduled = GREATEST(origin_visit_stats.last_scheduled, EXCLUDED.last_scheduled)
+               )
+               SELECT
+                 *
+               FROM
+                 selected_origins
+            """  # noqa

             cur.execute(query, (visit_type, count))
             return [ListedOrigin(**d) for d in cur]
@@ -799,7 +816,7 @@
                 last_snapshot = (select case
                                  when ovi.last_eventful < excluded.last_eventful
                                  then excluded.last_snapshot
-                                 else ovi.last_snapshot
+                                 else coalesce(ovi.last_snapshot, excluded.last_snapshot)
                                  end
                                 )
             """  # noqa
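Note: the rewritten grab_next_visits query selects the least-recently-scheduled origins by LEFT JOINing listed_origins with origin_visit_stats, then records the scheduling time through a data-modifying CTE that upserts origin_visit_stats.last_scheduled, so origins with no stats row yet get one created on first scheduling. Below is a minimal standalone sketch of that upsert pattern, assuming a reachable PostgreSQL database with this patch's schema loaded; the connection string is a placeholder and the literal column list stands in for the {origin_select_cols} used by the backend.

# Illustrative sketch, not part of the patch: the data-modifying CTE pattern
# used by grab_next_visits. Assumes a reachable PostgreSQL database with the
# listed_origins / origin_visit_stats tables from this patch; the DSN is a
# placeholder.
import psycopg2

dsn = "dbname=swh-scheduler-dev"  # placeholder connection string

query = """
WITH selected AS (
    SELECT lo.url, lo.visit_type
    FROM listed_origins lo
    LEFT JOIN origin_visit_stats ovs USING (url, visit_type)
    WHERE lo.visit_type = %s
    ORDER BY ovs.last_scheduled NULLS FIRST
    LIMIT %s
),
update_stats AS (
    -- create or refresh the stats row for every selected origin
    INSERT INTO origin_visit_stats (url, visit_type, last_scheduled)
    SELECT url, visit_type, now() FROM selected
    ON CONFLICT (url, visit_type) DO UPDATE
        SET last_scheduled = GREATEST(
            origin_visit_stats.last_scheduled, EXCLUDED.last_scheduled)
)
SELECT * FROM selected
"""

with psycopg2.connect(dsn) as db, db.cursor() as cur:
    cur.execute(query, ("git", 10))
    for url, visit_type in cur:
        print(url, visit_type)

GREATEST keeps the newest timestamp if a concurrent upsert already set last_scheduled, which is why the ON CONFLICT branch does not simply overwrite the column.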
diff --git a/swh/scheduler/model.py b/swh/scheduler/model.py
--- a/swh/scheduler/model.py
+++ b/swh/scheduler/model.py
@@ -152,10 +152,6 @@
         type=Optional[datetime.datetime], validator=[type_validator()], default=None,
     )

-    last_scheduled = attr.ib(
-        type=Optional[datetime.datetime], validator=[type_validator()], default=None,
-    )
-
     enabled = attr.ib(type=bool, validator=[type_validator()], default=True)

     first_seen = attr.ib(
@@ -233,6 +229,9 @@
     last_notfound = attr.ib(
         type=Optional[datetime.datetime], validator=type_validator()
     )
+    last_scheduled = attr.ib(
+        type=Optional[datetime.datetime], validator=[type_validator()], default=None,
+    )
     last_snapshot = attr.ib(
         type=Optional[bytes], validator=type_validator(), default=None
     )
diff --git a/swh/scheduler/sql/30-schema.sql b/swh/scheduler/sql/30-schema.sql
--- a/swh/scheduler/sql/30-schema.sql
+++ b/swh/scheduler/sql/30-schema.sql
@@ -11,7 +11,7 @@
 comment on column dbversion.description is 'Version description';

 insert into dbversion (version, release, description)
-      values (23, now(), 'Work In Progress');
+      values (24, now(), 'Work In Progress');

 create table task_type (
   type text primary key,
@@ -145,9 +145,6 @@
   -- potentially provided by the lister
   last_update timestamptz,

-  -- visit scheduling information
-  last_scheduled timestamptz,
-
   primary key (lister_id, url, visit_type)
 );

@@ -163,7 +160,6 @@
 comment on column listed_origins.last_update is 'Time of the last update to the origin recorded by the remote';
-comment on column listed_origins.last_scheduled is 'Time when this origin was scheduled to be visited last';


 create table origin_visit_stats (
   url text not null,
@@ -172,6 +168,9 @@
   last_uneventful timestamptz,
   last_failed timestamptz,
   last_notfound timestamptz,
+  -- visit scheduling information
+  last_scheduled timestamptz,
+
   -- last snapshot resulting from an eventful visit
   last_snapshot bytea,

   primary key (url, visit_type)
 );
@@ -183,4 +182,5 @@
 comment on column origin_visit_stats.last_uneventful is 'Date of the last uneventful event';
 comment on column origin_visit_stats.last_failed is 'Date of the last failed event';
 comment on column origin_visit_stats.last_notfound is 'Date of the last notfound event';
+comment on column origin_visit_stats.last_scheduled is 'Time when this origin was scheduled to be visited last';
 comment on column origin_visit_stats.last_snapshot is 'sha1_git of the last visit snapshot';
diff --git a/swh/scheduler/sql/60-indexes.sql b/swh/scheduler/sql/60-indexes.sql
--- a/swh/scheduler/sql/60-indexes.sql
--- a/swh/scheduler/sql/60-indexes.sql
+++ b/swh/scheduler/sql/60-indexes.sql
@@ -16,5 +16,8 @@
 create unique index on listers (name, instance_name);

 -- listed origins
-create index on listed_origins (url);
-create index on listed_origins (visit_type, last_scheduled);
+create index on listed_origins (url, visit_type);
+
+-- visit stats
+create index on origin_visit_stats (url, visit_type);
+-- XXX we should probably add indexes on most (visit_type, last_xxx) pairs
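Note: with the model and schema changes above, scheduling information now lives on OriginVisitStats rather than ListedOrigin. The sketch below shows the two objects after the move; it assumes the attrs signatures from this patch, and all values are made up.

# Illustrative only: the scheduling timestamp now belongs to OriginVisitStats,
# while ListedOrigin merely describes what a lister reported.
import uuid

from swh.scheduler.model import ListedOrigin, OriginVisitStats
from swh.scheduler.utils import utcnow

origin = ListedOrigin(
    lister_id=uuid.uuid4(),  # placeholder lister id
    url="https://example.org/repo.git",
    visit_type="git",
)

stats = OriginVisitStats(
    url=origin.url,
    visit_type=origin.visit_type,
    last_eventful=None,
    last_uneventful=None,
    last_failed=None,
    last_notfound=None,
    last_scheduled=utcnow(),  # set by grab_next_visits in practice
)
assert stats.last_scheduled is not None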
diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py
--- a/swh/scheduler/tests/test_journal_client.py
+++ b/swh/scheduler/tests/test_journal_client.py
@@ -11,7 +11,7 @@
 from swh.model.hashutil import hash_to_bytes

 from swh.scheduler.journal_client import max_date, process_journal_objects
-from swh.scheduler.model import OriginVisitStats
+from swh.scheduler.model import ListedOrigin, OriginVisitStats
 from swh.scheduler.utils import utcnow

@@ -477,3 +477,111 @@
     assert (
         swh_scheduler.origin_visit_stats_get("cavabarder", "hg") == expected_visit_stats
     )
+
+
+VISIT_STATUSES_2 = [
+    {**ovs, "date": DATE1 + n * ONE_DAY}
+    for n, ovs in enumerate(
+        [
+            {
+                "origin": "cavabarder",
+                "type": "hg",
+                "visit": 1,
+                "status": "full",
+                "snapshot": hash_to_bytes("0000000000000000000000000000000000000000"),
+            },
+            {
+                "origin": "cavabarder",
+                "type": "hg",
+                "visit": 2,
+                "status": "full",
+                "snapshot": hash_to_bytes("1111111111111111111111111111111111111111"),
+            },
+            {
+                "origin": "iciaussi",
+                "type": "hg",
+                "visit": 1,
+                "status": "full",
+                "snapshot": hash_to_bytes("2222222222222222222222222222222222222222"),
+            },
+            {
+                "origin": "iciaussi",
+                "type": "hg",
+                "visit": 2,
+                "status": "full",
+                "snapshot": hash_to_bytes("3333333333333333333333333333333333333333"),
+            },
+            {
+                "origin": "cavabarder",
+                "type": "git",
+                "visit": 1,
+                "status": "full",
+                "snapshot": hash_to_bytes("4444444444444444444444444444444444444444"),
+            },
+            {
+                "origin": "cavabarder",
+                "type": "git",
+                "visit": 2,
+                "status": "full",
+                "snapshot": hash_to_bytes("5555555555555555555555555555555555555555"),
+            },
+            {
+                "origin": "iciaussi",
+                "type": "git",
+                "visit": 1,
+                "status": "full",
+                "snapshot": hash_to_bytes("6666666666666666666666666666666666666666"),
+            },
+            {
+                "origin": "iciaussi",
+                "type": "git",
+                "visit": 2,
+                "status": "full",
+                "snapshot": hash_to_bytes("7777777777777777777777777777777777777777"),
+            },
+        ]
+    )
+]
+
+
+def test_journal_client_origin_visit_status_after_grab_next_visits(
+    swh_scheduler, stored_lister
+):
+    """Ensure OriginVisitStats entries created in the db as a result of calling
+    grab_next_visits() do not interfere with the OriginVisitStats upsert mechanism.
+
+    """
+
+    listed_origins = [
+        ListedOrigin(lister_id=stored_lister.id, url=url, visit_type=visit_type)
+        for (url, visit_type) in set((v["origin"], v["type"]) for v in VISIT_STATUSES_2)
+    ]
+    swh_scheduler.record_listed_origins(listed_origins)
+    before = utcnow()
+    swh_scheduler.grab_next_visits(
+        visit_type="git", count=10, policy="oldest_scheduled_first"
+    )
+    after = utcnow()
+
+    assert swh_scheduler.origin_visit_stats_get("cavabarder", "hg") is None
+    assert swh_scheduler.origin_visit_stats_get("cavabarder", "git") is not None
+
+    process_journal_objects(
+        {"origin_visit_status": VISIT_STATUSES_2}, scheduler=swh_scheduler
+    )
+
+    for url in ("cavabarder", "iciaussi"):
+        ovs = swh_scheduler.origin_visit_stats_get(url, "git")
+        assert before <= ovs.last_scheduled <= after
+
+        ovs = swh_scheduler.origin_visit_stats_get(url, "hg")
+        assert ovs.last_scheduled is None
+
+    ovs = swh_scheduler.origin_visit_stats_get("cavabarder", "git")
+    assert ovs.last_eventful == DATE1 + 5 * ONE_DAY
+    assert ovs.last_uneventful is None
+    assert ovs.last_failed is None
+    assert ovs.last_notfound is None
+    assert ovs.last_snapshot == hash_to_bytes(
+        "5555555555555555555555555555555555555555"
+    )
diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py
--- a/swh/scheduler/tests/test_scheduler.py
+++ b/swh/scheduler/tests/test_scheduler.py
@@ -744,7 +744,11 @@
         assert len(ret) == NUM_RESULTS
         for origin in ret:
-            assert before <= origin.last_scheduled <= after
+            visit_stats = swh_scheduler.origin_visit_stats_get(
+                origin.url, origin.visit_type
+            )
+            assert visit_stats is not None
+            assert before <= visit_stats.last_scheduled <= after

     @pytest.mark.parametrize("policy", ["oldest_scheduled_first"])
     def test_grab_next_visits_underflow(
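Note: taken together, the new tests exercise the flow sketched below: record listed origins, grab the ones whose scheduling is oldest (or missing), and check that origin_visit_stats.last_scheduled was upserted for exactly those origins. The helper below is a sketch under the same assumptions as the tests (a configured scheduler backend and a previously recorded lister); schedule_oldest_first is a hypothetical name, not an API provided by this patch.

# Sketch of the scheduling flow covered by the new tests; `scheduler` is a
# configured swh.scheduler backend and `lister` a lister previously recorded
# with it (both provided by the caller).
from swh.scheduler.model import ListedOrigin
from swh.scheduler.utils import utcnow


def schedule_oldest_first(scheduler, lister, urls, visit_type="git", count=10):
    # Record the origins produced by a lister run.
    scheduler.record_listed_origins(
        [
            ListedOrigin(lister_id=lister.id, url=url, visit_type=visit_type)
            for url in urls
        ]
    )

    before = utcnow()
    # Grab the origins whose last scheduling is the oldest (or missing); this
    # also upserts origin_visit_stats.last_scheduled for each of them.
    origins = scheduler.grab_next_visits(
        visit_type=visit_type, count=count, policy="oldest_scheduled_first"
    )
    after = utcnow()

    for origin in origins:
        stats = scheduler.origin_visit_stats_get(origin.url, origin.visit_type)
        assert stats is not None
        assert before <= stats.last_scheduled <= after
    return origins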