diff --git a/sql/updates/29.sql b/sql/updates/29.sql --- a/sql/updates/29.sql +++ b/sql/updates/29.sql @@ -12,8 +12,12 @@ alter table origin_visit_stats add column next_position_offset int not null default 4; +alter table origin_visit_stats +add column successive_visits int not null default 0; + comment on column origin_visit_stats.next_visit_queue_position is 'Time at which some new objects are expected to be found'; comment on column origin_visit_stats.next_position_offset is 'Duration that we expect to wait between visits of this origin'; +comment on column origin_visit_stats.successive_visits is 'number of successive visits with the same status'; create table visit_scheduler_queue_position ( visit_type text not null, diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py --- a/swh/scheduler/journal_client.py +++ b/swh/scheduler/journal_client.py @@ -220,6 +220,8 @@ # eventfulness last_visit_status, eventful = get_last_status(msg_dict, visit_stats_d) + increment_successive_visits = False + # Update the position offset according to the visit status, # if we had already visited this origin before. @@ -230,6 +232,10 @@ visit_stats_d["next_position_offset"] = max( 0, visit_stats_d["next_position_offset"] + increment ) + # increment the counter when last_visit_status is the same + increment_successive_visits = ( + last_visit_status == visit_stats_d["last_visit_status"] + ) # Record current visit date as highest known date (we've rejected out of order # messages earlier). @@ -250,6 +256,11 @@ queue_position_per_visit_type, visit_stats_d ) + if increment_successive_visits: + visit_stats_d["successive_visits"] += 1 + else: + visit_stats_d["successive_visits"] = 1 + scheduler.origin_visit_stats_upsert( OriginVisitStats(**ovs) for ovs in origin_visit_stats.values() ) diff --git a/swh/scheduler/model.py b/swh/scheduler/model.py --- a/swh/scheduler/model.py +++ b/swh/scheduler/model.py @@ -226,6 +226,8 @@ ) next_position_offset = attr.ib(type=int, validator=type_validator(), default=4) + successive_visits = attr.ib(type=int, validator=type_validator(), default=0) + @last_successful.validator def check_last_successful(self, attribute, value): check_timestamptz(value) diff --git a/swh/scheduler/sql/30-schema.sql b/swh/scheduler/sql/30-schema.sql --- a/swh/scheduler/sql/30-schema.sql +++ b/swh/scheduler/sql/30-schema.sql @@ -178,6 +178,7 @@ next_visit_queue_position timestamptz, -- duration that we expect to wait between visits of this origin next_position_offset int not null default 4, + successive_visits int not null default 0, primary key (url, visit_type) ); @@ -193,6 +194,7 @@ comment on column origin_visit_stats.next_visit_queue_position is 'Time at which some new objects are expected to be found'; comment on column origin_visit_stats.next_position_offset is 'Duration that we expect to wait between visits of this origin'; +comment on column origin_visit_stats.successive_visits is 'number of successive visits with the same status'; create table visit_scheduler_queue_position ( visit_type text not null, diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py --- a/swh/scheduler/tests/test_journal_client.py +++ b/swh/scheduler/tests/test_journal_client.py @@ -171,6 +171,7 @@ last_visit=visit_status["date"], last_visit_status=LastVisitStatus.not_found, next_position_offset=4, + successive_visits=1, ), ) @@ -206,6 +207,7 @@ last_visit=DATE3, last_visit_status=LastVisitStatus.not_found, next_position_offset=6, + successive_visits=3, ), ) @@ -259,6 +261,7 @@ last_visit=DATE3, last_visit_status=LastVisitStatus.failed, next_position_offset=6, + successive_visits=3, ), ) @@ -296,6 +299,7 @@ last_visit=DATE2, last_visit_status=LastVisitStatus.failed, next_position_offset=5, + successive_visits=2, ), ) @@ -351,6 +355,7 @@ last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), next_position_offset=0, + successive_visits=3, ), ) @@ -377,6 +382,7 @@ last_snapshot=visit_status["snapshot"], next_visit_queue_position=None, next_position_offset=4, + successive_visits=1, ) ] ) @@ -388,6 +394,7 @@ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( [(visit_status["origin"], visit_status["type"])] ) + assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( @@ -399,6 +406,7 @@ last_snapshot=visit_status["snapshot"], next_visit_queue_position=None, next_position_offset=5, + successive_visits=1, ), ) @@ -463,13 +471,19 @@ last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), ), - ignore_fields=["next_visit_queue_position", "next_position_offset"], + ignore_fields=[ + "next_visit_queue_position", + "next_position_offset", + "successive_visits", + ], ) # We ignore out of order messages, so the next_position_offset isn't exact # depending on the permutation. What matters is consistency of the final # dates (last_visit and last_successful). assert 4 <= visit_stats.next_position_offset <= 5 + # same goes for successive_visits + assert 1 <= visit_stats.successive_visits <= 2 VISIT_STATUSES_1 = [ @@ -533,13 +547,19 @@ last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), ), - ignore_fields=["next_visit_queue_position", "next_position_offset"], + ignore_fields=[ + "next_visit_queue_position", + "next_position_offset", + "successive_visits", + ], ) # We ignore out of order messages, so the next_position_offset isn't exact # depending on the permutation. What matters is consistency of the final # dates (last_visit and last_successful). assert 2 <= visit_stats.next_position_offset <= 5 + # same goes for successive_visits + assert 0 <= visit_stats.successive_visits <= 4 VISIT_STATUSES_2 = [ @@ -680,6 +700,7 @@ last_visit=DATE1, last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), + successive_visits=1, ), ) @@ -725,6 +746,7 @@ last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), next_position_offset=4, + successive_visits=1, ), ) @@ -787,7 +809,11 @@ last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), ), - ignore_fields=["next_visit_queue_position", "next_position_offset"], + ignore_fields=[ + "next_visit_queue_position", + "next_position_offset", + "successive_visits", + ], ) # We ignore out of order messages, so the next_position_offset isn't exact