diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py --- a/swh/scheduler/journal_client.py +++ b/swh/scheduler/journal_client.py @@ -62,6 +62,7 @@ "last_failed": None, "last_notfound": None, "last_snapshot": None, + "successive_visits": 0, } pk = origin, visit_type if pk not in origin_visit_stats: @@ -72,14 +73,33 @@ visit_stats_d = origin_visit_stats[pk] + # get the name of the most recent event we got + date_keys = [ + f"last_{k}" for k in ("eventful", "uneventful", "failed", "notfound") + ] + event_dates = dict( + (v, k) for k, v in visit_stats_d.items() if k in date_keys and v is not None + ) + if not event_dates: + maxdate = None + last_event = None + else: + maxdate = max_date(*event_dates.keys()) + last_event = event_dates[maxdate] + increment_successive_visits = False if msg_dict["status"] == "not_found": visit_stats_d["last_notfound"] = max_date( msg_dict["date"], visit_stats_d.get("last_notfound") ) + if last_event == "last_notfound": + increment_successive_visits = True + elif msg_dict["snapshot"] is None: visit_stats_d["last_failed"] = max_date( msg_dict["date"], visit_stats_d.get("last_failed") ) + if last_event == "last_failed": + increment_successive_visits = True else: # visit with snapshot, something happened if visit_stats_d["last_snapshot"] is None: # first time visit with snapshot, we keep relevant information @@ -104,6 +124,9 @@ # new eventful visit (new snapshot) visit_stats_d["last_eventful"] = current_status_date visit_stats_d["last_snapshot"] = msg_dict["snapshot"] + if last_event == "last_eventful": + increment_successive_visits = True + else: # same snapshot as before if ( @@ -118,9 +141,18 @@ "last_eventful" ] visit_stats_d["last_eventful"] = current_status_date + # there is no way we can do anything but reset the + # successive_visits here... 
else: # uneventful event visit_stats_d["last_uneventful"] = current_status_date + if last_event == "last_uneventful": + increment_successive_visits = True + + if increment_successive_visits: + visit_stats_d["successive_visits"] += 1 + else: + visit_stats_d["successive_visits"] = 1 scheduler.origin_visit_stats_upsert( OriginVisitStats(**ovs) for ovs in origin_visit_stats.values() diff --git a/swh/scheduler/model.py b/swh/scheduler/model.py --- a/swh/scheduler/model.py +++ b/swh/scheduler/model.py @@ -235,6 +235,7 @@ last_snapshot = attr.ib( type=Optional[bytes], validator=type_validator(), default=None ) + successive_visits = attr.ib(type=int, validator=type_validator(), default=0) @last_eventful.validator def check_last_eventful(self, attribute, value): diff --git a/swh/scheduler/sql/30-schema.sql b/swh/scheduler/sql/30-schema.sql --- a/swh/scheduler/sql/30-schema.sql +++ b/swh/scheduler/sql/30-schema.sql @@ -172,6 +172,7 @@ last_scheduled timestamptz, -- last snapshot resulting from an eventful visit last_snapshot bytea, + successive_visits int default 0, primary key (url, visit_type) ); @@ -184,6 +185,7 @@ comment on column origin_visit_stats.last_notfound is 'Date of the last notfound event'; comment on column origin_visit_stats.last_scheduled is 'Time when this origin was scheduled to be visited last'; comment on column origin_visit_stats.last_snapshot is 'sha1_git of the last visit snapshot'; +comment on column origin_visit_stats.successive_visits is 'number of successive visits with the same status'; create table scheduler_metrics ( diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py --- a/swh/scheduler/tests/test_journal_client.py +++ b/swh/scheduler/tests/test_journal_client.py @@ -120,6 +120,7 @@ last_failed=None, last_notfound=visit_status["date"], last_snapshot=None, + successive_visits=1, ) visit_statuses = [ @@ -155,6 +156,7 @@ last_failed=None, last_notfound=DATE3, last_snapshot=None, + 
successive_visits=1, ) @@ -208,6 +210,7 @@ last_failed=DATE3, last_notfound=None, last_snapshot=None, + successive_visits=3, ) @@ -261,6 +264,7 @@ last_failed=None, last_notfound=None, last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), + successive_visits=3, ) @@ -285,6 +289,7 @@ last_failed=DATE2, last_notfound=DATE1, last_snapshot=visit_status["snapshot"], + successive_visits=1, ) ] ) @@ -305,6 +310,7 @@ last_failed=DATE2, last_notfound=DATE1, last_snapshot=visit_status["snapshot"], + successive_visits=1, ) @@ -364,6 +370,7 @@ last_failed=None, last_notfound=None, last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), + successive_visits=1, ) assert swh_scheduler.origin_visit_stats_get("foo", "git") == expected_visit_stats @@ -425,6 +432,7 @@ last_failed=None, last_notfound=None, last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), + successive_visits=1, ) assert (