Page MenuHomeSoftware Heritage

D5978.id21889.diff
No OneTemporary

D5978.id21889.diff

diff --git a/sql/updates/29.sql b/sql/updates/29.sql
--- a/sql/updates/29.sql
+++ b/sql/updates/29.sql
@@ -12,8 +12,12 @@
alter table origin_visit_stats
add column next_position_offset int not null default 4;
+alter table origin_visit_stats
+add column successive_visits int not null default 0;
+
comment on column origin_visit_stats.next_visit_queue_position is 'Time at which some new objects are expected to be found';
comment on column origin_visit_stats.next_position_offset is 'Duration that we expect to wait between visits of this origin';
+comment on column origin_visit_stats.successive_visits is 'number of successive visits with the same status';
create table visit_scheduler_queue_position (
visit_type text not null,
diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py
--- a/swh/scheduler/journal_client.py
+++ b/swh/scheduler/journal_client.py
@@ -220,6 +220,8 @@
# eventfulness
last_visit_status, eventful = get_last_status(msg_dict, visit_stats_d)
+ increment_successive_visits = False
+
# Update the position offset according to the visit status,
# if we had already visited this origin before.
@@ -230,6 +232,10 @@
visit_stats_d["next_position_offset"] = max(
0, visit_stats_d["next_position_offset"] + increment
)
+ # increment the counter when last_visit_status equals the status already recorded in visit_stats_d
+ increment_successive_visits = (
+ last_visit_status == visit_stats_d["last_visit_status"]
+ )
# Record current visit date as highest known date (we've rejected out of order
# messages earlier).
@@ -250,6 +256,11 @@
queue_position_per_visit_type, visit_stats_d
)
+ if increment_successive_visits:
+ visit_stats_d["successive_visits"] += 1
+ else:
+ visit_stats_d["successive_visits"] = 0
+
scheduler.origin_visit_stats_upsert(
OriginVisitStats(**ovs) for ovs in origin_visit_stats.values()
)
diff --git a/swh/scheduler/model.py b/swh/scheduler/model.py
--- a/swh/scheduler/model.py
+++ b/swh/scheduler/model.py
@@ -226,6 +226,8 @@
)
next_position_offset = attr.ib(type=int, validator=type_validator(), default=4)
+ successive_visits = attr.ib(type=int, validator=type_validator(), default=0)
+
@last_successful.validator
def check_last_successful(self, attribute, value):
check_timestamptz(value)
diff --git a/swh/scheduler/sql/30-schema.sql b/swh/scheduler/sql/30-schema.sql
--- a/swh/scheduler/sql/30-schema.sql
+++ b/swh/scheduler/sql/30-schema.sql
@@ -178,6 +178,7 @@
next_visit_queue_position timestamptz,
-- duration that we expect to wait between visits of this origin
next_position_offset int not null default 4,
+ successive_visits int not null default 0,
primary key (url, visit_type)
);
@@ -193,6 +194,7 @@
comment on column origin_visit_stats.next_visit_queue_position is 'Time at which some new objects are expected to be found';
comment on column origin_visit_stats.next_position_offset is 'Duration that we expect to wait between visits of this origin';
+comment on column origin_visit_stats.successive_visits is 'number of successive visits with the same status';
create table visit_scheduler_queue_position (
visit_type text not null,
diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py
--- a/swh/scheduler/tests/test_journal_client.py
+++ b/swh/scheduler/tests/test_journal_client.py
@@ -171,6 +171,7 @@
last_visit=visit_status["date"],
last_visit_status=LastVisitStatus.not_found,
next_position_offset=4,
+ successive_visits=0,
),
)
@@ -206,6 +207,7 @@
last_visit=DATE3,
last_visit_status=LastVisitStatus.not_found,
next_position_offset=6,
+ successive_visits=2,
),
)
@@ -259,6 +261,7 @@
last_visit=DATE3,
last_visit_status=LastVisitStatus.failed,
next_position_offset=6,
+ successive_visits=2,
),
)
@@ -296,6 +299,7 @@
last_visit=DATE2,
last_visit_status=LastVisitStatus.failed,
next_position_offset=5,
+ successive_visits=1,
),
)
@@ -351,6 +355,7 @@
last_visit_status=LastVisitStatus.successful,
last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
next_position_offset=0,
+ successive_visits=2,
),
)
@@ -377,6 +382,7 @@
last_snapshot=visit_status["snapshot"],
next_visit_queue_position=None,
next_position_offset=4,
+ successive_visits=1,
)
]
)
@@ -388,6 +394,7 @@
actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
[(visit_status["origin"], visit_status["type"])]
)
+
assert_visit_stats_ok(
actual_origin_visit_stats[0],
OriginVisitStats(
@@ -399,6 +406,7 @@
last_snapshot=visit_status["snapshot"],
next_visit_queue_position=None,
next_position_offset=5,
+ successive_visits=0,
),
)
@@ -463,13 +471,19 @@
last_visit_status=LastVisitStatus.successful,
last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
),
- ignore_fields=["next_visit_queue_position", "next_position_offset"],
+ ignore_fields=[
+ "next_visit_queue_position",
+ "next_position_offset",
+ "successive_visits",
+ ],
)
# We ignore out of order messages, so the next_position_offset isn't exact
# depending on the permutation. What matters is consistency of the final
# dates (last_visit and last_successful).
assert 4 <= visit_stats.next_position_offset <= 5
+ # successive_visits likewise depends on the permutation, so only its bounds are checked
+ assert 0 <= visit_stats.successive_visits <= 1
VISIT_STATUSES_1 = [
@@ -533,13 +547,19 @@
last_visit_status=LastVisitStatus.successful,
last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
),
- ignore_fields=["next_visit_queue_position", "next_position_offset"],
+ ignore_fields=[
+ "next_visit_queue_position",
+ "next_position_offset",
+ "successive_visits",
+ ],
)
# We ignore out of order messages, so the next_position_offset isn't exact
# depending on the permutation. What matters is consistency of the final
# dates (last_visit and last_successful).
assert 2 <= visit_stats.next_position_offset <= 5
+ # successive_visits likewise depends on the permutation, so only its bounds are checked
+ assert 0 <= visit_stats.successive_visits <= 3
VISIT_STATUSES_2 = [
@@ -680,6 +700,7 @@
last_visit=DATE1,
last_visit_status=LastVisitStatus.successful,
last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
+ successive_visits=0,
),
)
@@ -725,6 +746,7 @@
last_visit_status=LastVisitStatus.successful,
last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
next_position_offset=4,
+ successive_visits=0,
),
)
@@ -787,13 +809,19 @@
last_visit_status=LastVisitStatus.successful,
last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
),
- ignore_fields=["next_visit_queue_position", "next_position_offset"],
+ ignore_fields=[
+ "next_visit_queue_position",
+ "next_position_offset",
+ "successive_visits",
+ ],
)
# We ignore out of order messages, so the next_position_offset isn't exact
# depending on the permutation. What matters is consistency of the final
# dates (last_visit and last_successful).
assert 4 <= visit_stats.next_position_offset <= 6
+ # successive_visits likewise depends on the permutation, so only its bounds are checked
+ assert 0 <= visit_stats.successive_visits <= 2
@pytest.mark.parametrize(

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 10:00 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3231994

Event Timeline