Page MenuHomeSoftware Heritage

D6136.id22212.diff
No OneTemporary

D6136.id22212.diff

diff --git a/sql/updates/30-bis.sql b/sql/updates/30-bis.sql
new file mode 100644
--- /dev/null
+++ b/sql/updates/30-bis.sql
@@ -0,0 +1,8 @@
+-- SWH DB schema upgrade
+-- from_version: 30
+-- to_version: 30
+-- description: Bound existing next_position_offset values to a maximum of 10
+
+update origin_visit_stats
+ set next_position_offset = 10
+ where next_position_offset > 10;
diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py
--- a/swh/scheduler/journal_client.py
+++ b/swh/scheduler/journal_client.py
@@ -18,6 +18,9 @@
DISABLE_ORIGIN_THRESHOLD = 3
"""Threshold to disable failing origins"""
+MAX_NEXT_POSITION_OFFSET = 10
+"""Max next position offset to avoid date computation overflow"""
+
def max_date(*dates: Optional[datetime]) -> datetime:
"""Return the max date of given (possibly None) dates
@@ -32,17 +35,19 @@
def from_position_offset_to_days(position_offset: int) -> int:
- """Compute position offset to interval in days.
+ """Compute position offset to interval in days. Note that this does not bound the
+ position_offset input so client code should limit the date computation to avoid
+ overflow errors.
- - index 0 and 1: interval 1 day
- - index 2, 3 and 4: interval 2 days
- - index 5 and up: interval `4^(n-4)` days for n in (4, 16, 64, 256, 1024, ...)
+ - index in [0:1]: interval 1 day
+ - index in [2:4]: interval 2 days
+ - index in [5:+inf]: interval `4^(index-4)` days
Args:
position_offset: The actual position offset for a given visit stats
Returns:
- The offset as an interval in number of days
+ The offset as an interval in number of days.
"""
assert position_offset >= 0
@@ -236,9 +241,12 @@
# Update the next position offset according to the existing value and the
# eventfulness of the visit.
increment = -2 if eventful else 1
- visit_stats_d["next_position_offset"] = max(
- 0, visit_stats_d["next_position_offset"] + increment
+ # Cap the next_position_offset to avoid date computation overflow
+ current_offset = min(
+ visit_stats_d["next_position_offset"] + increment,
+ MAX_NEXT_POSITION_OFFSET,
)
+ visit_stats_d["next_position_offset"] = max(0, current_offset)
# increment the counter when last_visit_status is the same
same_visit_status = last_visit_status == visit_stats_d["last_visit_status"]
else:
diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py
--- a/swh/scheduler/tests/test_journal_client.py
+++ b/swh/scheduler/tests/test_journal_client.py
@@ -838,6 +838,7 @@
(8, 256),
(9, 1024),
(10, 4096),
+ (11, 16384),
],
)
def test_journal_client_from_position_offset_to_days(position_offset, interval):

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 7:55 AM (10 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230572

Event Timeline