diff --git a/sql/updates/30-bis.sql b/sql/updates/30-bis.sql new file mode 100644 --- /dev/null +++ b/sql/updates/30-bis.sql @@ -0,0 +1,8 @@ +-- SWH DB schema upgrade +-- from_version: 30 +-- to_version: 30 +-- description: Bound existing next position offset to a max of 10 + +update origin_visit_stats + set next_position_offset = 10 + where next_position_offset > 10; diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py --- a/swh/scheduler/journal_client.py +++ b/swh/scheduler/journal_client.py @@ -18,6 +18,9 @@ DISABLE_ORIGIN_THRESHOLD = 3 """Threshold to disable failing origins""" +MAX_NEXT_POSITION_OFFSET = 10 +"""Max next position offset to avoid date computation overflow""" + def max_date(*dates: Optional[datetime]) -> datetime: """Return the max date of given (possibly None) dates @@ -32,17 +35,19 @@ def from_position_offset_to_days(position_offset: int) -> int: - """Compute position offset to interval in days. + """Compute position offset to interval in days. Note that this does not bound the + position_offset input so client code should limit the date computation to avoid + overflow errors. - - index 0 and 1: interval 1 day - - index 2, 3 and 4: interval 2 days - - index 5 and up: interval `4^(n-4)` days for n in (4, 16, 64, 256, 1024, ...) + - index in [0:1]: interval 1 day + - index in [2:4]: interval 2 days + - index in [5:+inf]: interval `4^(index-4)` days Args: position_offset: The actual position offset for a given visit stats Returns: - The offset as an interval in number of days + The offset as an interval in number of days. """ assert position_offset >= 0 @@ -236,9 +241,12 @@ # Update the next position offset according to the existing value and the # eventfulness of the visit. increment = -2 if eventful else 1 - visit_stats_d["next_position_offset"] = max( - 0, visit_stats_d["next_position_offset"] + increment + # Limit the next_position_offset for acceptable date computations + current_offset = min( + visit_stats_d["next_position_offset"] + increment, + MAX_NEXT_POSITION_OFFSET, ) + visit_stats_d["next_position_offset"] = max(0, current_offset) # increment the counter when last_visit_status is the same same_visit_status = last_visit_status == visit_stats_d["last_visit_status"] else: diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py --- a/swh/scheduler/tests/test_journal_client.py +++ b/swh/scheduler/tests/test_journal_client.py @@ -838,6 +838,7 @@ (8, 256), (9, 1024), (10, 4096), + (11, 16384), ], ) def test_journal_client_from_position_offset_to_days(position_offset, interval):