diff --git a/swh/scrubber/db.py b/swh/scrubber/db.py --- a/swh/scrubber/db.py +++ b/swh/scrubber/db.py @@ -131,8 +131,8 @@ Returns the last date the given range was checked in the given datastore, or :const:`None` if it was never checked. - Currently, this checks range boundaries exactly, with no regard for ranges - that contain or are contained by it. + Currently, this matches range boundaries exactly, with no regard for + ranges that contain or are contained by it. """ datastore_id = self.datastore_get_or_add(datastore) with self.transaction() as cur: diff --git a/swh/scrubber/storage_checker.py b/swh/scrubber/storage_checker.py --- a/swh/scrubber/storage_checker.py +++ b/swh/scrubber/storage_checker.py @@ -145,6 +145,26 @@ ) start_time = datetime.datetime.now(tz=datetime.timezone.utc) + + # Currently, this matches range boundaries exactly, with no regard for + # ranges that contain or are contained by it. + last_check_time = self.db.checked_range_get_last_date( + self.datastore_info(), + range_start_swhid, + range_end_swhid, + ) + + if last_check_time is not None: + # TODO: re-check if 'last_check_time' was a long time ago. + logger.debug( + "Skipping processing of %s range %s to %s: already done at %s", + self.object_type, + backfill._format_range_bound(range_start), + backfill._format_range_bound(range_end), + last_check_time, + ) + continue + logger.debug( "Processing %s range %s to %s", self.object_type, diff --git a/swh/scrubber/tests/test_storage_postgresql.py b/swh/scrubber/tests/test_storage_postgresql.py --- a/swh/scrubber/tests/test_storage_postgresql.py +++ b/swh/scrubber/tests/test_storage_postgresql.py @@ -296,6 +296,47 @@ assert_checked_ranges(scrubber_db, datastore, _long_ranges("snp")) +@patch_byte_ranges +def test_no_recheck(scrubber_db, datastore, swh_storage): + """ + Tests that objects that were already checked are not checked again on + the next run. 
+ """ + # Corrupt two snapshots + snapshots = list(swh_model_data.SNAPSHOTS) + for i in (0, 1): + snapshots[i] = attr.evolve(snapshots[i], id=bytes([i]) * 20) + swh_storage.snapshot_add(snapshots) + + # Mark ranges as already checked + now = datetime.datetime.now(tz=datetime.timezone.utc) + for (range_start, range_end) in EXPECTED_RANGES: + scrubber_db.checked_range_upsert(datastore, range_start, range_end, now) + + StorageChecker( + db=scrubber_db, + storage=swh_storage, + object_type="snapshot", + start_object="00" * 20, + end_object="ff" * 20, + ).run() + + corrupt_objects = list(scrubber_db.corrupt_object_iter()) + assert ( + corrupt_objects == [] + ), "Detected corrupt objects in ranges that should have been skipped." + + # Make sure the DB was not changed (in particular, that timestamps were not bumped) + ranges = [ + (str(range_start), str(range_end), date) + for (range_start, range_end, date) in scrubber_db.checked_range_iter(datastore) + ] + ranges.sort(key=str) + assert ranges == [ + (range_start, range_end, now) for (range_start, range_end) in EXPECTED_RANGES + ] + + @patch_byte_ranges def test_no_hole(scrubber_db, datastore, swh_storage): swh_storage.content_add([CONTENT1])