Differential D7360 Diff 26636 swh/scrubber/tests/test_storage_postgresql.py

Changeset View

Standalone View

swh/scrubber/tests/test_storage_postgresql.py

This file was added.

				# Copyright (C) 2022 The Software Heritage developers
				# See the AUTHORS file at the top-level directory of this distribution
				# License: GNU General Public License version 3, or any later version
				# See top-level LICENSE file for more information

				import datetime
				import unittest.mock

				import attr
				import pytest

				from swh.journal.serializers import kafka_to_value
				from swh.model import swhids
				from swh.model.tests import swh_model_data
				from swh.scrubber.check_storage import StorageChecker
				from swh.storage.backfill import byte_ranges

				# Decorator that swaps swh.storage.backfill.byte_ranges for a wrapper calling
				# the real function with numbits // 8, so that fewer ranges are used and the
				# tests run faster.
				def _coarse_byte_ranges(numbits, start, end):
				    """Delegate to the real byte_ranges with numbits integer-divided by 8."""
				    return byte_ranges(numbits // 8, start, end)


				patch_byte_ranges = unittest.mock.patch(
				    "swh.storage.backfill.byte_ranges",
				    new=_coarse_byte_ranges,
				)


				@patch_byte_ranges
				# Review comment, ardumont (unsubmitted, not done): neat, I did not realize we could do that ;)
				def test_no_corruption(scrubber_db, swh_storage):
				    """Check that scrubbing unmodified swh_model_data objects reports nothing.

				    Directories, revisions, releases and snapshots are added as-is, each
				    object type is checked from "00" * 20 to "ff" * 20, and the scrubber
				    database must then contain no corrupt object.
				    """
				    swh_storage.directory_add(swh_model_data.DIRECTORIES)
				    swh_storage.revision_add(swh_model_data.REVISIONS)
				    swh_storage.release_add(swh_model_data.RELEASES)
				    swh_storage.snapshot_add(swh_model_data.SNAPSHOTS)

				    # Same id bounds for every checker run.
				    id_bounds = {"start_object": "00" * 20, "end_object": "ff" * 20}
				    for checked_type in ("snapshot", "release", "revision", "directory"):
				        checker = StorageChecker(
				            db=scrubber_db,
				            storage=swh_storage,
				            object_type=checked_type,
				            **id_bounds,
				        )
				        checker.check_storage()

				    reported = list(scrubber_db.corrupt_object_iter())
				    assert reported == []


				# Parametrized over every index of swh_model_data.SNAPSHOTS: the snapshot at
				# corrupt_idx gets its id replaced, all four object types are checked, and
				# exactly one corrupt object (that snapshot) must be recorded, with the
				# expected datastore fields, occurrence date and serialized payload.
				@pytest.mark.parametrize("corrupt_idx", range(len(swh_model_data.SNAPSHOTS)))
				@patch_byte_ranges
				def test_corrupt_snapshot(scrubber_db, swh_storage, corrupt_idx):
				snapshots = list(swh_model_data.SNAPSHOTS)
				# Copy of the selected snapshot with its id overridden by 20 zero bytes
				# (presumably no longer matching the id computed from its content).
				snapshots[corrupt_idx] = attr.evolve(snapshots[corrupt_idx], id=b"\x00" * 20)
				swh_storage.snapshot_add(snapshots)

				# Timestamps taken around the check bound first_occurrence further down.
				before_date = datetime.datetime.now(tz=datetime.timezone.utc)
				for object_type in ("snapshot", "release", "revision", "directory"):
				StorageChecker(
				db=scrubber_db,
				storage=swh_storage,
				object_type=object_type,
				start_object="00" * 20,
				end_object="ff" * 20,
				).check_storage()
				after_date = datetime.datetime.now(tz=datetime.timezone.utc)

				corrupt_objects = list(scrubber_db.corrupt_object_iter())
				# Only the snapshot whose id was replaced may be reported.
				assert len(corrupt_objects) == 1
				assert corrupt_objects[0].id == swhids.CoreSWHID.from_string(
				"swh:1:snp:0000000000000000000000000000000000000000"
				)
				assert corrupt_objects[0].datastore.package == "storage"
				assert corrupt_objects[0].datastore.cls == "postgresql"
				assert corrupt_objects[0].datastore.instance.startswith(
				"user=postgres password=xxx dbname=storage host="
				)
				# first_occurrence must fall within the check window, with 5 seconds of
				# slack on each side (presumably to tolerate clock differences with the
				# database -- TODO confirm).
				assert (
				before_date - datetime.timedelta(seconds=5)
				<= corrupt_objects[0].first_occurrence
				<= after_date + datetime.timedelta(seconds=5)
				)
				assert (
				# The recorded payload, decoded with kafka_to_value, must equal the dict
				# form of the snapshot that was stored with the replaced id.
				kafka_to_value(corrupt_objects[0].object_) == snapshots[corrupt_idx].to_dict()
				ardumontUnsubmitted Not Done Inline Actions not a big fan of the object_ name... ardumont: not a big fan of the object_ name...
				vlorentzAuthorUnsubmitted Done Inline Actions I couldn't think of a better name. `obj` could work too, but it is discouraged by PEP 8: If your public attribute name collides with a reserved keyword, append a single trailing underscore to your attribute name. This is preferable to an abbreviation or corrupted spelling. https://peps.python.org/pep-0008/#designing-for-inheritance vlorentz: I couldn't think of a better name. `obj` could work too, but it is discouraged by PEP 8: > If…
				)


				@patch_byte_ranges
				def test_corrupt_snapshots(scrubber_db, swh_storage):
				    """Check that two snapshots stored with replaced ids are both reported.

				    The first two snapshots of swh_model_data.SNAPSHOTS get their ids
				    replaced by 20 bytes of 0x00 and 0x01 respectively; after checking the
				    "snapshot" object type, exactly these two ids must be recorded.
				    """
				    snapshots = list(swh_model_data.SNAPSHOTS)
				    for position, fill_byte in enumerate((b"\x00", b"\x01")):
				        snapshots[position] = attr.evolve(snapshots[position], id=fill_byte * 20)
				    swh_storage.snapshot_add(snapshots)

				    checker = StorageChecker(
				        db=scrubber_db,
				        storage=swh_storage,
				        object_type="snapshot",
				        start_object="00" * 20,
				        end_object="ff" * 20,
				    )
				    checker.check_storage()

				    reported = list(scrubber_db.corrupt_object_iter())
				    assert len(reported) == 2

				    expected_ids = set(
				        map(
				            swhids.CoreSWHID.from_string,
				            (
				                "swh:1:snp:0000000000000000000000000000000000000000",
				                "swh:1:snp:0101010101010101010101010101010101010101",
				            ),
				        )
				    )
				    assert {entry.id for entry in reported} == expected_ids
				# Review comment, ardumont (unsubmitted, not done): That's actually the same test: one on a specific snapshot, another on multiple snapshots. Is it necessary to keep the first one?
				# Review reply, vlorentz (author, unsubmitted, done): The first one checks that the checker does not report all objects after the first corrupt one, for example.