diff --git a/PKG-INFO b/PKG-INFO index 32edb0f..bd20acf 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,61 +1,61 @@ Metadata-Version: 2.1 Name: swh.scrubber -Version: 0.0.4 +Version: 0.0.5 Summary: Software Heritage Datastore Scrubber Home-page: https://forge.softwareheritage.org/diffusion/swh-scrubber Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scrubber Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scrubber/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Datastore Scrubber ====================================== Tools to periodically check data integrity in swh-storage and swh-objstorage, report errors, and (try to) fix them. This is a work in progress; some of the components described below do not exist yet (cassandra storage checker, objstorage checker, recovery, and reinjection). The Scrubber package is made of the following parts: Checking -------- Highly parallel processes continuously read objects from a data store, compute checksums, and write any failure in a database, along with the data of the corrupt object. There is one "checker" for each datastore package: storage (postgresql and cassandra), journal (kafka), and objstorage. 
Recovery -------- Then, from time to time, jobs go through the list of known corrupt objects, and try to recover the original objects, through various means: * Brute-forcing variations until they match their checksum * Recovering from another data store * As a last resort, recovering from known origins, if any Reinjection ----------- Finally, when an original object is recovered, it is reinjected into the original data store, replacing the corrupt one. diff --git a/swh.scrubber.egg-info/PKG-INFO b/swh.scrubber.egg-info/PKG-INFO index 32edb0f..bd20acf 100644 --- a/swh.scrubber.egg-info/PKG-INFO +++ b/swh.scrubber.egg-info/PKG-INFO @@ -1,61 +1,61 @@ Metadata-Version: 2.1 Name: swh.scrubber -Version: 0.0.4 +Version: 0.0.5 Summary: Software Heritage Datastore Scrubber Home-page: https://forge.softwareheritage.org/diffusion/swh-scrubber Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scrubber Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scrubber/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Datastore Scrubber ====================================== Tools to periodically check data integrity in swh-storage and swh-objstorage, report errors, and (try to) fix them. 
This is a work in progress; some of the components described below do not exist yet (cassandra storage checker, objstorage checker, recovery, and reinjection). The Scrubber package is made of the following parts: Checking -------- Highly parallel processes continuously read objects from a data store, compute checksums, and write any failure in a database, along with the data of the corrupt object. There is one "checker" for each datastore package: storage (postgresql and cassandra), journal (kafka), and objstorage. Recovery -------- Then, from time to time, jobs go through the list of known corrupt objects, and try to recover the original objects, through various means: * Brute-forcing variations until they match their checksum * Recovering from another data store * As a last resort, recovering from known origins, if any Reinjection ----------- Finally, when an original object is recovered, it is reinjected into the original data store, replacing the corrupt one. diff --git a/swh.scrubber.egg-info/SOURCES.txt b/swh.scrubber.egg-info/SOURCES.txt index 964ff87..69c4c3b 100644 --- a/swh.scrubber.egg-info/SOURCES.txt +++ b/swh.scrubber.egg-info/SOURCES.txt @@ -1,54 +1,55 @@ .git-blame-ignore-revs .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.rst conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/README.rst docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.scrubber.egg-info/PKG-INFO swh.scrubber.egg-info/SOURCES.txt swh.scrubber.egg-info/dependency_links.txt swh.scrubber.egg-info/entry_points.txt swh.scrubber.egg-info/requires.txt swh.scrubber.egg-info/top_level.txt swh/scrubber/__init__.py swh/scrubber/cli.py swh/scrubber/db.py swh/scrubber/fixer.py swh/scrubber/journal_checker.py swh/scrubber/origin_locator.py swh/scrubber/py.typed 
swh/scrubber/storage_checker.py swh/scrubber/utils.py swh/scrubber/sql/20-enums.sql swh/scrubber/sql/30-schema.sql swh/scrubber/sql/60-indexes.sql +swh/scrubber/sql/upgrades/2.sql swh/scrubber/tests/__init__.py swh/scrubber/tests/conftest.py swh/scrubber/tests/test_cli.py swh/scrubber/tests/test_fixer.py swh/scrubber/tests/test_init.py swh/scrubber/tests/test_journal_kafka.py swh/scrubber/tests/test_origin_locator.py swh/scrubber/tests/test_storage_postgresql.py \ No newline at end of file diff --git a/swh/scrubber/sql/upgrades/2.sql b/swh/scrubber/sql/upgrades/2.sql new file mode 100644 index 0000000..745a988 --- /dev/null +++ b/swh/scrubber/sql/upgrades/2.sql @@ -0,0 +1,22 @@ +-- SWH Scrubber DB schema upgrade +-- from_version: 1 +-- to_version: 2 +-- description: Add the fixed_object table + +create table fixed_object +( + id swhid not null, + object bytea not null, + method text, + recovery_date timestamptz not null default now() +); + +comment on table fixed_object is 'Each row identifies an object that was found to be corrupt, along with the original version of the object'; +comment on column fixed_object.object is 'The recovered object itself, as a msgpack-encoded dict'; +comment on column fixed_object.recovery_date is 'Moment the object was recovered.'; +comment on column fixed_object.method is 'How the object was recovered. For example: "from_origin", "negative_utc", "capitalized_revision_parent".'; + +-- fixed_object: build the unique index on id, then promote it to the primary key + +create unique index concurrently fixed_object_pkey on fixed_object(id); +alter table fixed_object add primary key using index fixed_object_pkey;