diff --git a/PKG-INFO b/PKG-INFO index 32edb0f..bd20acf 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,61 +1,61 @@ Metadata-Version: 2.1 Name: swh.scrubber -Version: 0.0.4 +Version: 0.0.5 Summary: Software Heritage Datastore Scrubber Home-page: https://forge.softwareheritage.org/diffusion/swh-scrubber Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scrubber Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scrubber/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Datastore Scrubber ====================================== Tools to periodically checks data integrity in swh-storage and swh-objstorage, reports errors, and (try to) fix them. This is a work in progress; some of the components described below do not exist yet (cassandra storage checker, objstorage checker, recovery, and reinjection) The Scrubber package is made of the following parts: Checking -------- Highly parallel processes continuously read objects from a data store, compute checksums, and write any failure in a database, along with the data of the corrupt object. There is one "checker" for each datastore package: storage (postgresql and cassandra), journal (kafka), and objstorage. 
Recovery -------- Then, from time to time, jobs go through the list of known corrupt objects, and try to recover the original objects, through various means: * Brute-forcing variations until they match their checksum * Recovering from another data store * As a last resort, recovering from known origins, if any Reinjection ----------- Finally, when an original object is recovered, it is reinjected in the original data store, replacing the corrupt one. diff --git a/debian/changelog b/debian/changelog index 1c1d1f7..48bfb48 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,54 +1,58 @@ -swh-scrubber (0.0.4-1~swh1~bpo10+1) buster-swh; urgency=medium +swh-scrubber (0.0.5-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 0.0.5 - (tagged by Antoine R. Dumont + (@ardumont) on 2022-05-30 17:55:26 + +0200) + * Upstream changes: - v0.0.5 - Reference the scrubber db model + upgrade (from version 1 to 2) - -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 15:43:20 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 16:02:37 +0000 swh-scrubber (0.0.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.4 - (tagged by Antoine R. Dumont (@ardumont) on 2022-05-30 17:35:51 +0200) * Upstream changes: - v0.0.4 - Recursive include the swh/scrubber/sql folder -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 15:40:57 +0000 swh-scrubber (0.0.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.3 - (tagged by Antoine R. Dumont (@ardumont) on 2022-05-30 15:45:51 +0200) * Upstream changes: - v0.0.3 - Unify factory to use keyword 'postgresql' over deprecated 'local' - db: Bump to version 2 - requirements: Add missing dependency -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 13:49:46 +0000 swh-scrubber (0.0.2-1~swh2) unstable-swh; urgency=medium * Update dependencies and bump new release -- Antoine R. 
Dumont (@ardumont) Mon, 30 May 2022 14:25:52 +0200 swh-scrubber (0.0.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.2 - (tagged by Antoine R. Dumont (@ardumont) on 2022-05-30 10:15:15 +0200) * Upstream changes: - v0.0.2 - Add Fixer class, which re-loads corrupt objects from origins - Fix crash when using datastore_get_or_add for an existing datastore - Internals - -------- - requirements-test: Remove pytest pinning to < 7 - add strict asyncio_mode in pytest.ini - Bump mypy to v0.942 - Add .git-blame-ignore-revs file with automatic reformatting commits - python: Reformat code with black 22.3.0 - pre-commit, tox: Bump black from 19.10b0 to 22.3.0 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 08:19:10 +0000 swh-scrubber (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release -- Nicolas Dandrimont Thu, 31 Mar 2022 19:29:54 +0200 diff --git a/swh.scrubber.egg-info/PKG-INFO b/swh.scrubber.egg-info/PKG-INFO index 32edb0f..bd20acf 100644 --- a/swh.scrubber.egg-info/PKG-INFO +++ b/swh.scrubber.egg-info/PKG-INFO @@ -1,61 +1,61 @@ Metadata-Version: 2.1 Name: swh.scrubber -Version: 0.0.4 +Version: 0.0.5 Summary: Software Heritage Datastore Scrubber Home-page: https://forge.softwareheritage.org/diffusion/swh-scrubber Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scrubber Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scrubber/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing License-File: 
LICENSE License-File: AUTHORS Software Heritage - Datastore Scrubber ====================================== Tools to periodically checks data integrity in swh-storage and swh-objstorage, reports errors, and (try to) fix them. This is a work in progress; some of the components described below do not exist yet (cassandra storage checker, objstorage checker, recovery, and reinjection) The Scrubber package is made of the following parts: Checking -------- Highly parallel processes continuously read objects from a data store, compute checksums, and write any failure in a database, along with the data of the corrupt object. There is one "checker" for each datastore package: storage (postgresql and cassandra), journal (kafka), and objstorage. Recovery -------- Then, from time to time, jobs go through the list of known corrupt objects, and try to recover the original objects, through various means: * Brute-forcing variations until they match their checksum * Recovering from another data store * As a last resort, recovering from known origins, if any Reinjection ----------- Finally, when an original object is recovered, it is reinjected in the original data store, replacing the corrupt one. 
diff --git a/swh.scrubber.egg-info/SOURCES.txt b/swh.scrubber.egg-info/SOURCES.txt index 964ff87..69c4c3b 100644 --- a/swh.scrubber.egg-info/SOURCES.txt +++ b/swh.scrubber.egg-info/SOURCES.txt @@ -1,54 +1,55 @@ .git-blame-ignore-revs .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.rst conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/README.rst docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.scrubber.egg-info/PKG-INFO swh.scrubber.egg-info/SOURCES.txt swh.scrubber.egg-info/dependency_links.txt swh.scrubber.egg-info/entry_points.txt swh.scrubber.egg-info/requires.txt swh.scrubber.egg-info/top_level.txt swh/scrubber/__init__.py swh/scrubber/cli.py swh/scrubber/db.py swh/scrubber/fixer.py swh/scrubber/journal_checker.py swh/scrubber/origin_locator.py swh/scrubber/py.typed swh/scrubber/storage_checker.py swh/scrubber/utils.py swh/scrubber/sql/20-enums.sql swh/scrubber/sql/30-schema.sql swh/scrubber/sql/60-indexes.sql +swh/scrubber/sql/upgrades/2.sql swh/scrubber/tests/__init__.py swh/scrubber/tests/conftest.py swh/scrubber/tests/test_cli.py swh/scrubber/tests/test_fixer.py swh/scrubber/tests/test_init.py swh/scrubber/tests/test_journal_kafka.py swh/scrubber/tests/test_origin_locator.py swh/scrubber/tests/test_storage_postgresql.py \ No newline at end of file diff --git a/swh/scrubber/sql/upgrades/2.sql b/swh/scrubber/sql/upgrades/2.sql new file mode 100644 index 0000000..745a988 --- /dev/null +++ b/swh/scrubber/sql/upgrades/2.sql @@ -0,0 +1,22 @@ +-- SWH Scrubber DB schema upgrade +-- from_version: 1 +-- to_version: 2 +-- description: Add the fixed_object table, which stores the recovered version of objects found to be corrupt + +create table fixed_object +( + id swhid not null, + object bytea not null, + method text, + recovery_date timestamptz not null default now() +); + +comment on table fixed_object is 
'Each row identifies an object that was found to be corrupt, along with the original version of the object'; +comment on column fixed_object.object is 'The recovered object itself, as a msgpack-encoded dict'; +comment on column fixed_object.recovery_date is 'Moment the object was recovered.'; +comment on column fixed_object.method is 'How the object was recovered. For example: "from_origin", "negative_utc", "capitalized_revision_parent".'; + +-- fixed_object: build the unique index on id first, then promote that index to the table's primary key + +create unique index concurrently fixed_object_pkey on fixed_object(id); +alter table fixed_object add primary key using index fixed_object_pkey;