diff --git a/PKG-INFO b/PKG-INFO new file mode 100644 index 0000000..6676e2c --- /dev/null +++ b/PKG-INFO @@ -0,0 +1,65 @@ +Metadata-Version: 2.1 +Name: swh.scrubber +Version: 0.0.1 +Summary: Software Heritage Datastore Scrubber +Home-page: https://forge.softwareheritage.org/diffusion/swh-scrubber +Author: Software Heritage developers +Author-email: swh-devel@inria.fr +License: UNKNOWN +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, https://www.softwareheritage.org/donate +Project-URL: Source, https://forge.softwareheritage.org/source/swh-scrubber +Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scrubber/ +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) +Classifier: Operating System :: OS Independent +Classifier: Development Status :: 3 - Alpha +Requires-Python: >=3.7 +Description-Content-Type: text/x-rst +Provides-Extra: testing +License-File: LICENSE +License-File: AUTHORS + +Software Heritage - Datastore Scrubber +====================================== + +Tools to periodically check data integrity in swh-storage and swh-objstorage, +report errors, and (try to) fix them. + +This is a work in progress; some of the components described below do not +exist yet (cassandra storage checker, objstorage checker, recovery, and reinjection). + +The Scrubber package is made of the following parts: + + +Checking +-------- + +Highly parallel processes continuously read objects from a data store, +compute checksums, and write any failure in a database, along with the data of +the corrupt object. + +There is one "checker" for each datastore package: storage (postgresql and cassandra), +journal (kafka), and objstorage. 
+ + +Recovery +-------- + +Then, from time to time, jobs go through the list of known corrupt objects, +and try to recover the original objects, through various means: + +* Brute-forcing variations until they match their checksum +* Recovering from another data store +* As a last resort, recovering from known origins, if any + + +Reinjection +----------- + +Finally, when an original object is recovered, it is reinjected in the original +data store, replacing the corrupt one. + + diff --git a/README.rst b/README.rst deleted file mode 120000 index cffceba..0000000 --- a/README.rst +++ /dev/null @@ -1 +0,0 @@ -docs/README.rst \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..bc3ab62 --- /dev/null +++ b/README.rst @@ -0,0 +1,39 @@ +Software Heritage - Datastore Scrubber +====================================== + +Tools to periodically check data integrity in swh-storage and swh-objstorage, +report errors, and (try to) fix them. + +This is a work in progress; some of the components described below do not +exist yet (cassandra storage checker, objstorage checker, recovery, and reinjection). + +The Scrubber package is made of the following parts: + + +Checking +-------- + +Highly parallel processes continuously read objects from a data store, +compute checksums, and write any failure in a database, along with the data of +the corrupt object. + +There is one "checker" for each datastore package: storage (postgresql and cassandra), +journal (kafka), and objstorage. 
+ + +Recovery +-------- + +Then, from time to time, jobs go through the list of known corrupt objects, +and try to recover the original objects, through various means: + +* Brute-forcing variations until they match their checksum +* Recovering from another data store +* As a last resort, recovering from known origins, if any + + +Reinjection +----------- + +Finally, when an original object is recovered, it is reinjected in the original +data store, replacing the corrupt one. diff --git a/setup.cfg b/setup.cfg index 8d79b7e..1d722c2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,8 @@ [flake8] -# E203: whitespaces before ':' -# E231: missing whitespace after ',' -# W503: line break before binary operator ignore = E203,E231,W503 max-line-length = 88 + +[egg_info] +tag_build = +tag_date = 0 +