diff --git a/debian/control b/debian/control index 54ebfa0..37e265b 100644 --- a/debian/control +++ b/debian/control @@ -1,38 +1,32 @@ Source: swh-objstorage Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-all, python3-flask, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.28~), python3-swh.model (>= 0.0.14~), python3-swh.storage.archiver (>= 0.0.52~), python3-click, python3-libcloud, python3-azure-storage, python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DOBJS/ Package: python3-swh.objstorage Architecture: all Depends: python3-swh.core (>= 0.0.28~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Object Storage -Package: python3-swh.objstorage.checker -Architecture: all -Depends: python3-swh.objstorage (= ${binary:Version}), python3-swh.storage.archiver (>= 0.0.52~), - ${misc:Depends}, ${python3:Depends} -Description: Software Heritage Object Storage Checker - Package: python3-swh.objstorage.cloud Architecture: all Depends: python3-swh.objstorage (= ${binary:Version}), python3-libcloud, python3-azure-storage, ${misc:Depends}, ${python3:Depends} Breaks: python3-swh.objstorage (<= 0.0.7~) Description: Software Heritage Cloud Object Storage diff --git a/debian/rules b/debian/rules index c7c7d00..3382634 100755 --- a/debian/rules +++ b/debian/rules @@ -1,22 +1,19 @@ #!/usr/bin/make -f export PYBUILD_NAME=swh.objstorage %: dh $@ --with python3 --buildsystem=pybuild override_dh_install: dh_install for pyvers in $(shell py3versions -vr); do \ - mkdir -p $(CURDIR)/debian/python3-swh.objstorage.checker/usr/lib/python$$pyvers/dist-packages/swh/objstorage/ ; \ - mv $(CURDIR)/debian/python3-swh.objstorage/usr/lib/python$$pyvers/dist-packages/swh/objstorage/checker.py \ - $(CURDIR)/debian/python3-swh.objstorage.checker/usr/lib/python$$pyvers/dist-packages/swh/objstorage/ ; \ mkdir -p $(CURDIR)/debian/python3-swh.objstorage.cloud/usr/lib/python$$pyvers/dist-packages/swh/objstorage/cloud ; \ mv $(CURDIR)/debian/python3-swh.objstorage/usr/lib/python$$pyvers/dist-packages/swh/objstorage/cloud/* \ $(CURDIR)/debian/python3-swh.objstorage.cloud/usr/lib/python$$pyvers/dist-packages/swh/objstorage/cloud/ ; \ done override_dh_auto_test: PYBUILD_SYSTEM=custom \ PYBUILD_TEST_ARGS="cd {build_dir}; python{version} -m nose swh -sva '!db'" \ dh_auto_test diff --git a/swh/objstorage/checker.py b/swh/objstorage/checker.py deleted file mode 100644 index 08dd81c..0000000 --- a/swh/objstorage/checker.py +++ /dev/null @@ -1,270 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import abc -import click -import logging - -from swh.core import config -from swh.storage.archiver.storage import ArchiverStorage - -from swh.objstorage import get_objstorage -from swh.objstorage.exc import ObjNotFoundError, Error - - -class BaseContentChecker(config.SWHConfig, metaclass=abc.ABCMeta): - """Abstract class of the content integrity checker. - - This checker's purpose is to iterate over the contents of a storage and - check the integrity of each file. - Behavior of the checker to deal with corrupted status will be specified - by subclasses. - - You should override the DEFAULT_CONFIG and CONFIG_BASE_FILENAME - variables if you need it. - - """ - DEFAULT_CONFIG = { - 'storage': ('dict', { - 'cls': 'pathslicing', - 'args': { - 'root': '/srv/softwareheritage/objects', - 'slicing': '0:2/2:4/4:6' - } - }), - 'batch_size': ('int', 1000), - } - - CONFIG_BASE_FILENAME = 'objstorage/objstorage_checker' - - def __init__(self): - """ Create a checker that ensure the objstorage have no corrupted file - """ - self.config = self.parse_config_file() - self.objstorage = get_objstorage(**self.config['storage']) - self.batch_size = self.config['batch_size'] - - def run_as_daemon(self): - """ Start the check routine and perform it forever. - - Use this method to run the checker as a daemon that will iterate over - the content forever in background. - """ - while True: - try: - self.run() - except: - pass - - def run(self): - """ Check a batch of content. - """ - for obj_id in self._get_content_to_check(self.batch_size): - cstatus = self._check_content(obj_id) - if cstatus == 'corrupted': - self.corrupted_content(obj_id) - elif cstatus == 'missing': - self.missing_content(obj_id) - - def _get_content_to_check(self, batch_size): - """ Get the content that should be verified. - - Returns: - An iterable of the content's id that need to be checked. - """ - yield from self.objstorage.get_random(batch_size) - - def _check_content(self, obj_id): - """ Check the validity of the given content. - - Returns: - True if the content was valid, false if it was corrupted. - """ - try: - self.objstorage.check(obj_id) - except ObjNotFoundError: - return 'missing' - except Error: - return 'corrupted' - - @abc.abstractmethod - def corrupted_content(self, obj_id): - """ Perform an action to treat with a corrupted content. - """ - raise NotImplementedError("%s must implement " - "'corrupted_content' method" % type(self)) - - @abc.abstractmethod - def missing_content(self, obj_id): - """ Perform an action to treat with a missing content. - """ - raise NotImplementedError("%s must implement " - "'missing_content' method" % type(self)) - - -class LogContentChecker(BaseContentChecker): - """ Content integrity checker that just log detected errors. - """ - - DEFAULT_CONFIG = { - 'storage': ('dict', { - 'cls': 'pathslicing', - 'args': { - 'root': '/srv/softwareheritage/objects', - 'slicing': '0:2/2:4/4:6' - } - }), - 'batch_size': ('int', 1000), - 'log_tag': ('str', 'objstorage.checker') - } - - CONFIG_BASE_FILENAME = 'objstorage/log_checker' - - def __init__(self): - super().__init__() - self.logger = logging.getLogger(self.config['log_tag']) - - def corrupted_content(self, obj_id): - """ Perform an action to treat with a corrupted content. - """ - self.logger.error('Content %s is corrupted' % obj_id) - - def missing_content(self, obj_id): - """ Perform an action to treat with a missing content. - """ - self.logger.error('Content %s is detected missing' % obj_id) - - -class RepairContentChecker(LogContentChecker): - """ Content integrity checker that will try to restore contents. - """ - - DEFAULT_CONFIG = { - 'storage': ('dict', { - 'cls': 'pathslicing', - 'args': { - 'root': '/srv/softwareheritage/objects', - 'slicing': '0:2/2:4/4:6' - } - }), - 'batch_size': ('int', 1000), - 'log_tag': ('str', 'objstorage.checker'), - 'backup_storages': ('dict', { - 'banco': { - 'cls': 'remote', - 'args': {'url': 'http://banco:5003/'} - } - }) - } - - CONFIG_BASE_FILENAME = 'objstorage/repair_checker' - - def __init__(self): - super().__init__() - self.backups = [ - get_objstorage(**storage) - for name, storage in self.config['backup_storages'].items() - ] - - def corrupted_content(self, obj_id): - """ Perform an action to treat with a corrupted content. - """ - super().corrupted_content(obj_id) - self._restore(obj_id) - - def missing_content(self, obj_id): - """ Perform an action to treat with a missing content. - """ - super().missing_content(obj_id) - self._restore(obj_id) - - def _restore(self, obj_id): - if not self._perform_restore(obj_id): - # Object could not be restored - self.logger.critical( - 'Object %s is corrupted and could not be repaired' % obj_id - ) - - def _perform_restore(self, obj_id): - """ Try to restore the object in the current storage using the backups - """ - for backup in self.backups: - try: - content = backup.get(obj_id) - self.objstorage.restore(content, obj_id) - except ObjNotFoundError as e: - continue - else: - # Return True direclty when a backup contains the object - return True - # No backup contains the object - return False - - -class ArchiveNotifierContentChecker(LogContentChecker): - """ Implementation of the checker that will update the archiver database - - Once the database is updated the archiver may restore the content on it's - next scheduling as it won't be present anymore, and this status change - will probably make the retention policy invalid. - """ - DEFAULT_CONFIG = { - 'storage': ('dict', { - 'cls': 'pathslicing', - 'args': { - 'root': '/srv/softwareheritage/objects', - 'slicing': '0:2/2:4/4:6' - } - }), - 'batch_size': ('int', 1000), - 'log_tag': ('str', 'objstorage.checker'), - 'storage_name': ('str', 'banco'), - 'dbconn': ('str', 'dbname=softwareheritage-archiver-dev') - } - - CONFIG_BASE_FILENAME = 'objstorage/archive_notifier_checker' - - def __init__(self): - super().__init__() - self.archiver_db = ArchiverStorage(self.config['dbconn']) - self.storage_name = self.config['storage_name'] - - def corrupted_content(self, obj_id): - """ Perform an action to treat with a corrupted content. - """ - super().corrupted_content(obj_id) - self._update_status(obj_id, 'corrupted') - - def missing_content(self, obj_id): - """ Perform an action to treat with a missing content. - """ - super().missing_content(obj_id) - self._update_status(obj_id, 'missing') - - def _update_status(self, obj_id, status): - self.archiver_db.content_archive_update(obj_id, self.storage_name, - new_status=status) - - -@click.command() -@click.argument('checker-type', required=1, default='log') -@click.option('--daemon/--nodaemon', default=True, - help='Indicates if the checker should run forever ' - 'or on a single batch of content') -def launch(checker_type, daemon): - types = { - 'log': LogContentChecker, - 'repair': RepairContentChecker, - 'archiver_notifier': ArchiveNotifierContentChecker - } - checker = types[checker_type]() - if daemon: - checker.run_as_daemon() - else: - checker.run() - - -if __name__ == '__main__': - launch() diff --git a/swh/objstorage/tests/test_checker.py b/swh/objstorage/tests/test_checker.py deleted file mode 100644 index 57f75cb..0000000 --- a/swh/objstorage/tests/test_checker.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (C) 2015-2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import gzip -import tempfile -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.objstorage.exc import ObjNotFoundError -from swh.objstorage.checker import RepairContentChecker -from swh.model import hashutil - - -class MockBackupObjStorage(): - - def __init__(self): - self.values = {} - - def add(self, value, obj_id): - self.values[obj_id] = value - - def get(self, obj_id): - try: - return self.values[obj_id] - except KeyError: - raise ObjNotFoundError(obj_id) - - -@attr('fs') -class TestRepairChecker(unittest.TestCase): - """ Test the content integrity checker - """ - - def setUp(self): - super().setUp() - self._alter_config() - self.checker = RepairContentChecker() - self.checker.backups = [MockBackupObjStorage(), - MockBackupObjStorage()] - - def _alter_config(self): - RepairContentChecker.parse_config_file = ( - lambda cls: { - 'storage': {'cls': 'pathslicing', - 'args': {'root': tempfile.mkdtemp(), - 'slicing': '0:2/2:4/4:6'}}, - 'batch_size': 1000, - 'log_tag': 'objstorage_test', - 'backup_storages': {} - } - ) - - def _corrupt_content(self, obj_id): - """ Make the given content invalid. - """ - hex_obj_id = hashutil.hash_to_hex(obj_id) - file_path = self.checker.objstorage._obj_path(hex_obj_id) - with gzip.open(file_path, 'wb') as f: - f.write(b'Unexpected content') - - def _is_corrupted(self, obj_id): - """ Ensure the given object is corrupted - """ - return self.checker._check_content(obj_id) == 'corrupted' - - def _is_missing(self, obj_id): - """ Ensure the given object is missing - """ - return self.checker._check_content(obj_id) == 'missing' - - @istest - def check_valid_content(self): - # Check that a valid content is valid. - content = b'check_valid_content' - obj_id = self.checker.objstorage.add(content) - self.assertFalse(self._is_corrupted(obj_id)) - self.assertFalse(self._is_missing(obj_id)) - - @istest - def check_corrupted_content(self): - # Check that an invalid content is noticed. - content = b'check_corrupted_content' - obj_id = self.checker.objstorage.add(content) - self._corrupt_content(obj_id) - self.assertTrue(self._is_corrupted(obj_id)) - self.assertFalse(self._is_missing(obj_id)) - - @istest - def check_missing_content(self): - obj_id = hashutil.hash_data(b'check_missing_content')['sha1'] - self.assertFalse(self._is_corrupted(obj_id)) - self.assertTrue(self._is_missing(obj_id)) - - @istest - def repair_content_present_first(self): - # Try to repair a content that is in the backup storage. - content = b'repair_content_present_first' - obj_id = self.checker.objstorage.add(content) - # Add a content to the mock - self.checker.backups[0].add(content, obj_id) - # Corrupt and repair it. - self._corrupt_content(obj_id) - self.assertTrue(self._is_corrupted(obj_id)) - self.checker.corrupted_content(obj_id) - self.assertFalse(self._is_corrupted(obj_id)) - - @istest - def repair_content_present_second(self): - # Try to repair a content that is in the backup storage. - content = b'repair_content_present_first' - obj_id = self.checker.objstorage.add(content) - # Add a content to the mock - self.checker.backups[-1].add(content, obj_id) - # Corrupt and repair it. - self._corrupt_content(obj_id) - self.assertTrue(self._is_corrupted(obj_id)) - self.checker.corrupted_content(obj_id) - self.assertFalse(self._is_corrupted(obj_id)) - - @istest - def repair_content_present_distributed(self): - # Try to repair two contents that are in separate backup storages. - content1 = b'repair_content_present_distributed_2' - content2 = b'repair_content_present_distributed_1' - obj_id1 = self.checker.objstorage.add(content1) - obj_id2 = self.checker.objstorage.add(content2) - # Add content to the mock. - self.checker.backups[0].add(content1, obj_id1) - self.checker.backups[1].add(content2, obj_id2) - # Corrupt the contents - self._corrupt_content(obj_id1) - self._corrupt_content(obj_id2) - self.assertTrue(self._is_corrupted(obj_id1)) - self.assertTrue(self._is_corrupted(obj_id2)) - # Repare them - self.checker.corrupted_content(obj_id1) - self.checker.corrupted_content(obj_id2) - self.assertFalse(self._is_corrupted(obj_id1)) - self.assertFalse(self._is_corrupted(obj_id2)) - - @istest - def repair_content_missing(self): - # Try to repair a content that is NOT in the backup storage. - content = b'repair_content_missing' - obj_id = self.checker.objstorage.add(content) - # Corrupt the content - self._corrupt_content(obj_id) - self.assertTrue(self._is_corrupted(obj_id)) - # Try to repair it - self.checker.corrupted_content(obj_id) - self.assertTrue(self._is_corrupted(obj_id))