diff --git a/swh/storage/archiver/director.py b/swh/storage/archiver/director.py index 99d33b2c0..5b70aa3f0 100644 --- a/swh/storage/archiver/director.py +++ b/swh/storage/archiver/director.py @@ -1,300 +1,306 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import click import sys from swh.core import config, utils, hashutil from swh.objstorage import get_objstorage from swh.scheduler.utils import get_task from . import tasks # noqa -from .storage import ArchiverStorage +from .storage import get_archiver_storage class ArchiverDirectorBase(config.SWHConfig, metaclass=abc.ABCMeta): """Abstract Director class An archiver director is in charge of dispatching batch of contents to archiver workers (for them to archive). Inherit from this class and provide: - ADDITIONAL_CONFIG: Some added configuration needed for the director to work - CONFIG_BASE_FILENAME: relative path to lookup for the configuration file - def get_contents_to_archive(self): Implementation method to read contents to archive """ DEFAULT_CONFIG = { 'batch_max_size': ('int', 1500), 'asynchronous': ('bool', True), - 'dbconn': ('str', 'dbname=softwareheritage-archiver-dev user=guest') + 'archiver_storage': ('dict', { + 'cls': 'db', + 'args': { + 'dbconn': 'dbname=softwareheritage-archiver-dev user=guest', + }, + }), } # Destined to be overridden by subclass ADDITIONAL_CONFIG = {} # We use the same configuration file as the worker CONFIG_BASE_FILENAME = 'archiver/worker' # The worker's task queue name to use TASK_NAME = None def __init__(self): """ Constructor of the archiver director. Args: db_conn_archiver: Either a libpq connection string, or a psycopg2 connection for the archiver db. config: optionnal additional configuration. 
def read_batch_contents(self):
    """Group the contents to archive into batches.

    Contents produced by `get_contents_to_archive` are accumulated and
    emitted as soon as a batch holds `batch_max_size` elements; a
    final, smaller batch is emitted for the leftover contents, if any.

    Yields:
        lists of at most `self.config['batch_max_size']` contents, in
        the order `get_contents_to_archive` produced them.

    """
    batch_max_size = self.config['batch_max_size']
    contents = []
    for content in self.get_contents_to_archive():
        contents.append(content)
        # `>=` and not `>`: the batch is full once it holds exactly
        # batch_max_size elements (the former `>` test let every batch
        # grow to batch_max_size + 1).
        if len(contents) >= batch_max_size:
            yield contents
            contents = []

    # Flush the last, partially filled batch
    if contents:
        yield contents
def get_contents_to_archive(self):
    """Page through the archiver storage and yield contents to archive.

    The storage is queried repeatedly through
    `content_archive_get_unarchived_copies`, each query resuming after
    the last content id seen, until a query returns no row.

    Yields:
        the content_id of every row returned by the storage. Rows are
        (content_id, present_copies, ongoing_copies) tuples; only the
        id is forwarded.

    """
    resume_after = None
    has_more = True
    while has_more:
        page = list(
            self.archiver_storage.content_archive_get_unarchived_copies(
                last_content=resume_after,
                retention_policy=self.config['retention_policy']))
        # An empty page ends the iteration once the (empty) loop below
        # has been skipped.
        has_more = bool(page)
        for content_id, _present, _ongoing in page:
            resume_after = content_id
            yield content_id
""" ADDITIONAL_CONFIG = { 'destination': ('str', 'azure'), 'force_copy': ('bool', False), 'source': ('str', 'uffizi'), 'storages': ('list[dict]', [ {'host': 'uffizi', 'cls': 'pathslicing', 'args': {'root': '/tmp/softwareheritage/objects', 'slicing': '0:2/2:4/4:6'}}, {'host': 'banco', 'cls': 'remote', 'args': {'base_url': 'http://banco:5003/'}} ]) } CONFIG_BASE_FILENAME = 'archiver/worker-to-backend' TASK_NAME = 'swh.storage.archiver.tasks.SWHArchiverToBackendTask' def __init__(self): super().__init__() self.destination = self.config['destination'] self.force_copy = self.config['force_copy'] self.objstorages = { storage['host']: get_objstorage(storage['cls'], storage['args']) for storage in self.config.get('storages', []) } # Fallback objstorage self.source = self.config['source'] def _add_unknown_content_ids(self, content_ids): """Check whether some content_id are unknown. If they are, add them to the archiver db. Args: content_ids: List of dict with one key content_id """ source_objstorage = self.objstorages[self.source] self.archiver_storage.content_archive_add( (h['content_id'] for h in content_ids if h['content_id'] in source_objstorage), sources_present=[self.source]) def get_contents_to_archive(self): gen_content_ids = ( ids for ids in utils.grouper(read_sha1_from_stdin(), self.config['batch_max_size'])) if self.force_copy: for content_ids in gen_content_ids: content_ids = list(content_ids) if not content_ids: continue # Add missing entries in archiver table self._add_unknown_content_ids(content_ids) print('Send %s contents to archive' % len(content_ids)) for content in content_ids: content_id = content['content_id'] # force its status to missing self.archiver_storage.content_archive_update( content_id, self.destination, 'missing') yield content_id else: for content_ids in gen_content_ids: content_ids = list(content_ids) # Add missing entries in archiver table self._add_unknown_content_ids(content_ids) # Filter already copied data content_ids = list( 
def __init__(self, dbconn):
    """Bind the storage to the archiver database.

    Args:
        dbconn: either a libpq connection string, or a psycopg2
            connection

    Raises:
        StorageDBError: when psycopg2 raises an OperationalError while
            the database handle is being set up.

    """
    try:
        already_connected = isinstance(
            dbconn, psycopg2.extensions.connection)
        # Wrap an existing connection as is, otherwise open a new one
        # from the connection string.
        self.db = (ArchiverDb(dbconn) if already_connected
                   else ArchiverDb.connect(dbconn))
    except psycopg2.OperationalError as e:
        raise StorageDBError(e)
""" yield from self.db.content_archive_get_unarchived_copies( retention_policy, last_content, limit, cur) @db_transaction_generator def content_archive_get_missing(self, content_ids, backend_name, cur=None): """Retrieve missing sha1s from source_name. Args: content_ids ([sha1s]): list of sha1s to test source_name (str): Name of the backend to check for content Yields: missing sha1s from backend_name """ db = self.db db.mktemp_content_archive() db.copy_to(content_ids, 'tmp_content_archive', ['content_id'], cur) for content_id in db.content_archive_get_missing(backend_name, cur): yield content_id[0] @db_transaction_generator def content_archive_get_unknown(self, content_ids, cur=None): """Retrieve unknown sha1s from content_archive. Args: content_ids ([sha1s]): list of sha1s to test Yields: Unknown sha1s from content_archive """ db = self.db db.mktemp_content_archive() db.copy_to(content_ids, 'tmp_content_archive', ['content_id'], cur) for content_id in db.content_archive_get_unknown(cur): yield content_id[0] @db_transaction def content_archive_update(self, content_id, archive_id, new_status=None, cur=None): """ Update the status of an archive content and set its mtime to now Change the mtime of an archived content for the given archive and set it's mtime to the current time. Args: content_id (str): content sha1 archive_id (str): name of the archive new_status (str): one of 'missing', 'present' or 'ongoing'. this status will replace the previous one. If not given, the function only change the mtime of the content for the given archive. """ self.db.content_archive_update(content_id, archive_id, new_status, cur) @db_transaction def content_archive_add( self, content_ids, sources_present, cur=None): """Insert a new entry in db about content_id. 
def get_archiver_storage(cls, args):
    """Build the archiver storage matching the given configuration.

    Args:
        cls (str): kind of archiver storage; only 'db' is supported
        args (dict): keyword arguments for the storage constructor

    Returns:
        an ArchiverStorage instance when cls is 'db'

    Raises:
        ValueError: when cls is not a known storage class

    """
    if cls != 'db':
        raise ValueError('Unknown Archiver Storage class `%s`' % cls)

    return ArchiverStorage(**args)
Inherit from this class and override: - ADDITIONAL_CONFIG: Some added configuration needed for the director to work - CONFIG_BASE_FILENAME: relative path to lookup for the configuration file - def need_archival(self, content_data): Determine if a content needs archival or not - def choose_backup_servers(self, present, missing): Choose which backup server to send copies to """ DEFAULT_CONFIG = { - 'dbconn': ('str', 'dbname=softwareheritage-archiver-dev'), + 'archiver_storage': ('dict', { + 'cls': 'db', + 'args': { + 'dbconn': 'dbname=softwareheritage-archiver-dev user=guest', + }, + }), 'storages': ('list[dict]', [ {'host': 'uffizi', 'cls': 'pathslicing', 'args': {'root': '/tmp/softwareheritage/objects', 'slicing': '0:2/2:4/4:6'}}, {'host': 'banco', 'cls': 'remote', 'args': {'base_url': 'http://banco:5003/'}} ]) } ADDITIONAL_CONFIG = {} CONFIG_BASE_FILENAME = 'archiver/worker' objstorages = {} def __init__(self, batch): super().__init__() self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) self.batch = batch - self.archiver_db = ArchiverStorage(self.config['dbconn']) + self.archiver_db = get_archiver_storage( + **self.config['archiver_storage']) self.objstorages = { storage['host']: get_objstorage(storage['cls'], storage['args']) for storage in self.config.get('storages', []) } self.set_objstorages = set(self.objstorages) def run(self): """Do the task expected from the archiver worker. Process the contents in self.batch, ensure that the elements still need an archival (using archiver db), and spawn copiers to copy files in each destination according to the archiver-worker's policy. """ transfers = defaultdict(list) for obj_id in self.batch: # Get dict {'missing': [servers], 'present': [servers]} # for contents ignoring those who don't need archival. 
def compute_copies(self, set_objstorages, content_id):
    """Sort the known objstorages by copy status for one content.

    Archives reported by the archiver db that are not part of
    `set_objstorages` are ignored.

    Args:
        set_objstorages (set): objstorage's id name
        content_id: the content concerned

    Returns:
        None when the archiver db has no entry for the content,
        otherwise a dictionary with the following keys:
        - 'present': set of archives where the content is present
        - 'missing': set of archives where the content is neither
          present nor ongoing
        - 'ongoing': dict mapping the archive id of each ongoing copy
          to the value recorded for it in the archiver db.

    """
    row = self.archiver_db.content_archive_get(content_id)
    if not row:
        return None

    _content_id, present, ongoing = row
    known_present = set_objstorages & set(present)
    known_ongoing = set_objstorages & set(ongoing)
    # Whatever is neither present nor ongoing is missing
    known_missing = set_objstorages - (known_present | known_ongoing)
    ongoing_details = {
        archive: ongoing[archive]
        for archive in ongoing
        if archive in known_ongoing
    }
    return {
        'present': known_present,
        'missing': known_missing,
        'ongoing': ongoing_details,
    }
def get_contents_error(self, content_ids, source_storage):
    """Indicates what is the error associated to a content when needed

    Check the given content on the given storage. If an error is
    detected, it will be reported through the returned dict.

    Args:
        content_ids ([sha1]): list of content ids to check
        source_storage (str): the source storage holding the
            contents to check.

    Returns:
        a dict that map {content_id -> error_status} for each content_id
        with an error. The `error_status` result may be 'missing' or
        'corrupted'.

    """
    content_status = {}
    storage = self.objstorages[source_storage]
    for content_id in content_ids:
        try:
            storage.check(content_id)
        except ObjNotFoundError:
            # Handled before the generic `Error`: except clauses are
            # tried in order, so if ObjNotFoundError derives from Error
            # (it appears to in swh.objstorage.exc -- to be confirmed),
            # the generic handler would catch it first and report a
            # missing object as 'corrupted'.
            content_status[content_id] = 'missing'
            logger.error('%s missing!' % hashutil.hash_to_hex(content_id))
        except Error:
            content_status[content_id] = 'corrupted'
            logger.error('%s corrupted!' % hashutil.hash_to_hex(
                content_id))

    return content_status
def choose_backup_servers(self, present, missing):
    """Choose and yield the required amount of couple source/destination

    For each required copy, choose a unique destination server
    among the missing copies and a source server among the
    presents.

    Each destination server is unique; the source server may be used
    multiple times. The number of copies is
    `retention_policy - len(present)`, bounded by the number of
    available destinations: when fewer servers are missing than
    copies are required, as many copies as possible are scheduled
    instead of failing. Nothing is yielded when the retention policy
    is already satisfied or when there is no source to copy from.

    Args:
        present: set of objstorage source name where the content
            is present
        missing: set of objstorage destination name where the
            content is missing

    Yields:
        tuple (source, destination) for each required copy.

    """
    # Transform from set to list to allow random selections
    missing = list(missing)
    present = list(present)
    if not present:
        # No source to copy from
        return

    nb_required = self.retention_policy - len(present)
    # random.sample raises ValueError for a negative sample size or
    # one larger than the population, so clamp to [0, len(missing)].
    nb_copies = max(0, min(nb_required, len(missing)))
    destinations = random.sample(missing, nb_copies)
    sources = [random.choice(present) for dest in destinations]
    yield from zip(sources, destinations)
""" yield (random.choice(list(present)), self.destination) def copy_finished(self, content_ids): """Once the copy is finished, we'll send those batch of contents as done in the destination queue. """ groups = [] for ids in utils.grouper(content_ids, self.batch_size): sig_ids = self.task_destination.s(list(ids)) groups.append(sig_ids) group(groups).delay() diff --git a/swh/storage/tests/test_archiver.py b/swh/storage/tests/test_archiver.py index 95a262d74..757ed30d1 100644 --- a/swh/storage/tests/test_archiver.py +++ b/swh/storage/tests/test_archiver.py @@ -1,315 +1,325 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import shutil import unittest import os import time import json from nose.tools import istest from nose.plugins.attrib import attr from swh.core import hashutil from swh.core.tests.db_testing import DbsTestFixture from server_testing import ServerTestFixture from swh.storage.archiver import ArchiverWithRetentionPolicyDirector from swh.storage.archiver import ArchiverWithRetentionPolicyWorker from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.objstorage.api.server import app TEST_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DATA_DIR = os.path.join(TEST_DIR, '../../../../swh-storage-testdata') @attr('db') class TestArchiver(DbsTestFixture, ServerTestFixture, unittest.TestCase): """ Test the objstorage archiver. 
""" TEST_DB_NAMES = [ 'softwareheritage-archiver-test', ] TEST_DB_DUMPS = [ os.path.join(TEST_DATA_DIR, 'dumps/swh-archiver.dump'), ] TEST_DB_DUMP_TYPES = [ 'pg_dump', ] def setUp(self): # Launch the backup server self.dest_root = tempfile.mkdtemp(prefix='remote') self.config = { 'cls': 'pathslicing', 'args': { 'root': self.dest_root, 'slicing': '0:2/2:4/4:6', } } self.app = app super().setUp() # Retrieve connection (depends on the order in TEST_DB_NAMES) self.conn = self.conns[0] # archiver db's connection self.cursor = self.cursors[0] # Create source storage self.src_root = tempfile.mkdtemp() src_config = { 'cls': 'pathslicing', 'args': { 'root': self.src_root, 'slicing': '0:2/2:4/4:6' } } self.src_storage = get_objstorage(**src_config) # Create destination storage dest_config = { 'cls': 'remote', 'args': { 'url': self.url() } } self.dest_storage = get_objstorage(**dest_config) # Keep mapped the id to the storages self.storages = { 'uffizi': self.src_storage, 'banco': self.dest_storage } # Override configurations src_archiver_conf = {'host': 'uffizi'} dest_archiver_conf = {'host': 'banco'} src_archiver_conf.update(src_config) dest_archiver_conf.update(dest_config) self.archiver_storages = [src_archiver_conf, dest_archiver_conf] self._override_director_config() self._override_worker_config() # Create the base archiver self.archiver = self._create_director() def tearDown(self): self.empty_tables() shutil.rmtree(self.src_root) shutil.rmtree(self.dest_root) super().tearDown() def empty_tables(self): # Remove all content self.cursor.execute('DELETE FROM content_archive') self.conn.commit() def _override_director_config(self, retention_policy=2): """ Override the default config of the Archiver director to allow the tests to use the *-test db instead of the default one as there is no configuration file for now. 
""" ArchiverWithRetentionPolicyDirector.parse_config_file = lambda obj, additional_configs: { # noqa - 'dbconn': self.conn, + 'archiver_storage': { + 'cls': 'db', + 'args': { + 'dbconn': self.conn, + }, + }, 'batch_max_size': 5000, 'archival_max_age': 3600, 'retention_policy': retention_policy, 'asynchronous': False, } def _override_worker_config(self): """ Override the default config of the Archiver worker to allow the tests to use the *-test db instead of the default one as there is no configuration file for now. """ ArchiverWithRetentionPolicyWorker.parse_config_file = lambda obj, additional_configs: { # noqa 'retention_policy': 2, 'archival_max_age': 3600, - 'dbconn': self.conn, + 'archiver_storage': { + 'cls': 'db', + 'args': { + 'dbconn': self.conn, + }, + }, 'storages': self.archiver_storages, 'source': 'uffizi', } def _create_director(self): return ArchiverWithRetentionPolicyDirector() def _create_worker(self, batch={}): return ArchiverWithRetentionPolicyWorker(batch) def _add_content(self, storage_name, content_data): """ Add really a content to the given objstorage This put an empty status for the added content. Args: storage_name: the concerned storage content_data: the data to insert with_row_insert: to insert a row entry in the db or not """ # Add the content to the storage obj_id = self.storages[storage_name].add(content_data) db_obj_id = r'\x' + hashutil.hash_to_hex(obj_id) self.cursor.execute(""" INSERT INTO content_archive VALUES('%s', '{}') """ % (db_obj_id)) return obj_id def _update_status(self, obj_id, storage_name, status, date=None): """ Update the db status for the given id/storage_name. This does not create the content in the storage. 
def _add_dated_content(self, obj_id, copies=None):
    """ Fully erase the previous copies field for the given content id

    This does not alter the contents into the objstorages.

    Args:
        obj_id: id of the content whose row is rewritten
        copies (dict): new value of the copies field, serialized to
            json; defaults to an empty dict.

    """
    if copies is None:
        copies = {}

    db_obj_id = r'\x' + hashutil.hash_to_hex(obj_id)
    # The statement is `UPDATE <table> SET ...`; the former
    # `UPDATE TABLE content_archive` is not valid SQL.
    self.cursor.execute("""
        UPDATE content_archive
        SET copies='%s'
        WHERE content_id='%s'
        """ % (json.dumps(copies), db_obj_id))
director.run() with self.assertRaises(ObjNotFoundError): self.dest_storage.get(obj_id) # Unit tests for archive worker def archival_elapsed(self, mtime): return self._create_worker()._is_archival_delay_elapsed(mtime) @istest def vstatus_ongoing_remaining(self): self.assertFalse(self.archival_elapsed(time.time())) @istest def vstatus_ongoing_elapsed(self): past_time = ( time.time() - self._create_worker().archival_max_age ) self.assertTrue(self.archival_elapsed(past_time)) def _status(self, status, mtime=None): """ Get a dict that match the copies structure """ return {'status': status, 'mtime': mtime or time.time()} @istest def need_archival_missing(self): """ A content should need archival when it is missing. """ status_copies = {'present': ['uffizi'], 'missing': ['banco']} worker = self._create_worker() self.assertEqual(worker.need_archival(status_copies), True) @istest def need_archival_present(self): """ A content present everywhere shouldn't need archival """ status_copies = {'present': ['uffizi', 'banco']} worker = self._create_worker() self.assertEqual(worker.need_archival(status_copies), False) def _compute_copies_status(self, status): """ A content with a given status should be detected correctly """ obj_id = self._add_content( 'banco', b'compute_copies_' + bytes(status, 'utf8')) self._update_status(obj_id, 'banco', status) worker = self._create_worker() self.assertIn('banco', worker.compute_copies( set(worker.objstorages), obj_id)[status]) @istest def compute_copies_present(self): """ A present content should be detected with correct status """ self._compute_copies_status('present') @istest def compute_copies_missing(self): """ A missing content should be detected with correct status """ self._compute_copies_status('missing') @istest def compute_copies_extra_archive(self): obj_id = self._add_content('banco', b'foobar') self._update_status(obj_id, 'banco', 'present') self._update_status(obj_id, 'random_archive', 'present') worker = self._create_worker() 
copies = worker.compute_copies(set(worker.objstorages), obj_id) self.assertEqual(copies['present'], {'banco'}) self.assertEqual(copies['missing'], {'uffizi'}) def _get_backups(self, present, missing): """ Return a list of the pair src/dest from the present and missing """ worker = self._create_worker() return list(worker.choose_backup_servers(present, missing)) @istest def choose_backup_servers(self): self.assertEqual(len(self._get_backups(['uffizi', 'banco'], [])), 0) self.assertEqual(len(self._get_backups(['uffizi'], ['banco'])), 1) # Even with more possible destinations, do not take more than the # retention_policy require self.assertEqual( len(self._get_backups(['uffizi'], ['banco', 's3'])), 1 )