diff --git a/sql/archiver/upgrades/002.sql b/sql/archiver/upgrades/002.sql new file mode 100644 --- /dev/null +++ b/sql/archiver/upgrades/002.sql @@ -0,0 +1,9 @@ +-- SWH DB schema upgrade +-- from_version: 1 +-- to_version: 2 +-- description: Add a 'corrupted' value to the archive_status type + +INSERT INTO dbversion(version, release, description) +VALUES(2, now(), 'Add a corrupted value to the archive_status type'); + +ALTER TYPE archive_status ADD VALUE 'corrupted'; diff --git a/swh/storage/archiver/copier.py b/swh/storage/archiver/copier.py --- a/swh/storage/archiver/copier.py +++ b/swh/storage/archiver/copier.py @@ -1,39 +1,25 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.core import hashutil -from swh.objstorage.api.client import RemoteObjStorage - class ArchiverCopier(): """ This archiver copy some files into a remote objstorage in order to get a backup. - - Attributes: - content_ids: A list of sha1's that represents the content this copier - has to archive. - server (RemoteArchive): The remote object storage that is used to - backup content. - master_objstorage (ObjStorage): The master storage that contains the - data the copier needs to archive. """ - def __init__(self, destination, content, master_objstorage): + def __init__(self, source, destination, content_ids): """ Create a Copier for the archiver Args: - destination: A tuple (archive_name, archive_url) that represents a - remote object storage as in the 'archive' table. - content: A list of sha1 that represents the content this copier - have to archive. - master_storage (Storage): The master storage of the system that - contains the data to archive. + source (ObjStorage): Source storage to get the contents from. + destination (ObjStorage): Storage where the contents will + be copied. + content_ids: list of content ids to archive. """ - _name, self.url = destination - self.content_ids = content - self.server = RemoteObjStorage(self.url) - self.master_objstorage = master_objstorage + self.source = source + self.destination = destination + self.content_ids = content_ids def run(self): """ Do the copy on the backup storage. @@ -47,12 +33,10 @@ Returns: A boolean that indicates if the whole content have been copied. """ - self.content_ids = map(lambda x: hashutil.hex_to_hash(x[2:]), - self.content_ids) try: for content_id in self.content_ids: - content = self.master_objstorage.get(content_id) - self.server.content_add(content, content_id) + content = self.source.get(content_id) + self.destination.add(content, content_id) except: return False return True diff --git a/swh/storage/archiver/director.py b/swh/storage/archiver/director.py --- a/swh/storage/archiver/director.py +++ b/swh/storage/archiver/director.py @@ -3,13 +3,9 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging import click -import time -from swh.core import hashutil, config -from swh.objstorage import PathSlicingObjStorage -from swh.objstorage.api.client import RemoteObjStorage +from swh.core import config from swh.scheduler.celery_backend.config import app from .
import tasks # NOQA @@ -17,22 +13,27 @@ DEFAULT_CONFIG = { - 'objstorage_type': ('str', 'local_storage'), - 'objstorage_path': ('str', '/tmp/swh-storage/objects'), - 'objstorage_slicing': ('str', '0:2/2:4/4:6'), - 'objstorage_url': ('str', 'http://localhost:5003/'), - 'batch_max_size': ('int', 50), 'archival_max_age': ('int', 3600), 'retention_policy': ('int', 2), 'asynchronous': ('bool', True), - 'dbconn': ('str', 'dbname=softwareheritage-archiver-dev user=guest') + 'dbconn': ('str', 'dbname=softwareheritage-archiver-dev user=guest'), + + 'storages': ('json', + [ + {'host': 'uffizi', + 'cls': 'pathslicing', + 'args': {'root': '/tmp/softwareheritage/objects', + 'slicing': '0:2/2:4/4:6'}}, + {'host': 'banco', + 'cls': 'remote', + 'args': {'base_url': 'http://banco:5003/'}} + ]) } -task_name = 'swh.storage.archiver.tasks.SWHArchiverTask' -logger = logging.getLogger() +task_name = 'swh.storage.archiver.tasks.SWHArchiverTask' class ArchiverDirector(): @@ -41,36 +42,6 @@ The archiver director processes the files in the local storage in order to know which one needs archival and it delegates this task to archiver workers. - Attributes: - master_objstorage: the local storage of the master server. - master_objstorage_args (dict): arguments of the master objstorage - initialization. - - archiver_storage: a wrapper for archiver db operations. - db_conn_archiver: Either a libpq connection string, - or a psycopg2 connection for the archiver db. - - slave_objstorages: Iterable of remote obj storages to the slaves - servers used for backup. - config: Archiver_configuration. A dictionary that must contain - the following keys: - - objstorage_type (str): type of objstorage used (local_storage - or remote_storage). - If the storage is local, the arguments keys must be present - objstorage_path (str): master's objstorage path - objstorage_slicing (str): masters's objstorage slicing - Otherwise, if it's a remote objstorage, the keys must be: - objstorage_url (str): url of the remote objstorage - - batch_max_size (int): The number of content items that can be - given to the same archiver worker. - archival_max_age (int): Delay given to the worker to copy all - the files in a given batch. - retention_policy (int): Required number of copies for the - content to be considered safe. - asynchronous (boolean): Indicate whenever the archival should - run in asynchronous mode or not. """ def __init__(self, db_conn_archiver, config): @@ -79,61 +50,14 @@ Args: db_conn_archiver: Either a libpq connection string, or a psycopg2 connection for the archiver db. - config: Archiver_configuration. A dictionary that must contain - the following keys: - - objstorage_type (str): type of objstorage used - (local_objstorage or remote_objstorage). - If the storage is local, the arguments keys must be present - objstorage_path (str): master's objstorage path - objstorage_slicing (str): masters's objstorage slicing - Otherwise, if it's a remote objstorage, the keys must be: - objstorage_url (str): url of the remote objstorage - - batch_max_size (int): The number of content items that can be - given to the same archiver worker. - archival_max_age (int): Delay given to the worker to copy all - the files in a given batch. - retention_policy (int): Required number of copies for the - content to be considered safe. - asynchronous (boolean): Indicate whenever the archival should - run in asynchronous mode or not. + config: Archiver configuration. A dictionary that must contain + all required data. See DEFAULT_CONFIG for structure. 
""" - # Get the slave storages + if len(config['storages']) < config['retention_policy']: + raise ValueError('Retention policy is too high for the number of ' + 'provided servers') self.db_conn_archiver = db_conn_archiver self.archiver_storage = ArchiverStorage(db_conn_archiver) - self.slave_objstorages = { - id: url - for id, url - in self.archiver_storage.archive_ls() - } - # Check that there is enough backup servers for the retention policy - if config['retention_policy'] > len(self.slave_objstorages) + 1: - raise ValueError( - "Can't have a retention policy of %d with %d backup servers" - % (config['retention_policy'], len(self.slave_objstorages)) - ) - - # Get the master storage that contains content to be archived - if config['objstorage_type'] == 'local_objstorage': - master_objstorage_args = { - 'root': config['objstorage_path'], - 'slicing': config['objstorage_slicing'] - } - master_objstorage = PathSlicingObjStorage( - **master_objstorage_args - ) - elif config['objstorage_type'] == 'remote_objstorage': - master_objstorage_args = {'base_url': config['objstorage_url']} - master_objstorage = RemoteObjStorage(**master_objstorage_args) - else: - raise ValueError( - 'Unknow objstorage class `%s`' % config['objstorage_type'] - ) - self.master_objstorage = master_objstorage - self.master_objstorage_args = master_objstorage_args - - # Keep the full configuration self.config = config def run(self): @@ -147,131 +71,80 @@ else: run_fn = self.run_sync_worker - for batch in self.get_unarchived_content(): + for batch in self.get_unarchived_content_batch(): run_fn(batch) + def _worker_args(self, batch): + """ Generates a dict that contains the arguments for a worker. + """ + return { + 'batch': batch, + 'archival_policy': { + 'retention_policy': self.config['retention_policy'], + 'archival_max_age': self.config['archival_max_age'] + }, + 'dbconn': self.db_conn_archiver, + 'storages': self.config['storages'] + } + def run_async_worker(self, batch): """ Produce a worker that will be added to the task queue. """ task = app.tasks[task_name] - task.delay(batch, - archiver_args=self.db_conn_archiver, - master_objstorage_args=self.master_objstorage_args, - slave_objstorages=self.slave_objstorages, - config=self.config) + task.delay(**self._worker_args(batch)) def run_sync_worker(self, batch): """ Run synchronously a worker on the given batch. """ task = app.tasks[task_name] - task(batch, - archiver_args=self.db_conn_archiver, - master_objstorage_args=self.master_objstorage_args, - slave_objstorages=self.slave_objstorages, - config=self.config) + task(**self._worker_args(batch)) - def get_unarchived_content(self): - """ Get contents that need to be archived. + def get_unarchived_content_batch(self): + """ Create batch of contents that needs to be archived Yields: - A batch of contents. Batches are dictionaries which associates - a content id to the data about servers that contains it or not. - - {'id1': - {'present': [('slave1', 'slave1_url')], - 'missing': [('slave2', 'slave2_url'), - ('slave3', 'slave3_url')] - }, - 'id2': - {'present': [], - 'missing': [ - ('slave1', 'slave1_url'), - ('slave2', 'slave2_url'), - ('slave3', 'slave3_url') - ]} - } - - Where keys (idX) are sha1 of the content and (slaveX, slaveX_url) - are ids and urls of the storage slaves. - - At least all the content that don't have enough copies on the - backups servers are distributed into these batches. + batch of sha1 that corresponds to contents that needs more archive + copies. 
""" - contents = {} - # Get the archives - archives = dict(self.archiver_storage.archive_ls()) - # Get all the contents referenced into the archiver tables - last_object = b'' - while True: - archived_contents = list( - self.archiver_storage.content_archive_get_copies(last_object)) - - if not archived_contents: - break - - for content_id, present, ongoing in archived_contents: - last_object = content_id - data = { - 'present': set(present), - 'missing': set(archives) - set(present) - set(ongoing), - } - - for archive_id, mtime in ongoing.items(): - status = self.get_virtual_status('ongoing', mtime) - data[status].add(archive_id) - - if not data['missing']: - continue - - contents[r'\x%s' % hashutil.hash_to_hex(content_id)] = { - k: [(archive_id, archives[archive_id]) for archive_id in v] - for k, v in data.items() - } - - if len(contents) >= self.config['batch_max_size']: - yield contents - contents = {} - + contents = [] + for content in self._get_unarchived_content(): + contents.append(content) + if len(contents) > self.config['batch_max_size']: + yield contents + contents = [] if len(contents) > 0: yield contents - def get_virtual_status(self, status, mtime): - """ Compute the virtual presence of a content. - - If the status is ongoing but the time is not elasped, the archiver - consider it will be present in the futur, and so consider it as - present. - However, if the time is elasped, the copy may have failed, so consider - the content as missing. + def _get_unarchived_content(self): + """ Get all the content ids in the db that needs more copies - Arguments: - status (string): One of ('present', 'missing', 'ongoing'). The - status of the content. - mtime (datetime): Time at which the content have been updated for - the last time. + Yields: + sha1 of contents that needs to be archived. + """ + for content_id, present, _ongoing in self._get_all_contents(): + if len(present) < self.config['retention_policy']: + yield content_id + else: + continue - Returns: - The virtual status of the studied content, which is 'present' or - 'missing'. + def _get_all_contents(self): + """ Get batchs from the archiver db and yield it as continous stream - Raises: - ValueError: if the status is not one 'present', 'missing' - or 'ongoing' + Yields: + Datas about a content as a tuple + (content_id, present_copies, ongoing_copies) where ongoing_copies + is a dict mapping copy to mtime. """ - if status in ('present', 'missing'): - return status - - # If the status is 'ongoing' but there is still time, another worker - # may still be on the task. - if status == 'ongoing': - elapsed = int(time.time()) - mtime - if elapsed <= self.config['archival_max_age']: - return 'present' - else: - return 'missing' - else: - raise ValueError("status must be either 'present', 'missing' " - "or 'ongoing'") + last_object = b'' + while True: + archiver_contents = list( + self.archiver_storage.content_archive_get_copies(last_object) + ) + if not archiver_contents: + return + for content in archiver_contents: + last_object = content[0] + yield content @click.command() @@ -291,7 +164,6 @@ conf.update(cl_config) # Create connection data and run the archiver. 
archiver = ArchiverDirector(conf['dbconn'], conf) - logger.info("Starting an archival at", time.time()) archiver.run() diff --git a/swh/storage/archiver/storage.py b/swh/storage/archiver/storage.py --- a/swh/storage/archiver/storage.py +++ b/swh/storage/archiver/storage.py @@ -38,21 +38,19 @@ yield from self.db.archive_ls(cur) @db_transaction_generator - def content_archive_get(self, content=None, cur=None): - """ Get the archival status of a content in a specific server. + def content_archive_get(self, content_id, cur=None): + """ Get the archival status of a content. - Retreive from the database the archival status of the given content - in the given archive server. + Retrieve from the database the archival status of the given content Args: - content: the sha1 of the content. May be None for any id. - archive: the database id of the server we're looking into - may be None for any server. + content_id: the sha1 of the content Yields: - A tuple (content_id, server_id, archival status, mtime, tzinfo). + A tuple (content_id, present_copies, ongoing_copies), where + ongoing_copies is a dict mapping copy to mtime. """ - yield from self.db.content_archive_get(content, cur) + return self.db.content_archive_get(content_id, cur) @db_transaction_generator def content_archive_get_copies(self, previous_content=None, limit=1000, @@ -77,7 +75,7 @@ @db_transaction def content_archive_update(self, content_id, archive_id, new_status=None, cur=None): - """ Update the status of an archive content and set its mtime to + """ Update the status of an archive content and set its mtime to now Change the mtime of an archived content for the given archive and set it's mtime to the current time. diff --git a/swh/storage/archiver/tasks.py b/swh/storage/archiver/tasks.py --- a/swh/storage/archiver/tasks.py +++ b/swh/storage/archiver/tasks.py @@ -12,9 +12,6 @@ """ task_queue = 'swh_storage_archive_worker' - def run(self, batch, archiver_args, master_objstorage_args, - slave_objstorages, config): - aw = ArchiverWorker(batch, archiver_args, master_objstorage_args, - slave_objstorages, config) - if aw.run(): - self.log("Successful backup for a batch of size %s" % len(batch)) + def run(self, batch, storages, dbconn, archival_policy): + aw = ArchiverWorker(batch, storages, dbconn, archival_policy) + aw.run() diff --git a/swh/storage/archiver/worker.py b/swh/storage/archiver/worker.py --- a/swh/storage/archiver/worker.py +++ b/swh/storage/archiver/worker.py @@ -7,14 +7,17 @@ import logging import time -from swh.objstorage import PathSlicingObjStorage -from swh.objstorage.api.client import RemoteObjStorage +from collections import defaultdict + +from swh.core import hashutil +from swh.objstorage import get_objstorage +from swh.objstorage.exc import Error, ObjNotFoundError from .storage import ArchiverStorage from .copier import ArchiverCopier -logger = logging.getLogger() +logger = logging.getLogger('archiver.worker') class ArchiverWorker(): @@ -22,230 +25,222 @@ Process the content of a content batch in order to do the needed backups on the slaves servers. - - Attributes: - batch: The content this worker has to archive, which is a dictionary - that associates a content's sha1 id to the list of servers where - the content is present or missing - (see ArchiverDirector::get_unarchived_content). - master_objstorage_args: The connection argument to initialize the - master storage with the db connection url & the object storage - path. 
- slave_objstorages: A map that associates server_id to the remote server - config: Archiver_configuration. A dictionary that must contains - the following keys. - objstorage_path (string): the path of the objstorage of the - master. - batch_max_size (int): The number of content items that can be - given to the same archiver worker. - archival_max_age (int): Delay given to the worker to copy all - the files in a given batch. - retention_policy (int): Required number of copies for the - content to be considered safe. - asynchronous (boolean): Indicate whenever the archival should - run in asynchronous mode or not. """ - def __init__(self, batch, archiver_args, master_objstorage_args, - slave_objstorages, config): + def __init__(self, batch, storages, dbconn, archival_policy): """ Constructor of the ArchiverWorker class. - - Args: - batch: A batch of content, which is a dictionary that associates - a content's sha1 id to the list of servers where the content - is present. - archiver_args: The archiver's arguments to establish connection to - db. - master_objstorage_args: The master storage arguments. - slave_objstorages: A map that associates server_id to the remote - server. - config: Archiver_configuration. A dictionary that must contains - the following keys. - objstorage_path (string): the path of the objstorage of the - master. - batch_max_size (int): The number of content items that can be - given to the same archiver worker. - archival_max_age (int): Delay given to the worker to copy all - the files in a given batch. - retention_policy (int): Required number of copies for the - content to be considered safe. - asynchronous (boolean): Indicate whenever the archival should - run in asynchronous mode or not. """ self.batch = batch - self.archiver_storage = ArchiverStorage(archiver_args) - self.slave_objstorages = slave_objstorages - self.config = config + self.archival_policy = archival_policy - if config['objstorage_type'] == 'local_objstorage': - master_objstorage = PathSlicingObjStorage(**master_objstorage_args) - else: - master_objstorage = RemoteObjStorage(**master_objstorage_args) - self.master_objstorage = master_objstorage - - def _choose_backup_servers(self, allowed_storage, backup_number): - """ Choose the slave servers for archival. + self.archiver_db = ArchiverStorage(dbconn) + self.objstorages = { + storage['host']: get_objstorage(storage['cls'], storage['args']) + for storage in storages + } - Choose the given amount of servers among those which don't already - contain a copy of the content. + def run(self): + """ Do the task expected from the archiver worker. - Args: - allowed_storage: servers when the content is not already present. - backup_number (int): The number of servers we have to choose in - order to fullfill the objective. + Process the content in the batch, ensure that the elements still need + an archival, and spawn copiers to copy files in each destinations. """ - # In case there is not enough backup servers to get all the backups - # we need, just do our best. - # Such situation should not happen. - backup_number = min(backup_number, len(allowed_storage)) - - # TODO Find a better (or a good) policy to choose the backup servers. - # The random choice should be equivalently distributed between - # servers for a great amount of data, but don't take care of servers - # capacities. - return random.sample(allowed_storage, backup_number) - - def _get_archival_status(self, content_id, server): - """ Get the archival status of the required content. 
- - Attributes: - content_id (string): Sha1 of the content. - server: Tuple (archive_id, archive_url) of the archive server. + # defaultdict: accessing d[key] for a non-existent key automatically + # creates the default value (here an empty list). + transfers = defaultdict(list) + for obj_id in self.batch: + # Get dict {'missing': [servers], 'present': [servers]} + # for the content, ignoring those that don't need archival. + copies = self._compute_copies(obj_id) + if not self._need_archival(copies): + continue + present = copies.get('present', []) + missing = copies.get('missing', []) + if len(present) == 0: + logger.critical('Content %s has been lost' % obj_id) + continue + # Randomly choose some servers to be used as sources and destinations. + for src_dest in self._choose_backup_servers(present, missing): + transfers[src_dest].append(obj_id) + + # Then run copiers for each of the required transfers. + for (src, dest), content_ids in transfers.items(): + self.run_copier(self.objstorages[src], + self.objstorages[dest], content_ids) + + def _compute_copies(self, content_id): + """ From a content_id, return present and missing copies. + Returns: - A dictionary that contains all the required data : 'content_id', - 'archive_id', 'status', and 'mtime' + A dictionary with keys 'present' and 'missing' that map to the + sets of copy ids on which the content is present or missing. """ - archive = server[0] - t, = list( - self.archiver_storage.content_archive_get(content_id) + copies = self.archiver_db.content_archive_get(content_id) + _, present, ongoing = copies + # Initialize the archival status with all known present copies + content_data = {'present': set(present), 'missing': set()} + # Add data about the ongoing items + for copy, mtime in ongoing.items(): + content_data[ + self._get_virtual_status('ongoing', mtime) + ].add(copy) + # Add the servers that were not in the db; they are missing. + content_data['missing'].update( + set(self.objstorages.keys()) - set(content_data['present']) ) - status_for_archive = t[1].get(archive, {}) - return { - 'content_id': content_id, - 'archive_id': archive, - 'status': status_for_archive.get('status', 'missing'), - 'mtime': status_for_archive.get('mtime', 0), - } + return content_data - def _content_archive_update(self, content_id, archive_id, - new_status=None): - """ Update the status of a archive content and set its mtime to now. + def _get_virtual_status(self, status, mtime): + """ Compute the virtual presence of a content. - Change the last modification time of an archived content and change - its status to the given one. + If the status is ongoing but the delay has not elapsed, the archiver + assumes the copy will be present in the future, and so considers it + as present. + However, if the delay has elapsed, the copy may have failed, so the + content is considered missing. - Args: - content_id (string): The content id. - archive_id (string): The id of the concerned archive. - new_status (string): One of missing, ongoing or present, this - status will replace the previous one. If not given, the - function only changes the mtime of the content. - """ - self.archiver_storage.content_archive_update( - content_id, - archive_id, - new_status - ) + Arguments: + status (string): One of ('present', 'missing', 'ongoing'). The + status of the content. + mtime (timestamp): Time at which the content was last updated. - def need_archival(self, content, destination): - """ Indicates whenever a content need archivage.
+ Returns: + The virtual status of the studied content, which is 'present' or + 'missing'. - Filter function that returns True if a given content - still require to be archived. + Raises: + ValueError: if the status is not one of 'present', 'missing' + or 'ongoing' + """ + if status in ('present', 'missing'): + return status + + # If the status is 'ongoing' but there is still time, another worker + # may still be on the task. + if status == 'ongoing': + elapsed = time.time() - mtime + if elapsed <= self.archival_policy['archival_max_age']: + return 'present' + else: + return 'missing' + else: + raise ValueError("status must be either 'present', 'missing' " + "or 'ongoing'") + + def _need_archival(self, content_data): + """ Indicate if the content needs to be archived. Args: - content (str): Sha1 of a content. - destination: Tuple (archive id, archive url). + content_data (dict): dict that contains two lists, 'present' and + 'missing', with the copy ids that have the corresponding status. + Returns: True if there are not enough copies, False otherwise. """ - archival_status = self._get_archival_status( - content, - destination - ) - status = archival_status['status'] - mtime = archival_status['mtime'] - # If the archive is already present, no need to backup. - if status == 'present': - return False - # If the content is ongoing but still have time, there is - # another worker working on this content. - elif status == 'ongoing': - elapsed = int(time.time()) - mtime - if elapsed <= self.config['archival_max_age']: - return False - return True - - def sort_content_by_archive(self): - """ Create a map {archive_server -> list of content) - - Create a mapping that associate to a archive server all the - contents that needs to be archived in it by the current worker. - - The map is in the form of : - { - (archive_1, archive_1_url): [content1, content2, content_3] - (archive_2, archive_2_url): [content1, content3] - } + nb_present = len(content_data.get('present', [])) + retention_policy = self.archival_policy['retention_policy'] + return nb_present < retention_policy - Returns: - The created mapping. - """ - slaves_copy = {} - for content_id in self.batch: - # Choose some servers to upload the content among the missing ones. - server_data = self.batch[content_id] - nb_present = len(server_data['present']) - nb_backup = self.config['retention_policy'] - nb_present - backup_servers = self._choose_backup_servers( - server_data['missing'], - nb_backup - ) - # Fill the map destination -> content to upload - for server in backup_servers: - slaves_copy.setdefault(server, []).append(content_id) - return slaves_copy + def _choose_backup_servers(self, present, missing): + """ Choose and yield the required number of (source, destination) couples - def run(self): - """ Do the task expected from the archiver worker. + For each required copy, choose a unique destination server among the + missing copies and a source server among the present ones. - Process the content in the batch, ensure that the elements still need - an archival, and spawn copiers to copy files in each destinations. - """ - # Get a map (archive -> [contents]) - slaves_copy = self.sort_content_by_archive() - - # At this point, re-check the archival status in order to know if the - # job have been done by another worker. - for destination in slaves_copy: - # list() is needed because filter's result will be consumed twice.
- slaves_copy[destination] = list(filter( - lambda content_id: self.need_archival(content_id, destination), - slaves_copy[destination] - )) - for content_id in slaves_copy[destination]: - self._content_archive_update(content_id, destination[0], - new_status='ongoing') - - # Spawn a copier for each destination - for destination in slaves_copy: - try: - self.run_copier(destination, slaves_copy[destination]) - except: - logger.error('Unable to copy a batch to %s' % destination) + Each destination server is unique, so after archival the retention + policy requirement will be fulfilled. However, the source server may be + used multiple times. - def run_copier(self, destination, contents): + Yields: + tuple (source, destination) for each required copy. + """ + # Transform from set to list to allow random selections + missing = list(missing) + present = list(present) + nb_required = self.archival_policy['retention_policy'] - len(present) + destinations = random.sample(missing, nb_required) + sources = [random.choice(present) for dest in destinations] + yield from zip(sources, destinations) + + def run_copier(self, source, destination, content_ids): """ Run a copier in order to archive the given contents - Upload the given contents to the given archive. + Upload the given contents from the source to the destination. If the process fail, the whole content is considered uncopied and remains 'ongoing', waiting to be rescheduled as there is a delay. - Attributes: - destination: Tuple (archive_id, archive_url) of the destination. - contents: List of contents to archive. + Args: + source (ObjStorage): Source storage to get the contents from. + destination (ObjStorage): Storage where the contents will be copied. + content_ids: list of content ids to archive. """ - ac = ArchiverCopier(destination, contents, self.master_objstorage) + # Check if there is any error among the contents. + content_status = self._get_contents_error(content_ids, source) + + # Iterate over the detected errors. + for content_id, real_status in content_status.items(): + # Remove them from the to-archive list, + # as they cannot be retrieved correctly. + content_ids.remove(content_id) + # Update their status to reflect their real state. + self._content_archive_update(content_id, source, + new_status=real_status) + + # Now perform the copy on the remaining contents + ac = ArchiverCopier(source, destination, content_ids) if ac.run(): # Once the archival complete, update the database. - for content_id in contents: - self._content_archive_update(content_id, destination[0], + for content_id in content_ids: + self._content_archive_update(content_id, destination, new_status='present') + + def _get_contents_error(self, content_ids, storage): + """ Get the error status, if any, associated with the given contents + + Check the given content on the given storage. If an error is detected, + it will be reported through the returned dict. + + Args: + content_ids: a list of content ids to check + storage: the storage that holds the contents to check. + + Returns: + a dict mapping {content_id -> error_status} for each content_id + with an error. The `error_status` result may be 'missing' or + 'corrupted'.
+ """ + content_status = {} + for content_id in content_ids: + try: + storage.check(content_id) + except Error: + content_status[content_id] = 'corrupted' + logger.error('Content is corrupted: %s' % content_id) + except ObjNotFoundError: + content_status[content_id] = 'missing' + logger.error('A content referenced present is missing: %s' + % content_id) + return content_status + + def _content_archive_update(self, content_id, archive_id, + new_status=None): + """ Update the status of a archive content and set its mtime to now. + + Change the last modification time of an archived content and change + its status to the given one. + + Args: + content_id (string): The content id. + archive_id (string): The id of the concerned archive. + new_status (string): One of missing, ongoing or present, this + status will replace the previous one. If not given, the + function only changes the mtime of the content. + """ + db_obj_id = r'\x' + hashutil.hash_to_hex(content_id) + self.archiver_db.content_archive_update( + db_obj_id, + archive_id, + new_status + ) diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -660,29 +660,47 @@ """) yield from cursor_to_bytes(cur) - def content_archive_get(self, content=None, cur=None): + def content_archive_get(self, content_id, cur=None): """ Get the archival status of a content in a specific server. Retrieve from the database the archival status of the given content in the given archive server. Args: - content: the sha1 of the content. May be None for all contents. + content_id: the sha1 of the content. Yields: - A tuple (content_id, copies_json). + A tuple (content_id, present_copies, ongoing_copies), where + ongoing_copies is a dict mapping copy to mtime. + """ + query = """SELECT content_id, + array( + SELECT key + FROM jsonb_each(copies) + WHERE value->>'status' = 'present' + ORDER BY key + ) AS present, + array( + SELECT key + FROM jsonb_each(copies) + WHERE value->>'status' = 'ongoing' + ORDER BY key + ) AS ongoing, + array( + SELECT value->'mtime' + FROM jsonb_each(copies) + WHERE value->>'status' = 'ongoing' + ORDER BY key + ) AS ongoing_mtime + FROM content_archive + WHERE content_id = %s + ORDER BY content_id """ - query = """SELECT content_id, copies - FROM content_archive - """ - if content is not None: - query += "WHERE content_id='%s'" % content - else: - query += 'ORDER BY content_id' cur = self._cursor(cur) - cur.execute(query) - yield from cursor_to_bytes(cur) + cur.execute(query, (content_id,)) + content_id, present, ongoing, mtimes = cur.fetchone() + return (content_id, present, dict(zip(ongoing, mtimes))) def content_archive_get_copies(self, previous_content=None, limit=1000, cur=None): diff --git a/swh/storage/tests/test_archiver.py b/swh/storage/tests/test_archiver.py --- a/swh/storage/tests/test_archiver.py +++ b/swh/storage/tests/test_archiver.py @@ -16,10 +16,9 @@ from swh.core.tests.db_testing import DbsTestFixture from server_testing import ServerTestFixture -from swh.storage import Storage from swh.storage.archiver import ArchiverDirector, ArchiverWorker +from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError -from swh.objstorage.api.client import RemoteObjStorage from swh.objstorage.api.server import app TEST_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -33,44 +32,60 @@ """ TEST_DB_NAMES = [ - 'softwareheritage-test', 'softwareheritage-archiver-test', ] TEST_DB_DUMPS = [ - os.path.join(TEST_DATA_DIR, 'dumps/swh.dump'), os.path.join(TEST_DATA_DIR, 
'dumps/swh-archiver.dump'), ] TEST_DB_DUMP_TYPES = [ 'pg_dump', - 'pg_dump', ] def setUp(self): # Launch the backup server - self.backup_objroot = tempfile.mkdtemp(prefix='remote') + dest_root = tempfile.mkdtemp(prefix='remote') self.config = { - 'storage_base': self.backup_objroot, + 'storage_base': dest_root, 'storage_slicing': '0:2/2:4/4:6' } self.app = app super().setUp() # Retrieve connection (depends on the order in TEST_DB_NAMES) - self.conn_storage = self.conns[0] # db connection to storage - self.conn = self.conns[1] # archiver db's connection - self.cursor = self.cursors[1] - # a reader storage to check content has been archived - self.remote_objstorage = RemoteObjStorage(self.url()) - # Create the local storage. - self.objroot = tempfile.mkdtemp(prefix='local') - # a writer storage to store content before archiving - self.storage = Storage(self.conn_storage, self.objroot) + self.conn = self.conns[0] # archiver db's connection + self.cursor = self.cursors[0] + + # Create source storage + src_root = tempfile.mkdtemp() + src_config = {'cls': 'pathslicing', + 'args': {'root': src_root, + 'slicing': '0:2/2:4/4:6'}} + self.src_storage = get_objstorage(**src_config) + + # Create destination storage + dest_config = {'cls': 'remote', + 'args': {'base_url': self.url()}} + self.dest_storage = get_objstorage(**dest_config) + + # Keep mapped the id to the storages + self.storages = {'uffizi': self.src_storage, + 'banco': self.dest_storage} + + # Create the archiver itself + src_archiver_conf = {'host': 'uffizi'} + dest_archiver_conf = {'host': 'banco'} + src_archiver_conf.update(src_config) + dest_archiver_conf.update(dest_config) + self.archiver_storages = [src_archiver_conf, dest_archiver_conf] + self.archiver = self._create_director( + retention_policy=2, + storages=self.archiver_storages + ) + # Create a base worker + self.archiver_worker = self._create_worker() + # Initializes and fill the tables. self.initialize_tables() - # Create the archiver - self.archiver = self.__create_director() - - self.storage_data = ('banco', 'http://localhost:%s/' % self.port) def tearDown(self): self.empty_tables() @@ -92,97 +107,111 @@ self.cursor.execute('DELETE FROM content_archive') self.conn.commit() - def __add_content(self, content_data, status='missing', date=None): - # Add the content to the storage - content = hashutil.hashdata(content_data) - content.update({'data': content_data}) - self.storage.content_add([content]) - # Then update database - content_id = r'\x' + hashutil.hash_to_hex(content['sha1']) - copies = {'banco': { - 'status': status, - 'mtime': date or int(time.time()) # if date is None, use now() - }} - self.cursor.execute("""INSERT INTO content_archive - VALUES('%s'::sha1, '%s') - """ % (content_id, json.dumps(copies))) - return content['sha1'] - - def __get_missing(self): - self.cursor.execute("""SELECT content_id - FROM content_archive - WHERE status='missing'""") - return self.cursor.fetchall() - - def __create_director(self, batch_size=5000, archival_max_age=3600, - retention_policy=1, asynchronous=False): + def _create_director(self, storages, batch_size=5000, + archival_max_age=3600, retention_policy=2, + asynchronous=False): config = { - 'objstorage_type': 'local_objstorage', - 'objstorage_path': self.objroot, - 'objstorage_slicing': '0:2/2:4/4:6', - + 'storages': storages, 'batch_max_size': batch_size, 'archival_max_age': archival_max_age, 'retention_policy': retention_policy, - 'asynchronous': asynchronous # Avoid depending on queue for tests. 
+ 'asynchronous': asynchronous + } + return ArchiverDirector(self.conn, config) + + def _add_content(self, storage_name, content_data): + """ Add really a content to the given objstorage + + This put an empty status for the added content. + """ + # Add the content to the storage + obj_id = self.storages[storage_name].add(content_data) + db_obj_id = r'\x' + hashutil.hash_to_hex(obj_id) + self.cursor.execute(""" INSERT INTO content_archive + VALUES('%s', '{}') + """ % (db_obj_id)) + return obj_id + + def _update_status(self, obj_id, storage_name, status, date=None): + """ Update the db status for the given id/storage_name. + + This does not create the content in the storage. + """ + db_obj_id = r'\x' + hashutil.hash_to_hex(obj_id) + self.archiver.archiver_storage.content_archive_update( + db_obj_id, storage_name, status + ) + + def _add_dated_content(self, obj_id, copies={}): + """ Fully erase the previous copies field for the given content id + + This does not alter the contents into the objstorages. + """ + db_obj_id = r'\x' + hashutil.hash_to_hex(obj_id) + self.cursor.execute(""" UPDATE TABLE content_archive + SET copies='%s' + WHERE content_id='%s' + """ % (json.dumps(copies), db_obj_id)) + + def _create_worker(self, batch={}, retention_policy=2, + archival_max_age=3600): + archival_policy = { + 'retention_policy': retention_policy, + 'archival_max_age': archival_max_age } - director = ArchiverDirector(db_conn_archiver=self.conn, - config=config) - return director - - def __create_worker(self, batch={}, config={}): - mobjstorage_args = self.archiver.master_objstorage_args - if not config: - config = self.archiver.config - return ArchiverWorker(batch, - archiver_args=self.conn, - master_objstorage_args=mobjstorage_args, - slave_objstorages=[self.storage_data], - config=config) + return ArchiverWorker(batch, self.archiver_storages, + self.conn, archival_policy) # Integration test @istest def archive_missing_content(self): """ Run archiver on a missing content should archive it. """ - content_data = b'archive_missing_content' - content_id = self.__add_content(content_data) - # before, the content should not be there + obj_data = b'archive_missing_content' + obj_id = self._add_content('uffizi', obj_data) + self._update_status(obj_id, 'uffizi', 'present') + # Content is missing on banco (entry not present in the db) try: - self.remote_objstorage.get(content_id) + self.dest_storage.get(obj_id) except ObjNotFoundError: pass else: self.fail('Content should not be present before archival') self.archiver.run() # now the content should be present on remote objstorage - remote_data = self.remote_objstorage.content_get(content_id) - self.assertEquals(content_data, remote_data) + remote_data = self.dest_storage.get(obj_id) + self.assertEquals(obj_data, remote_data) @istest def archive_present_content(self): """ A content that is not 'missing' shouldn't be archived. """ - id = self.__add_content(b'archive_present_content', status='present') - # After the run, the content should NOT be in the archive.* + obj_id = self._add_content('uffizi', b'archive_present_content') + self._update_status(obj_id, 'uffizi', 'present') + self._update_status(obj_id, 'banco', 'present') + # After the run, the content should NOT be in the archive. + # As the archiver believe it was already in. self.archiver.run() with self.assertRaises(ObjNotFoundError): - self.remote_objstorage.get(id) + self.dest_storage.get(obj_id) @istest def archive_already_enough(self): """ A content missing with enough copies shouldn't be archived. 
""" - id = self.__add_content(b'archive_alread_enough') - director = self.__create_director(retention_policy=0) + obj_id = self._add_content('uffizi', b'archive_alread_enough') + self._update_status(obj_id, 'uffizi', 'present') + director = self._create_director(self.archiver_storages, + retention_policy=1) + # Obj is present in only one archive but only one copy is required. director.run() with self.assertRaises(ObjNotFoundError): - self.remote_objstorage.get(id) + self.dest_storage.get(obj_id) - # Unit test for ArchiverDirector + # Unit tests for archive worker def vstatus(self, status, mtime): - return self.archiver.get_virtual_status(status, mtime) + return self.archiver_worker._get_virtual_status(status, mtime) @istest def vstatus_present(self): @@ -201,80 +230,79 @@ @istest def vstatus_ongoing_remaining(self): self.assertEquals( - self.vstatus('ongoing', int(time.time())), + self.vstatus('ongoing', time.time()), 'present' ) @istest def vstatus_ongoing_elapsed(self): past_time = ( - int(time.time()) - self.archiver.config['archival_max_age'] - 1 + time.time() - self.archiver_worker.archival_policy[ + 'archival_max_age' + ] - 1 ) self.assertEquals( self.vstatus('ongoing', past_time), 'missing' ) - # Unit tests for archive worker + def _status(self, status, mtime=None): + """ Get a dict that match the copies structure + """ + return {'status': status, 'mtime': mtime or time.time()} @istest def need_archival_missing(self): - """ A content should still need archival when it is missing. + """ A content should need archival when it is missing. """ - id = self.__add_content(b'need_archival_missing', status='missing') - id = r'\x' + hashutil.hash_to_hex(id) - worker = self.__create_worker() - self.assertEqual(worker.need_archival(id, self.storage_data), True) + status_copies = {'present': ['uffizi'], 'missing': ['banco']} + worker = self._create_worker({}, retention_policy=2) + self.assertEqual(worker._need_archival(status_copies), + True) @istest def need_archival_present(self): - """ A content should still need archival when it is missing + """ A content present everywhere shouldn't need archival """ - id = self.__add_content(b'need_archival_missing', status='present') - id = r'\x' + hashutil.hash_to_hex(id) - worker = self.__create_worker() - self.assertEqual(worker.need_archival(id, self.storage_data), False) + status_copies = {'present': ['uffizi', 'banco']} + worker = self._create_worker({}, retention_policy=2) + self.assertEqual(worker._need_archival(status_copies), + False) - @istest - def need_archival_ongoing_remaining(self): - """ An ongoing archival with remaining time shouldnt need archival. + def _compute_copies_status(self, status): + """ A content with a given status should be detected correctly """ - id = self.__add_content(b'need_archival_ongoing_remaining', - status='ongoing') - id = r'\x' + hashutil.hash_to_hex(id) - worker = self.__create_worker() - self.assertEqual(worker.need_archival(id, self.storage_data), False) + obj_id = self._add_content( + 'banco', b'compute_copies_' + bytes(status, 'utf8')) + self._update_status(obj_id, 'banco', status) + worker = self._create_worker() + self.assertIn('banco', worker._compute_copies(obj_id)[status]) @istest - def need_archival_ongoing_elasped(self): - """ An ongoing archival with elapsed time should be scheduled again. 
+ def compute_copies_present(self): + """ A present content should be detected with correct status """ - id = self.__add_content( - b'archive_ongoing_elapsed', - status='ongoing', - date=( - int(time.time()) - self.archiver.config['archival_max_age'] - 1 - ) - ) - id = r'\x' + hashutil.hash_to_hex(id) - worker = self.__create_worker() - self.assertEqual(worker.need_archival(id, self.storage_data), True) + self._compute_copies_status('present') @istest - def content_sorting_by_archiver(self): - """ Check that the content is correctly sorted. + def compute_copies_missing(self): + """ A missing content should be detected with correct status """ - batch = { - 'id1': { - 'present': [('slave1', 'slave1_url')], - 'missing': [] - }, - 'id2': { - 'present': [], - 'missing': [('slave1', 'slave1_url')] - } - } - worker = self.__create_worker(batch=batch) - mapping = worker.sort_content_by_archive() - self.assertNotIn('id1', mapping[('slave1', 'slave1_url')]) - self.assertIn('id2', mapping[('slave1', 'slave1_url')]) + self._compute_copies_status('missing') + + def _get_backups(self, present, missing): + """ Return a list of the pair src/dest from the present and missing + """ + worker = self._create_worker() + return list(worker._choose_backup_servers(present, missing)) + + @istest + def choose_backup_servers(self): + self.assertEqual(len(self._get_backups(['uffizi', 'banco'], [])), 0) + self.assertEqual(len(self._get_backups(['uffizi'], ['banco'])), 1) + # Even with more possible destinations, do not take more than the + # retention_policy require + self.assertEqual( + len(self._get_backups(['uffizi'], ['banco', 's3'])), + 1 + )
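The sketches below are not part of the patch; they illustrate how the pieces introduced above fit together, assuming the swh.objstorage and swh.storage packages touched by this diff are installed. First, the new 'storages' configuration entry maps each named host to an objstorage, exactly as ArchiverWorker.__init__ does; the hosts, path and URL are the placeholder values from DEFAULT_CONFIG.

    # Build one objstorage per configured host, mirroring ArchiverWorker.__init__.
    from swh.objstorage import get_objstorage

    storages = [
        {'host': 'uffizi',
         'cls': 'pathslicing',
         'args': {'root': '/tmp/softwareheritage/objects',
                  'slicing': '0:2/2:4/4:6'}},
        {'host': 'banco',
         'cls': 'remote',
         'args': {'base_url': 'http://banco:5003/'}},
    ]

    objstorages = {
        storage['host']: get_objstorage(storage['cls'], storage['args'])
        for storage in storages
    }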
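The director now streams the whole content_archive table through _get_all_contents. A minimal standalone sketch of that cursor-style pagination, assuming an archiver_storage object exposing content_archive_get_copies as in storage.py:

    def stream_all_contents(archiver_storage):
        # Restart the paginated query after the last content id seen,
        # until the archiver db has nothing left to return.
        last_object = b''
        while True:
            rows = list(
                archiver_storage.content_archive_get_copies(last_object))
            if not rows:
                return
            for content_id, present, ongoing in rows:
                last_object = content_id
                yield content_id, present, ongoing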
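A worked example of the source/destination pairing performed by ArchiverWorker._choose_backup_servers: one distinct destination per missing copy, sources possibly reused. This is a self-contained restatement of the logic, not the method itself.

    import random

    def choose_backup_servers(present, missing, retention_policy):
        # One unique destination per copy still needed; sources may repeat.
        nb_required = retention_policy - len(present)
        destinations = random.sample(list(missing), nb_required)
        sources = [random.choice(list(present)) for _ in destinations]
        yield from zip(sources, destinations)

    # With retention_policy=2, a content present only on 'uffizi' yields a
    # single ('uffizi', 'banco') pair; a content already present on both
    # servers yields nothing.
    print(list(choose_backup_servers({'uffizi'}, {'banco'}, 2)))
    print(list(choose_backup_servers({'uffizi', 'banco'}, set(), 2)))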
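The 'ongoing' handling moved from the director to the worker (_get_virtual_status). The rule it implements can be summarized as follows, with archival_max_age being the 3600 second default from the configuration:

    import time

    def virtual_status(status, mtime, archival_max_age=3600):
        # An ongoing copy younger than archival_max_age is treated as present
        # (another worker is probably still on it); an older one is treated
        # as missing and will be picked up again.
        if status in ('present', 'missing'):
            return status
        if status == 'ongoing':
            return ('present' if time.time() - mtime <= archival_max_age
                    else 'missing')
        raise ValueError("status must be 'present', 'missing' or 'ongoing'")

    print(virtual_status('ongoing', time.time()))         # 'present'
    print(virtual_status('ongoing', time.time() - 7200))  # 'missing'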
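For reference, the shape of the content_archive.copies column and of the value now returned by db.content_archive_get, as exercised by the tests; the mtime values below are illustrative.

    # copies JSONB, one entry per known archive server:
    copies = {
        'uffizi': {'status': 'present', 'mtime': 1459849006},
        'banco': {'status': 'ongoing', 'mtime': 1459849061},
    }
    # content_archive_get(content_id) then returns a single tuple:
    #   (content_id, ['uffizi'], {'banco': 1459849061})
    #    sha1        present     ongoing copies mapped to their mtime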
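Finally, a sketch of how a worker drives the reworked ArchiverCopier (see run_copier): both ends are plain ObjStorage instances and the ids are raw sha1 bytes. The batch below is hypothetical.

    from swh.storage.archiver.copier import ArchiverCopier

    source = objstorages['uffizi']          # built as in the first sketch
    destination = objstorages['banco']
    content_ids = [b'...20-byte sha1...']   # hypothetical batch

    copier = ArchiverCopier(source, destination, content_ids)
    if copier.run():
        # run() returns False on any failure, leaving the whole batch
        # 'ongoing' so that it gets rescheduled later; on success the
        # worker marks each content 'present' on 'banco'.
        pass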