diff --git a/swh/storage/archiver/director.py b/swh/storage/archiver/director.py index d4e3c24a..a9f630f7 100644 --- a/swh/storage/archiver/director.py +++ b/swh/storage/archiver/director.py @@ -1,296 +1,287 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import click - -from datetime import datetime +import time from swh.core import hashutil, config from swh.objstorage import PathSlicingObjStorage from swh.objstorage.api.client import RemoteObjStorage from swh.scheduler.celery_backend.config import app from . import tasks # NOQA from .storage import ArchiverStorage DEFAULT_CONFIG = { 'objstorage_type': ('str', 'local_storage'), 'objstorage_path': ('str', '/tmp/swh-storage/objects'), 'objstorage_slicing': ('str', '0:2/2:4/4:6'), 'objstorage_url': ('str', 'http://localhost:5003/'), 'batch_max_size': ('int', 50), 'archival_max_age': ('int', 3600), 'retention_policy': ('int', 2), 'asynchronous': ('bool', True), 'dbconn': ('str', 'dbname=softwareheritage-archiver-dev user=guest') } task_name = 'swh.storage.archiver.tasks.SWHArchiverTask' logger = logging.getLogger() class ArchiverDirector(): """Process the files in order to know which one is needed as backup. The archiver director processes the files in the local storage in order to know which one needs archival and it delegates this task to archiver workers. Attributes: master_objstorage: the local storage of the master server. master_objstorage_args (dict): arguments of the master objstorage initialization. archiver_storage: a wrapper for archiver db operations. db_conn_archiver: Either a libpq connection string, or a psycopg2 connection for the archiver db. slave_objstorages: Iterable of remote obj storages to the slaves servers used for backup. config: Archiver_configuration. A dictionary that must contain the following keys: objstorage_type (str): type of objstorage used (local_storage or remote_storage). If the storage is local, the arguments keys must be present objstorage_path (str): master's objstorage path objstorage_slicing (str): masters's objstorage slicing Otherwise, if it's a remote objstorage, the keys must be: objstorage_url (str): url of the remote objstorage batch_max_size (int): The number of content items that can be given to the same archiver worker. archival_max_age (int): Delay given to the worker to copy all the files in a given batch. retention_policy (int): Required number of copies for the content to be considered safe. asynchronous (boolean): Indicate whenever the archival should run in asynchronous mode or not. """ def __init__(self, db_conn_archiver, config): """ Constructor of the archiver director. Args: db_conn_archiver: Either a libpq connection string, or a psycopg2 connection for the archiver db. config: Archiver_configuration. A dictionary that must contain the following keys: objstorage_type (str): type of objstorage used (local_objstorage or remote_objstorage). If the storage is local, the arguments keys must be present objstorage_path (str): master's objstorage path objstorage_slicing (str): masters's objstorage slicing Otherwise, if it's a remote objstorage, the keys must be: objstorage_url (str): url of the remote objstorage batch_max_size (int): The number of content items that can be given to the same archiver worker. 
archival_max_age (int): Delay given to the worker to copy all the files in a given batch. retention_policy (int): Required number of copies for the content to be considered safe. asynchronous (boolean): Indicate whenever the archival should run in asynchronous mode or not. """ # Get the slave storages self.db_conn_archiver = db_conn_archiver self.archiver_storage = ArchiverStorage(db_conn_archiver) self.slave_objstorages = { id: url for id, url in self.archiver_storage.archive_ls() } # Check that there is enough backup servers for the retention policy if config['retention_policy'] > len(self.slave_objstorages) + 1: raise ValueError( "Can't have a retention policy of %d with %d backup servers" % (config['retention_policy'], len(self.slave_objstorages)) ) # Get the master storage that contains content to be archived if config['objstorage_type'] == 'local_objstorage': master_objstorage_args = { 'root': config['objstorage_path'], 'slicing': config['objstorage_slicing'] } master_objstorage = PathSlicingObjStorage( **master_objstorage_args ) elif config['objstorage_type'] == 'remote_objstorage': master_objstorage_args = {'base_url': config['objstorage_url']} master_objstorage = RemoteObjStorage(**master_objstorage_args) else: raise ValueError( 'Unknow objstorage class `%s`' % config['objstorage_type'] ) self.master_objstorage = master_objstorage self.master_objstorage_args = master_objstorage_args # Keep the full configuration self.config = config def run(self): """ Run the archiver director. The archiver director will check all the contents of the archiver database and do the required backup jobs. """ if self.config['asynchronous']: run_fn = self.run_async_worker else: run_fn = self.run_sync_worker for batch in self.get_unarchived_content(): run_fn(batch) def run_async_worker(self, batch): """ Produce a worker that will be added to the task queue. """ task = app.tasks[task_name] task.delay(batch, archiver_args=self.db_conn_archiver, master_objstorage_args=self.master_objstorage_args, slave_objstorages=self.slave_objstorages, config=self.config) def run_sync_worker(self, batch): """ Run synchronously a worker on the given batch. """ task = app.tasks[task_name] task(batch, archiver_args=self.db_conn_archiver, master_objstorage_args=self.master_objstorage_args, slave_objstorages=self.slave_objstorages, config=self.config) def get_unarchived_content(self): """ Get contents that need to be archived. Yields: A batch of contents. Batches are dictionaries which associates a content id to the data about servers that contains it or not. {'id1': {'present': [('slave1', 'slave1_url')], 'missing': [('slave2', 'slave2_url'), ('slave3', 'slave3_url')] }, 'id2': {'present': [], 'missing': [ ('slave1', 'slave1_url'), ('slave2', 'slave2_url'), ('slave3', 'slave3_url') ]} } Where keys (idX) are sha1 of the content and (slaveX, slaveX_url) are ids and urls of the storage slaves. At least all the content that don't have enough copies on the backups servers are distributed into these batches. """ - # Get the data about each content referenced into the archiver. 
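# Note: with this patch each content_archive row carries a single `copies`
# JSON object keyed by archive id, and mtime becomes a plain Unix timestamp,
# as the updated db helpers and tests show. An illustrative sketch of such a
# value (archive names are made up, 'banco' being the one used in the test
# fixtures):
#
#     copies = {
#         'banco':  {'status': 'present', 'mtime': 1467000000},
#         'slave2': {'status': 'ongoing', 'mtime': 1467000042},
#     }
#
# The reworked implementation below classifies every registered archive as
# 'present' or 'missing' for each content and yields the results in batches
# of at most batch_max_size entries.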
- missing_copy = {} - for content_id in self.archiver_storage.content_archive_ls(): - db_content_id = '\\x' + hashutil.hash_to_hex(content_id[0]) - - # Fetch the datas about archival status of the content - backups = self.archiver_storage.content_archive_get( - content=db_content_id - ) - for _content_id, server_id, status, mtime in backups: - virtual_status = self.get_virtual_status(status, mtime) - server_data = (server_id, self.slave_objstorages[server_id]) - - missing_copy.setdefault( - db_content_id, - {'present': [], 'missing': []} - ).setdefault(virtual_status, []).append(server_data) - - # Check the content before archival. - try: - self.master_objstorage.check(content_id[0]) - except Exception as e: - # Exception can be Error or ObjNotFoundError. - logger.error(e) - # TODO Do something to restore the content? - - if len(missing_copy) >= self.config['batch_max_size']: - yield missing_copy - missing_copy = {} - - if len(missing_copy) > 0: - yield missing_copy + contents = {} + # Get the archives + archives = [(k, v) for k, v in self.archiver_storage.archive_ls()] + # Get all the contents referenced into the archiver tables + for content_id, copies in self.archiver_storage.content_archive_get(): + content_id = r'\x' + hashutil.hash_to_hex(content_id) + data = {'present': [], 'missing': []} + # For each archive server, check the current content status + for archive_id, archive_url in archives: + if archive_id not in copies: + data['missing'].append((archive_id, archive_url)) + else: + copy = copies[archive_id] + vstatus = self.get_virtual_status(copy['status'], + copy['mtime']) + data[vstatus].append((archive_id, archive_url)) + + contents[content_id] = data + + if len(contents) >= self.config['batch_max_size']: + yield contents + contents = {} + + if len(contents) > 0: + yield contents def get_virtual_status(self, status, mtime): """ Compute the virtual presence of a content. If the status is ongoing but the time is not elasped, the archiver consider it will be present in the futur, and so consider it as present. However, if the time is elasped, the copy may have failed, so consider the content as missing. Arguments: status (string): One of ('present', 'missing', 'ongoing'). The status of the content. mtime (datetime): Time at which the content have been updated for the last time. Returns: The virtual status of the studied content, which is 'present' or 'missing'. Raises: ValueError: if the status is not one 'present', 'missing' or 'ongoing' """ if status in ('present', 'missing'): return status # If the status is 'ongoing' but there is still time, another worker # may still be on the task. 
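# Since mtime is now stored as a Unix timestamp (int), the elapsed time is a
# plain integer subtraction against time.time() instead of datetime arithmetic.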
if status == 'ongoing': - mtime = mtime.replace(tzinfo=None) - elapsed = (datetime.now() - mtime).total_seconds() + elapsed = int(time.time()) - mtime if elapsed <= self.config['archival_max_age']: return 'present' else: return 'missing' else: raise ValueError("status must be either 'present', 'missing' " "or 'ongoing'") @click.command() @click.argument('config-path', required=1) @click.option('--dbconn', default=DEFAULT_CONFIG['dbconn'][1], help="Connection string for the archiver database") @click.option('--async/--sync', default=DEFAULT_CONFIG['asynchronous'][1], help="Indicates if the archiver should run asynchronously") def launch(config_path, dbconn, async): # The configuration have following priority : # command line > file config > default config cl_config = { 'dbconn': dbconn, 'asynchronous': async } conf = config.read(config_path, DEFAULT_CONFIG) conf.update(cl_config) # Create connection data and run the archiver. archiver = ArchiverDirector(conf['dbconn'], conf) - logger.info("Starting an archival at", datetime.now()) + logger.info("Starting an archival at", time.time()) archiver.run() if __name__ == '__main__': launch() diff --git a/swh/storage/archiver/storage.py b/swh/storage/archiver/storage.py index 028304cc..73586e0d 100644 --- a/swh/storage/archiver/storage.py +++ b/swh/storage/archiver/storage.py @@ -1,89 +1,82 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import psycopg2 +import time -from ..common import db_transaction_generator +from ..common import db_transaction_generator, db_transaction from ..db import Db from ..exc import StorageDBError class ArchiverStorage(): """SWH Archiver storage proxy, encompassing DB """ def __init__(self, db_conn): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection """ try: if isinstance(db_conn, psycopg2.extensions.connection): self.db = Db(db_conn) else: self.db = Db.connect(db_conn) except psycopg2.OperationalError as e: raise StorageDBError(e) @db_transaction_generator def archive_ls(self, cur=None): """ Get all the archives registered on the server. Yields: a tuple (server_id, server_url) for each archive server. """ yield from self.db.archive_ls(cur) @db_transaction_generator - def content_archive_ls(self, cur=None): - """ Get the archival status of the content - - Get an iterable over all the content that is referenced - in a backup server. - - Yields: - the sha1 of each content referenced at least one time - in the database of archiveal status. - - """ - yield from self.db.content_archive_ls(cur) - - @db_transaction_generator - def content_archive_get(self, content=None, archive=None, cur=None): + def content_archive_get(self, content=None, cur=None): """ Get the archival status of a content in a specific server. Retreive from the database the archival status of the given content in the given archive server. Args: content: the sha1 of the content. May be None for any id. archive: the database id of the server we're looking into may be None for any server. Yields: A tuple (content_id, server_id, archival status, mtime, tzinfo). 
""" - yield from self.db.content_archive_get(content, archive, cur) + yield from self.db.content_archive_get(content, cur) - @db_transaction_generator + @db_transaction def content_archive_update(self, content_id, archive_id, new_status=None, cur=None): - """ Update the status of a archive content and set it's mtime to now() + """ Update the status of an archive content and set its mtime to - Change the last modification time of an archived content and change - its status to the given one. + Change the mtime of an archived content for the given archive and set + it's mtime to the current time. Args: - content_id (string): The content id. - archive_id (string): The id of the concerned archive. - new_status (string): One of missing, ongoing or present, this - status will replace the previous one. If not given, the - function only changes the mtime of the content. + content_id (str): content sha1 + archive_id (str): name of the archive + new_status (str): one of 'missing', 'present' or 'ongoing'. + this status will replace the previous one. If not given, + the function only change the mtime of the content for the + given archive. """ - yield from self.db.content_archive_update(content_id, - archive_id, - new_status, - cur) + # FIXME check how to alter direclty the json object with postgres + # Get the data and alter it + copies = self.db.content_archive_get(content_id)['copies'] + if new_status is not None: + copies[archive_id]['status'] = new_status + copies[archive_id]['mtime'] = int(time.time()) + + # Then save the new data + self.db.content_archive_update(content_id, copies) diff --git a/swh/storage/archiver/worker.py b/swh/storage/archiver/worker.py index e1220f13..3f936151 100644 --- a/swh/storage/archiver/worker.py +++ b/swh/storage/archiver/worker.py @@ -1,251 +1,250 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random import logging - -from datetime import datetime +import time from swh.objstorage import PathSlicingObjStorage from swh.objstorage.api.client import RemoteObjStorage from .storage import ArchiverStorage from .copier import ArchiverCopier logger = logging.getLogger() class ArchiverWorker(): """ Do the required backups on a given batch of contents. Process the content of a content batch in order to do the needed backups on the slaves servers. Attributes: batch: The content this worker has to archive, which is a dictionary that associates a content's sha1 id to the list of servers where the content is present or missing (see ArchiverDirector::get_unarchived_content). master_objstorage_args: The connection argument to initialize the master storage with the db connection url & the object storage path. slave_objstorages: A map that associates server_id to the remote server config: Archiver_configuration. A dictionary that must contains the following keys. objstorage_path (string): the path of the objstorage of the master. batch_max_size (int): The number of content items that can be given to the same archiver worker. archival_max_age (int): Delay given to the worker to copy all the files in a given batch. retention_policy (int): Required number of copies for the content to be considered safe. asynchronous (boolean): Indicate whenever the archival should run in asynchronous mode or not. 
""" def __init__(self, batch, archiver_args, master_objstorage_args, slave_objstorages, config): """ Constructor of the ArchiverWorker class. Args: batch: A batch of content, which is a dictionary that associates a content's sha1 id to the list of servers where the content is present. archiver_args: The archiver's arguments to establish connection to db. master_objstorage_args: The master storage arguments. slave_objstorages: A map that associates server_id to the remote server. config: Archiver_configuration. A dictionary that must contains the following keys. objstorage_path (string): the path of the objstorage of the master. batch_max_size (int): The number of content items that can be given to the same archiver worker. archival_max_age (int): Delay given to the worker to copy all the files in a given batch. retention_policy (int): Required number of copies for the content to be considered safe. asynchronous (boolean): Indicate whenever the archival should run in asynchronous mode or not. """ self.batch = batch self.archiver_storage = ArchiverStorage(archiver_args) self.slave_objstorages = slave_objstorages self.config = config if config['objstorage_type'] == 'local_objstorage': master_objstorage = PathSlicingObjStorage(**master_objstorage_args) else: master_objstorage = RemoteObjStorage(**master_objstorage_args) self.master_objstorage = master_objstorage def _choose_backup_servers(self, allowed_storage, backup_number): """ Choose the slave servers for archival. Choose the given amount of servers among those which don't already contain a copy of the content. Args: allowed_storage: servers when the content is not already present. backup_number (int): The number of servers we have to choose in order to fullfill the objective. """ # In case there is not enough backup servers to get all the backups # we need, just do our best. # Such situation should not happen. backup_number = min(backup_number, len(allowed_storage)) # TODO Find a better (or a good) policy to choose the backup servers. # The random choice should be equivalently distributed between # servers for a great amount of data, but don't take care of servers # capacities. return random.sample(allowed_storage, backup_number) def _get_archival_status(self, content_id, server): """ Get the archival status of the required content. Attributes: content_id (string): Sha1 of the content. server: Tuple (archive_id, archive_url) of the archive server. Returns: A dictionary that contains all the required data : 'content_id', 'archive_id', 'status', and 'mtime' """ + archive = server[0] t, = list( - self.archiver_storage.content_archive_get(content_id, server[0]) + self.archiver_storage.content_archive_get(content_id) ) return { 'content_id': t[0], - 'archive_id': t[1], - 'status': t[2], - 'mtime': t[3] + 'archive_id': archive, + 'status': t[1][archive]['status'], + 'mtime': t[1][archive]['mtime'] } def _content_archive_update(self, content_id, archive_id, new_status=None): - """ Update the status of a archive content and set it's mtime to now() + """ Update the status of a archive content and set its mtime to now. Change the last modification time of an archived content and change its status to the given one. Args: content_id (string): The content id. archive_id (string): The id of the concerned archive. new_status (string): One of missing, ongoing or present, this status will replace the previous one. If not given, the function only changes the mtime of the content. 
""" self.archiver_storage.content_archive_update( content_id, archive_id, new_status ) def need_archival(self, content, destination): """ Indicates whenever a content need archivage. Filter function that returns True if a given content still require to be archived. Args: content (str): Sha1 of a content. destination: Tuple (archive id, archive url). """ archival_status = self._get_archival_status( content, destination ) status = archival_status['status'] mtime = archival_status['mtime'] # If the archive is already present, no need to backup. if status == 'present': return False # If the content is ongoing but still have time, there is # another worker working on this content. elif status == 'ongoing': - mtime = mtime.replace(tzinfo=None) - elapsed = (datetime.now() - mtime).total_seconds() + elapsed = int(time.time()) - mtime if elapsed <= self.config['archival_max_age']: return False return True def sort_content_by_archive(self): """ Create a map {archive_server -> list of content) Create a mapping that associate to a archive server all the contents that needs to be archived in it by the current worker. The map is in the form of : { (archive_1, archive_1_url): [content1, content2, content_3] (archive_2, archive_2_url): [content1, content3] } Returns: The created mapping. """ slaves_copy = {} for content_id in self.batch: # Choose some servers to upload the content among the missing ones. server_data = self.batch[content_id] nb_present = len(server_data['present']) nb_backup = self.config['retention_policy'] - nb_present backup_servers = self._choose_backup_servers( server_data['missing'], nb_backup ) # Fill the map destination -> content to upload for server in backup_servers: slaves_copy.setdefault(server, []).append(content_id) return slaves_copy def run(self): """ Do the task expected from the archiver worker. Process the content in the batch, ensure that the elements still need an archival, and spawn copiers to copy files in each destinations. """ # Get a map (archive -> [contents]) slaves_copy = self.sort_content_by_archive() # At this point, re-check the archival status in order to know if the # job have been done by another worker. for destination in slaves_copy: # list() is needed because filter's result will be consumed twice. slaves_copy[destination] = list(filter( lambda content_id: self.need_archival(content_id, destination), slaves_copy[destination] )) for content_id in slaves_copy[destination]: self._content_archive_update(content_id, destination[0], new_status='ongoing') # Spawn a copier for each destination for destination in slaves_copy: try: self.run_copier(destination, slaves_copy[destination]) except: logger.error('Unable to copy a batch to %s' % destination) def run_copier(self, destination, contents): """ Run a copier in order to archive the given contents Upload the given contents to the given archive. If the process fail, the whole content is considered uncopied and remains 'ongoing', waiting to be rescheduled as there is a delay. Attributes: destination: Tuple (archive_id, archive_url) of the destination. contents: List of contents to archive. """ ac = ArchiverCopier(destination, contents, self.master_objstorage) if ac.run(): # Once the archival complete, update the database. 
for content_id in contents: self._content_archive_update(content_id, destination[0], new_status='present') diff --git a/swh/storage/db.py b/swh/storage/db.py index 2af32e17..fe8bb0aa 100644 --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -1,738 +1,700 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import binascii import datetime import functools import json import psycopg2 import psycopg2.extras import tempfile from contextlib import contextmanager TMP_CONTENT_TABLE = 'tmp_content' psycopg2.extras.register_uuid() def stored_procedure(stored_proc): """decorator to execute remote stored procedure, specified as argument Generally, the body of the decorated function should be empty. If it is not, the stored procedure will be executed first; the function body then. """ def wrap(meth): @functools.wraps(meth) def _meth(self, *args, **kwargs): cur = kwargs.get('cur', None) self._cursor(cur).execute('SELECT %s()' % stored_proc) meth(self, *args, **kwargs) return _meth return wrap def jsonize(value): """Convert a value to a psycopg2 JSON object if necessary""" if isinstance(value, dict): return psycopg2.extras.Json(value) return value def entry_to_bytes(entry): """Convert an entry coming from the database to bytes""" if isinstance(entry, memoryview): return entry.tobytes() if isinstance(entry, list): return [entry_to_bytes(value) for value in entry] return entry def line_to_bytes(line): """Convert a line coming from the database to bytes""" if isinstance(line, dict): return {k: entry_to_bytes(v) for k, v in line.items()} return line.__class__(entry_to_bytes(entry) for entry in line) def cursor_to_bytes(cursor): """Yield all the data from a cursor as bytes""" yield from (line_to_bytes(line) for line in cursor) class Db: """Proxy to the SWH DB, with wrappers around stored procedures """ @classmethod def connect(cls, *args, **kwargs): """factory method to create a DB proxy Accepts all arguments of psycopg2.connect; only some specific possibilities are reported below. 
Args: connstring: libpq2 connection string """ conn = psycopg2.connect(*args, **kwargs) return cls(conn) def _cursor(self, cur_arg): """get a cursor: from cur_arg if given, or a fresh one otherwise meant to avoid boilerplate if/then/else in methods that proxy stored procedures """ if cur_arg is not None: return cur_arg # elif self.cur is not None: # return self.cur else: return self.conn.cursor() def __init__(self, conn): """create a DB proxy Args: conn: psycopg2 connection to the SWH DB """ self.conn = conn @contextmanager def transaction(self): """context manager to execute within a DB transaction Yields: a psycopg2 cursor """ with self.conn.cursor() as cur: try: yield cur self.conn.commit() except: if not self.conn.closed: self.conn.rollback() raise def mktemp(self, tblname, cur=None): self._cursor(cur).execute('SELECT swh_mktemp(%s)', (tblname,)) def mktemp_dir_entry(self, entry_type, cur=None): self._cursor(cur).execute('SELECT swh_mktemp_dir_entry(%s)', (('directory_entry_%s' % entry_type),)) @stored_procedure('swh_mktemp_revision') def mktemp_revision(self, cur=None): pass @stored_procedure('swh_mktemp_release') def mktemp_release(self, cur=None): pass @stored_procedure('swh_mktemp_occurrence_history') def mktemp_occurrence_history(self, cur=None): pass @stored_procedure('swh_mktemp_entity_lister') def mktemp_entity_lister(self, cur=None): pass @stored_procedure('swh_mktemp_entity_history') def mktemp_entity_history(self, cur=None): pass @stored_procedure('swh_mktemp_bytea') def mktemp_bytea(self, cur=None): pass def copy_to(self, items, tblname, columns, cur=None, item_cb=None): def escape(data): if data is None: return '' if isinstance(data, bytes): return '\\x%s' % binascii.hexlify(data).decode('ascii') elif isinstance(data, str): return '"%s"' % data.replace('"', '""') elif isinstance(data, datetime.datetime): # We escape twice to make sure the string generated by # isoformat gets escaped return escape(data.isoformat()) elif isinstance(data, dict): return escape(json.dumps(data)) elif isinstance(data, list): return escape("{%s}" % ','.join(escape(d) for d in data)) elif isinstance(data, psycopg2.extras.Range): # We escape twice here too, so that we make sure # everything gets passed to copy properly return escape( '%s%s,%s%s' % ( '[' if data.lower_inc else '(', '-infinity' if data.lower_inf else escape(data.lower), 'infinity' if data.upper_inf else escape(data.upper), ']' if data.upper_inc else ')', ) ) else: # We don't escape here to make sure we pass literals properly return str(data) with tempfile.TemporaryFile('w+') as f: for d in items: if item_cb is not None: item_cb(d) line = [escape(d.get(k)) for k in columns] f.write(','.join(line)) f.write('\n') f.seek(0) self._cursor(cur).copy_expert('COPY %s (%s) FROM STDIN CSV' % ( tblname, ', '.join(columns)), f) @stored_procedure('swh_content_add') def content_add_from_temp(self, cur=None): pass @stored_procedure('swh_directory_add') def directory_add_from_temp(self, cur=None): pass @stored_procedure('swh_skipped_content_add') def skipped_content_add_from_temp(self, cur=None): pass @stored_procedure('swh_revision_add') def revision_add_from_temp(self, cur=None): pass @stored_procedure('swh_release_add') def release_add_from_temp(self, cur=None): pass @stored_procedure('swh_occurrence_history_add') def occurrence_history_add_from_temp(self, cur=None): pass @stored_procedure('swh_entity_history_add') def entity_history_add_from_temp(self, cur=None): pass def store_tmp_bytea(self, ids, cur=None): """Store the given identifiers in a 
new tmp_bytea table""" cur = self._cursor(cur) self.mktemp_bytea(cur) self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea', ['id'], cur) def content_missing_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("""SELECT sha1, sha1_git, sha256 FROM swh_content_missing()""") yield from cursor_to_bytes(cur) def content_missing_per_sha1_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("""SELECT * FROM swh_content_missing_per_sha1()""") yield from cursor_to_bytes(cur) def skipped_content_missing_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("""SELECT sha1, sha1_git, sha256 FROM swh_skipped_content_missing()""") yield from cursor_to_bytes(cur) def occurrence_get(self, origin_id, cur=None): """Retrieve latest occurrence's information by origin_id. """ cur = self._cursor(cur) cur.execute("""SELECT origin, branch, target, target_type, (select max(date) from origin_visit where origin=%s) as date FROM occurrence WHERE origin=%s """, (origin_id, origin_id)) yield from cursor_to_bytes(cur) def content_find(self, sha1=None, sha1_git=None, sha256=None, cur=None): """Find the content optionally on a combination of the following checksums sha1, sha1_git or sha256. Args: sha1: sha1 content git_sha1: the sha1 computed `a la git` sha1 of the content sha256: sha256 content Returns: The triplet (sha1, sha1_git, sha256) if found or None. """ cur = self._cursor(cur) cur.execute("""SELECT sha1, sha1_git, sha256, length, ctime, status FROM swh_content_find(%s, %s, %s) LIMIT 1""", (sha1, sha1_git, sha256)) content = line_to_bytes(cur.fetchone()) if set(content) == {None}: return None else: return content def content_find_occurrence(self, sha1, cur=None): """Find one content's occurrence. Args: sha1: sha1 content cur: cursor to use Returns: One occurrence for that particular sha1 """ cur = self._cursor(cur) cur.execute("""SELECT origin_type, origin_url, branch, target, target_type, path FROM swh_content_find_occurrence(%s) LIMIT 1""", (sha1, )) return line_to_bytes(cur.fetchone()) def directory_get_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute('''SELECT id, file_entries, dir_entries, rev_entries FROM swh_directory_get()''') yield from cursor_to_bytes(cur) def directory_missing_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute('SELECT * FROM swh_directory_missing()') yield from cursor_to_bytes(cur) def directory_walk_one(self, directory, cur=None): cur = self._cursor(cur) cur.execute('SELECT * FROM swh_directory_walk_one(%s)', (directory,)) yield from cursor_to_bytes(cur) def directory_walk(self, directory, cur=None): cur = self._cursor(cur) cur.execute('SELECT * FROM swh_directory_walk(%s)', (directory,)) yield from cursor_to_bytes(cur) def revision_missing_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute('SELECT id FROM swh_revision_missing() as r(id)') yield from cursor_to_bytes(cur) revision_add_cols = [ 'id', 'date', 'date_offset', 'date_neg_utc_offset', 'committer_date', 'committer_date_offset', 'committer_date_neg_utc_offset', 'type', 'directory', 'message', 'author_fullname', 'author_name', 'author_email', 'committer_fullname', 'committer_name', 'committer_email', 'metadata', 'synthetic', ] revision_get_cols = revision_add_cols + [ 'author_id', 'committer_id', 'parents'] origin_visit_get_cols = [ 'origin', 'visit', 'date' ] def origin_visit_get(self, origin_id, cur=None): """Retrieve occurrence's history information by origin_id. 
Args: origin_id: The occurrence's origin Yields: The occurrence's history visits """ cur = self._cursor(cur) cur.execute( 'SELECT origin, visit, date FROM origin_visit where origin=%s', (origin_id, )) yield from cursor_to_bytes(cur) def revision_get_from_temp(self, cur=None): cur = self._cursor(cur) query = 'SELECT %s FROM swh_revision_get()' % ( ', '.join(self.revision_get_cols)) cur.execute(query) yield from cursor_to_bytes(cur) def revision_log(self, root_revisions, limit=None, cur=None): cur = self._cursor(cur) query = """SELECT %s FROM swh_revision_log(%%s, %%s) """ % ', '.join(self.revision_get_cols) cur.execute(query, (root_revisions, limit)) yield from cursor_to_bytes(cur) revision_shortlog_cols = ['id', 'parents'] def revision_shortlog(self, root_revisions, limit=None, cur=None): cur = self._cursor(cur) query = """SELECT %s FROM swh_revision_list(%%s, %%s) """ % ', '.join(self.revision_shortlog_cols) cur.execute(query, (root_revisions, limit)) yield from cursor_to_bytes(cur) def release_missing_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute('SELECT id FROM swh_release_missing() as r(id)') yield from cursor_to_bytes(cur) object_find_by_sha1_git_cols = ['sha1_git', 'type', 'id', 'object_id'] def object_find_by_sha1_git(self, ids, cur=None): cur = self._cursor(cur) self.store_tmp_bytea(ids, cur) query = 'select %s from swh_object_find_by_sha1_git()' % ( ', '.join(self.object_find_by_sha1_git_cols) ) cur.execute(query) yield from cursor_to_bytes(cur) def stat_counters(self, cur=None): cur = self._cursor(cur) cur.execute('SELECT * FROM swh_stat_counters()') yield from cur fetch_history_cols = ['origin', 'date', 'status', 'result', 'stdout', 'stderr', 'duration'] def create_fetch_history(self, fetch_history, cur=None): """Create a fetch_history entry with the data in fetch_history""" cur = self._cursor(cur) query = '''INSERT INTO fetch_history (%s) VALUES (%s) RETURNING id''' % ( ','.join(self.fetch_history_cols), ','.join(['%s'] * len(self.fetch_history_cols)) ) cur.execute(query, [fetch_history.get(col) for col in self.fetch_history_cols]) return cur.fetchone()[0] def get_fetch_history(self, fetch_history_id, cur=None): """Get a fetch_history entry with the given id""" cur = self._cursor(cur) query = '''SELECT %s FROM fetch_history WHERE id=%%s''' % ( ', '.join(self.fetch_history_cols), ) cur.execute(query, (fetch_history_id,)) data = cur.fetchone() if not data: return None ret = {'id': fetch_history_id} for i, col in enumerate(self.fetch_history_cols): ret[col] = data[i] return ret def update_fetch_history(self, fetch_history, cur=None): """Update the fetch_history entry from the data in fetch_history""" cur = self._cursor(cur) query = '''UPDATE fetch_history SET %s WHERE id=%%s''' % ( ','.join('%s=%%s' % col for col in self.fetch_history_cols) ) cur.execute(query, [jsonize(fetch_history.get(col)) for col in self.fetch_history_cols + ['id']]) base_entity_cols = ['uuid', 'parent', 'name', 'type', 'description', 'homepage', 'active', 'generated', 'lister_metadata', 'metadata'] entity_cols = base_entity_cols + ['last_seen', 'last_id'] entity_history_cols = base_entity_cols + ['id', 'validity'] def origin_add(self, type, url, cur=None): """Insert a new origin and return the new identifier.""" insert = """INSERT INTO origin (type, url) values (%s, %s) RETURNING id""" cur.execute(insert, (type, url)) return cur.fetchone()[0] def origin_get_with(self, type, url, cur=None): """Retrieve the origin id from its type and url if found.""" cur = self._cursor(cur) query = """SELECT 
id, type, url, lister, project FROM origin WHERE type=%s AND url=%s""" cur.execute(query, (type, url)) data = cur.fetchone() if data: return line_to_bytes(data) return None def origin_get(self, id, cur=None): """Retrieve the origin per its identifier. """ cur = self._cursor(cur) query = "SELECT id, type, url, lister, project FROM origin WHERE id=%s" cur.execute(query, (id,)) data = cur.fetchone() if data: return line_to_bytes(data) return None person_cols = ['fullname', 'name', 'email'] person_get_cols = person_cols + ['id'] def person_add(self, person, cur=None): """Add a person identified by its name and email. Returns: The new person's id """ cur = self._cursor(cur) query_new_person = '''\ INSERT INTO person(%s) VALUES (%s) RETURNING id''' % ( ', '.join(self.person_cols), ', '.join('%s' for i in range(len(self.person_cols))) ) cur.execute(query_new_person, [person[col] for col in self.person_cols]) return cur.fetchone()[0] def person_get(self, ids, cur=None): """Retrieve the persons identified by the list of ids. """ cur = self._cursor(cur) query = """SELECT %s FROM person WHERE id IN %%s""" % ', '.join(self.person_get_cols) cur.execute(query, (tuple(ids),)) yield from cursor_to_bytes(cur) release_add_cols = [ 'id', 'target', 'target_type', 'date', 'date_offset', 'date_neg_utc_offset', 'name', 'comment', 'synthetic', 'author_fullname', 'author_name', 'author_email', ] release_get_cols = release_add_cols + ['author_id'] def release_get_from_temp(self, cur=None): cur = self._cursor(cur) query = ''' SELECT %s FROM swh_release_get() ''' % ', '.join(self.release_get_cols) cur.execute(query) yield from cursor_to_bytes(cur) def release_get_by(self, origin_id, limit=None, cur=None): """Retrieve a release by occurrence criterion (only origin right now) Args: - origin_id: The origin to look for. """ cur = self._cursor(cur) query = """ SELECT %s FROM swh_release_get_by(%%s) LIMIT %%s """ % ', '.join(self.release_get_cols) cur.execute(query, (origin_id, limit)) yield from cursor_to_bytes(cur) def revision_get_by(self, origin_id, branch_name, datetime, limit=None, cur=None): """Retrieve a revision by occurrence criterion. Args: - origin_id: The origin to look for - branch_name: the branch name to look for - datetime: the lower bound of timerange to look for. - limit: limit number of results to return The upper bound being now. """ cur = self._cursor(cur) if branch_name and isinstance(branch_name, str): branch_name = branch_name.encode('utf-8') query = ''' SELECT %s FROM swh_revision_get_by(%%s, %%s, %%s) LIMIT %%s ''' % ', '.join(self.revision_get_cols) cur.execute(query, (origin_id, branch_name, datetime, limit)) yield from cursor_to_bytes(cur) def directory_entry_get_by_path(self, directory, paths, cur=None): """Retrieve a directory entry by path. """ cur = self._cursor(cur) cur.execute("""SELECT dir_id, type, target, name, perms, status, sha1, sha1_git, sha256 FROM swh_find_directory_entry_by_path(%s, %s)""", (directory, paths)) data = cur.fetchone() if set(data) == {None}: return None return line_to_bytes(data) def entity_get(self, uuid, cur=None): """Retrieve the entity and its parent hierarchy chain per uuid. """ cur = self._cursor(cur) cur.execute("""SELECT %s FROM swh_entity_get(%%s)""" % ( ', '.join(self.entity_cols)), (uuid, )) yield from cursor_to_bytes(cur) def entity_get_one(self, uuid, cur=None): """Retrieve a single entity given its uuid. 
""" cur = self._cursor(cur) cur.execute("""SELECT %s FROM entity WHERE uuid = %%s""" % ( ', '.join(self.entity_cols)), (uuid, )) data = cur.fetchone() if not data: return None return line_to_bytes(data) def archive_ls(self, cur=None): """ Get all the archives registered on the server. Yields: a tuple (server_id, server_url) for each archive server. """ cur = self._cursor(cur) cur.execute("""SELECT id, url FROM archive """) yield from cursor_to_bytes(cur) - def content_archive_ls(self, cur=None): - """ Get the archival status of the content - - Get an iterable over all the content that is referenced - in a backup server. - - Yields: - the sha1 of each content referenced at least one time - in the database of archiveal status. - """ - cur = self._cursor(cur) - cur.execute("""SELECT DISTINCT content_id - FROM content_archive""") - yield from cursor_to_bytes(cur) - - def content_archive_get(self, content=None, archive=None, cur=None): + def content_archive_get(self, content=None, cur=None): """ Get the archival status of a content in a specific server. Retreive from the database the archival status of the given content in the given archive server. Args: - content: the sha1 of the content. May be None for any id. - archive: the database id of the server we're looking into - may be None for any server. + content: the sha1 of the content. May be None for all contents. Yields: - A tuple (content_id, server_id, archival status, mtime, tzinfo). + A tuple (content_id, copies_json). """ - query = """SELECT content_id, archive_id, status, mtime + query = """SELECT content_id, copies FROM content_archive """ - conditions = [] - if content: - conditions.append("content_id='%s'" % content) - if archive: - conditions.append("archive_id='%s'" % archive) - - if conditions: - query = """%s - WHERE %s - """ % (query, ' and '.join(conditions)) + if content is not None: + query += "WHERE content_id='%s'" % content cur = self._cursor(cur) cur.execute(query) yield from cursor_to_bytes(cur) - def content_archive_update(self, content_id, archive_id, - new_status=None, cur=None): - """ Update the status of a archive content and set it's mtime to now() + def content_archive_update(self, content_id, copies, cur=None): + """ Update the status of an archive content and set it's mtime to now() Change the last modification time of an archived content and change its status to the given one. Args: - content_id (string): The content id. - archive_id (string): The id of the concerned archive. - new_status (string): One of missing, ongoing or present, this - status will replace the previous one. If not given, the - function only changes the mtime of the content. + content_id (str): The content id. 
+ copies (dict): dictionary that match the json expected in the table """ query = """UPDATE content_archive - SET %(fields)s - WHERE content_id='%(content_id)s' - and archive_id='%(archive_id)s' - """ - fields = [] - if new_status: - fields.append("status='%s'" % new_status) - fields.append("mtime=now()") - - d = {'fields': ', '.join(fields), - 'content_id': content_id, - 'archive_id': archive_id} + SET copies=%s + WHERE content_id='%s' + """ % (jsonize(copies), content_id) cur = self._cursor(cur) - cur.execute(query % d) + cur.execute(query) diff --git a/swh/storage/tests/test_archiver.py b/swh/storage/tests/test_archiver.py index acd8f35b..333f40bb 100644 --- a/swh/storage/tests/test_archiver.py +++ b/swh/storage/tests/test_archiver.py @@ -1,275 +1,280 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest import os +import time +import json from nose.tools import istest from nose.plugins.attrib import attr -from datetime import datetime, timedelta from swh.core import hashutil from swh.core.tests.db_testing import DbsTestFixture from server_testing import ServerTestFixture from swh.storage import Storage from swh.storage.archiver import ArchiverDirector, ArchiverWorker from swh.objstorage.exc import ObjNotFoundError from swh.objstorage.api.client import RemoteObjStorage from swh.objstorage.api.server import app TEST_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DATA_DIR = os.path.join(TEST_DIR, '../../../../swh-storage-testdata') @attr('db') class TestArchiver(DbsTestFixture, ServerTestFixture, unittest.TestCase): """ Test the objstorage archiver. """ TEST_DB_NAMES = [ 'softwareheritage-test', 'softwareheritage-archiver-test', ] TEST_DB_DUMPS = [ os.path.join(TEST_DATA_DIR, 'dumps/swh.dump'), os.path.join(TEST_DATA_DIR, 'dumps/swh-archiver.dump'), ] TEST_DB_DUMP_TYPES = [ 'pg_dump', 'pg_dump', ] def setUp(self): # Launch the backup server self.backup_objroot = tempfile.mkdtemp(prefix='remote') self.config = { 'storage_base': self.backup_objroot, 'storage_slicing': '0:2/2:4/4:6' } self.app = app super().setUp() # Retrieve connection (depends on the order in TEST_DB_NAMES) self.conn_storage = self.conns[0] # db connection to storage self.conn = self.conns[1] # archiver db's connection self.cursor = self.cursors[1] # a reader storage to check content has been archived self.remote_objstorage = RemoteObjStorage(self.url()) # Create the local storage. self.objroot = tempfile.mkdtemp(prefix='local') # a writer storage to store content before archiving self.storage = Storage(self.conn_storage, self.objroot) # Initializes and fill the tables. self.initialize_tables() # Create the archiver self.archiver = self.__create_director() self.storage_data = ('banco', 'http://localhost:%s/' % self.port) def tearDown(self): self.empty_tables() super().tearDown() def initialize_tables(self): """ Initializes the database with a sample of items. """ # Add an archive (update existing one for technical reason, # altering enum cannot run in a transaction...) 
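# self.url() points at the backup objstorage server launched in setUp(), so
# the pre-existing 'banco' archive row is redirected to it rather than adding
# a new enum value.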
self.cursor.execute("""UPDATE archive SET url='{}' WHERE id='banco' """.format(self.url())) self.conn.commit() def empty_tables(self): # Remove all content self.cursor.execute('DELETE FROM content_archive') self.conn.commit() - def __add_content(self, content_data, status='missing', date='now()'): - # Add the content + def __add_content(self, content_data, status='missing', date=None): + # Add the content to the storage content = hashutil.hashdata(content_data) content.update({'data': content_data}) self.storage.content_add([content]) # Then update database content_id = r'\x' + hashutil.hash_to_hex(content['sha1']) + copies = {'banco': { + 'status': status, + 'mtime': date or int(time.time()) # if date is None, use now() + }} self.cursor.execute("""INSERT INTO content_archive - VALUES('%s'::sha1, 'banco', '%s', %s) - """ % (content_id, status, date)) + VALUES('%s'::sha1, '%s') + """ % (content_id, json.dumps(copies))) return content['sha1'] def __get_missing(self): self.cursor.execute("""SELECT content_id FROM content_archive WHERE status='missing'""") return self.cursor.fetchall() def __create_director(self, batch_size=5000, archival_max_age=3600, retention_policy=1, asynchronous=False): config = { 'objstorage_type': 'local_objstorage', 'objstorage_path': self.objroot, 'objstorage_slicing': '0:2/2:4/4:6', 'batch_max_size': batch_size, 'archival_max_age': archival_max_age, 'retention_policy': retention_policy, 'asynchronous': asynchronous # Avoid depending on queue for tests. } director = ArchiverDirector(db_conn_archiver=self.conn, config=config) return director def __create_worker(self, batch={}, config={}): mobjstorage_args = self.archiver.master_objstorage_args if not config: config = self.archiver.config return ArchiverWorker(batch, archiver_args=self.conn, master_objstorage_args=mobjstorage_args, slave_objstorages=[self.storage_data], config=config) # Integration test - + @istest def archive_missing_content(self): """ Run archiver on a missing content should archive it. """ content_data = b'archive_missing_content' content_id = self.__add_content(content_data) # before, the content should not be there try: self.remote_objstorage.content_get(content_id) - except: + except ObjNotFoundError: pass + else: + self.fail('Content should not be present before archival') self.archiver.run() # now the content should be present on remote objstorage remote_data = self.remote_objstorage.content_get(content_id) - # After the run, the content should be archived after the archiver run. self.assertEquals(content_data, remote_data) @istest def archive_present_content(self): """ A content that is not 'missing' shouldn't be archived. """ id = self.__add_content(b'archive_present_content', status='present') # After the run, the content should NOT be in the archive.* self.archiver.run() with self.assertRaises(ObjNotFoundError): self.remote_objstorage.content_get(id) @istest def archive_already_enough(self): """ A content missing with enough copies shouldn't be archived. 
""" id = self.__add_content(b'archive_alread_enough') director = self.__create_director(retention_policy=0) director.run() with self.assertRaises(ObjNotFoundError): self.remote_objstorage.content_get(id) # Unit test for ArchiverDirector def vstatus(self, status, mtime): return self.archiver.get_virtual_status(status, mtime) @istest def vstatus_present(self): self.assertEquals( self.vstatus('present', None), 'present' ) @istest def vstatus_missing(self): self.assertEquals( self.vstatus('missing', None), 'missing' ) @istest def vstatus_ongoing_remaining(self): - current_time = datetime.now() self.assertEquals( - self.vstatus('ongoing', current_time), + self.vstatus('ongoing', int(time.time())), 'present' ) @istest def vstatus_ongoing_elapsed(self): - past_time = datetime.now() - timedelta( - seconds=self.archiver.config['archival_max_age'] + 1 + past_time = ( + int(time.time()) - self.archiver.config['archival_max_age'] - 1 ) self.assertEquals( self.vstatus('ongoing', past_time), 'missing' ) # Unit tests for archive worker @istest def need_archival_missing(self): """ A content should still need archival when it is missing. """ id = self.__add_content(b'need_archival_missing', status='missing') id = r'\x' + hashutil.hash_to_hex(id) worker = self.__create_worker() self.assertEqual(worker.need_archival(id, self.storage_data), True) @istest def need_archival_present(self): """ A content should still need archival when it is missing """ id = self.__add_content(b'need_archival_missing', status='present') id = r'\x' + hashutil.hash_to_hex(id) worker = self.__create_worker() self.assertEqual(worker.need_archival(id, self.storage_data), False) @istest def need_archival_ongoing_remaining(self): """ An ongoing archival with remaining time shouldnt need archival. """ id = self.__add_content(b'need_archival_ongoing_remaining', - status='ongoing', date="'%s'" % datetime.now()) + status='ongoing') id = r'\x' + hashutil.hash_to_hex(id) worker = self.__create_worker() self.assertEqual(worker.need_archival(id, self.storage_data), False) @istest def need_archival_ongoing_elasped(self): """ An ongoing archival with elapsed time should be scheduled again. """ id = self.__add_content( b'archive_ongoing_elapsed', status='ongoing', - date="'%s'" % (datetime.now() - timedelta( - seconds=self.archiver.config['archival_max_age'] + 1 - )) + date=( + int(time.time()) - self.archiver.config['archival_max_age'] - 1 + ) ) id = r'\x' + hashutil.hash_to_hex(id) worker = self.__create_worker() self.assertEqual(worker.need_archival(id, self.storage_data), True) @istest def content_sorting_by_archiver(self): """ Check that the content is correctly sorted. """ batch = { 'id1': { 'present': [('slave1', 'slave1_url')], 'missing': [] }, 'id2': { 'present': [], 'missing': [('slave1', 'slave1_url')] } } worker = self.__create_worker(batch=batch) mapping = worker.sort_content_by_archive() self.assertNotIn('id1', mapping[('slave1', 'slave1_url')]) self.assertIn('id2', mapping[('slave1', 'slave1_url')])