diff --git a/sql/Makefile b/sql/Makefile
new file mode 100644
index 0000000..021dcb3
--- /dev/null
+++ b/sql/Makefile
@@ -0,0 +1,41 @@
+# Depends: postgresql-client, postgresql-autodoc
+
+DBNAME = softwareheritage-archiver-dev
+DOCDIR = autodoc
+
+SQL_INIT = ../swh-init.sql
+SQL_SCHEMA = swh-archiver-schema.sql
+SQL_DATA = swh-archiver-data.sql
+SQLS = $(SQL_INIT) $(SQL_SCHEMA) $(SQL_DATA)
+
+PSQL_BIN = psql
+PSQL_FLAGS = --single-transaction --echo-all -X
+PSQL = $(PSQL_BIN) $(PSQL_FLAGS)
+
+
+all:
+
+createdb: createdb-stamp
+createdb-stamp: $(SQL_INIT)
+	createdb $(DBNAME)
+	touch $@
+
+filldb: filldb-stamp
+filldb-stamp: createdb-stamp
+	cat $(SQLS) | $(PSQL) $(DBNAME)
+	touch $@
+
+dropdb:
+	-dropdb $(DBNAME)
+
+dumpdb: swh-archiver.dump
+swh-archiver.dump: filldb-stamp
+	pg_dump -Fc $(DBNAME) > $@
+
+clean:
+	rm -rf *-stamp $(DOCDIR)/
+
+distclean: clean dropdb
+	rm -f swh-archiver.dump
+
+.PHONY: all createdb filldb dropdb dumpdb clean distclean
diff --git a/sql/swh-archiver-data.sql b/sql/swh-archiver-data.sql
new file mode 100644
index 0000000..a156bd3
--- /dev/null
+++ b/sql/swh-archiver-data.sql
@@ -0,0 +1,2 @@
+INSERT INTO archive(id, url)
+VALUES('Banco', 'http://banco.softwareheritage.org:5003/');
diff --git a/sql/swh-archiver-schema.sql b/sql/swh-archiver-schema.sql
new file mode 100644
index 0000000..87df5dc
--- /dev/null
+++ b/sql/swh-archiver-schema.sql
@@ -0,0 +1,50 @@
+-- In order to archive the content of the object storage, add
+-- some tables to keep track of what has already been archived.
+
+create table dbversion
+(
+  version int primary key,
+  release timestamptz,
+  description text
+);
+
+comment on table dbversion is 'Schema update tracking';
+
+INSERT INTO dbversion(version, release, description)
+VALUES(1, now(), 'Work In Progress');
+
+CREATE DOMAIN archive_id AS TEXT;
+
+CREATE TABLE archive (
+  id archive_id PRIMARY KEY,
+  url TEXT
+);
+
+comment on table archive is 'Possible archives';
+comment on column archive.id is 'Short identifier for the archive';
+comment on column archive.url is 'URL identifying the archive API';
+
+CREATE TYPE archive_status AS ENUM (
+  'missing',
+  'ongoing',
+  'present'
+);
+
+comment on type archive_status is 'Status of a given archive';
+
+-- a SHA1 checksum (not necessarily originating from Git)
+CREATE DOMAIN sha1 AS bytea CHECK (LENGTH(VALUE) = 20);
+
+CREATE TABLE content_archive (
+  content_id sha1,
+  archive_id archive_id REFERENCES archive(id),
+  status archive_status,
+  mtime timestamptz,
+  PRIMARY KEY (content_id, archive_id)
+);
+
+comment on table content_archive is 'Referencing the status and whereabouts of a content';
+comment on column content_archive.content_id is 'content identifier';
+comment on column content_archive.archive_id is 'content whereabouts';
+comment on column content_archive.status is 'content status';
+comment on column content_archive.mtime is 'last time the content was stored';
diff --git a/swh/archiver/copier.py b/swh/archiver/copier.py
index 1484dbe..3669c76 100644
--- a/swh/archiver/copier.py
+++ b/swh/archiver/copier.py
@@ -1,60 +1,60 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from swh.core import hashutil
 from swh.objstorage.api.client import RemoteObjStorage
 
 
 class ArchiverCopier():
     """ This archiver copy some files into a remote objstorage
     in order to get a backup.
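 
     Contents to archive are fetched from the master storage with
     content_get() and pushed to the remote objstorage with content_add().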
Attributes: content_ids: A list of sha1's that represents the content this copier has to archive. server (RemoteArchive): The remote object storage that is used to backup content. master_storage (Storage): The master storage that contains the data the copier needs to archive. """ def __init__(self, destination, content, master_storage): """ Create a Copier for the archiver Args: destination: A tuple (archive_name, archive_url) that represents a - remote object storage as in the 'archives' table. + remote object storage as in the 'archive' table. content: A list of sha1 that represents the content this copier have to archive. master_storage (Storage): The master storage of the system that contains the data to archive. """ _name, self.url = destination self.content_ids = content self.server = RemoteObjStorage(self.url) self.master_storage = master_storage def run(self): """ Do the copy on the backup storage. Run the archiver copier in order to copy the required content into the current destination. The content which corresponds to the sha1 in self.content_ids will be fetched from the master_storage and then copied into the backup object storage. Returns: A boolean that indicates if the whole content have been copied. """ self.content_ids = list(map(lambda x: hashutil.hex_to_hash(x[2:]), self.content_ids)) contents = self.master_storage.content_get(self.content_ids) try: for content in contents: content_data = content['data'] self.server.content_add(content_data) except: return False return True diff --git a/swh/archiver/director.py b/swh/archiver/director.py index 15f26ff..8740f27 100644 --- a/swh/archiver/director.py +++ b/swh/archiver/director.py @@ -1,243 +1,258 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import swh import logging import click from datetime import datetime from swh.core import hashutil, config from swh.scheduler.celery_backend.config import app + from . import tasks # NOQA +from .storage import ArchiverStorage DEFAULT_CONFIG = { 'objstorage_path': ('str', '/tmp/swh-storage/objects'), 'batch_max_size': ('int', 50), 'archival_max_age': ('int', 3600), 'retention_policy': ('int', 2), 'asynchronous': ('bool', True), - 'dbconn': ('str', 'dbname=softwareheritage-dev user=guest') + 'dbconn': ('str', 'dbname=softwareheritage-archiver-dev user=guest'), + 'dbconn_storage': ('str', 'dbname=softwareheritage-dev user=guest') } task_name = 'swh.storage.archiver.tasks.SWHArchiverTask' logger = logging.getLogger() class ArchiverDirector(): """Process the files in order to know which one is needed as backup. The archiver director processes the files in the local storage in order to know which one needs archival and it delegates this task to archiver workers. - Attributes: master_storage: the local storage of the master server. slave_storages: Iterable of remote obj storages to the slaves servers used for backup. config: Archiver_configuration. A dictionary that must contain the following keys. - objstorage_path (string): the path of the objstorage of the - master. + objstorage_path (string): master's objstorage path + batch_max_size (int): The number of content items that can be given to the same archiver worker. archival_max_age (int): Delay given to the worker to copy all the files in a given batch. retention_policy (int): Required number of copies for the content to be considered safe. 
asynchronous (boolean): Indicate whenever the archival should run in asynchronous mode or not. """ - def __init__(self, db_conn, config): + def __init__(self, db_conn_archiver, db_conn_storage, config): """ Constructor of the archiver director. Args: - db_conn: db_conn: Either a libpq connection string, - or a psycopg2 connection. - config: Archiver_configuration. A dictionary that must contains + db_conn_archiver: Either a libpq connection string, + or a psycopg2 connection for the archiver db connection. + db_conn_storage: Either a libpq connection string, + or a psycopg2 connection for the db storage connection. + config: Archiver_configuration. A dictionary that must contain the following keys. - objstorage_path (string): the path of the objstorage of the - master. + objstorage_path (string): master's objstorage path batch_max_size (int): The number of content items that can be given to the same archiver worker. archival_max_age (int): Delay given to the worker to copy all the files in a given batch. retention_policy (int): Required number of copies for the content to be considered safe. asynchronous (boolean): Indicate whenever the archival should run in asynchronous mode or not. """ # Get the local storage of the master and remote ones for the slaves. - self.master_storage_args = [db_conn, config['objstorage_path']] + self.db_conn_archiver = db_conn_archiver + self.archiver_storage = ArchiverStorage(db_conn_archiver) + + self.master_storage_args = [db_conn_storage, config['objstorage_path']] master_storage = swh.storage.get_storage('local_storage', self.master_storage_args) slaves = { id: url for id, url - in master_storage.db.archive_ls() + in self.archiver_storage.archive_ls() } # TODO Database should be initialized somehow before going in # production. For now, assumes that the database contains - # datas for all the current content. + # data for all the current content. self.master_storage = master_storage self.slave_storages = slaves self.config = config def run(self): """ Run the archiver director. The archiver director will check all the contents of the archiver database and do the required backup jobs. """ if self.config['asynchronous']: run_fn = self.run_async_worker else: run_fn = self.run_sync_worker for batch in self.get_unarchived_content(): run_fn(batch) def run_async_worker(self, batch): """ Produce a worker that will be added to the task queue. """ task = app.tasks[task_name] - task.delay(batch, self.master_storage_args, - self.slave_storages, self.config) + task.delay(batch, + archiver_args=self.db_conn_archiver, + master_storage_args=self.master_storage_args, + slave_storages=self.slave_storages, + config=self.config) def run_sync_worker(self, batch): """ Run synchronously a worker on the given batch. """ task = app.tasks[task_name] - task(batch, self.master_storage_args, - self.slave_storages, self.config) + task(batch, + archiver_args=self.db_conn_archiver, + master_storage_args=self.master_storage_args, + slave_storages=self.slave_storages, + config=self.config) def get_unarchived_content(self): - """ get all the contents that needs to be archived. + """ Get contents that need to be archived. Yields: A batch of contents. Batches are dictionaries which associates a content id to the data about servers that contains it or not. 
{'id1': {'present': [('slave1', 'slave1_url')], 'missing': [('slave2', 'slave2_url'), ('slave3', 'slave3_url')] }, 'id2': {'present': [], 'missing': [ ('slave1', 'slave1_url'), ('slave2', 'slave2_url'), ('slave3', 'slave3_url') ]} } Where keys (idX) are sha1 of the content and (slaveX, slaveX_url) are ids and urls of the storage slaves. At least all the content that don't have enough copies on the backups servers are distributed into these batches. """ # Get the data about each content referenced into the archiver. missing_copy = {} - for content_id in self.master_storage.db.content_archive_ls(): + for content_id in self.archiver_storage.content_archive_ls(): db_content_id = '\\x' + hashutil.hash_to_hex(content_id[0]) # Fetch the datas about archival status of the content - backups = self.master_storage.db.content_archive_get( + backups = self.archiver_storage.content_archive_get( content=db_content_id ) for _content_id, server_id, status, mtime in backups: virtual_status = self.get_virtual_status(status, mtime) server_data = (server_id, self.slave_storages[server_id]) missing_copy.setdefault( db_content_id, {'present': [], 'missing': []} ).setdefault(virtual_status, []).append(server_data) # Check the content before archival. try: self.master_storage.objstorage.check(content_id[0]) except Exception as e: # Exception can be Error or ObjNotFoundError. logger.error(e) # TODO Do something to restore the content? if len(missing_copy) >= self.config['batch_max_size']: yield missing_copy missing_copy = {} if len(missing_copy) > 0: yield missing_copy def get_virtual_status(self, status, mtime): """ Compute the virtual presence of a content. If the status is ongoing but the time is not elasped, the archiver consider it will be present in the futur, and so consider it as present. However, if the time is elasped, the copy may have failed, so consider the content as missing. Arguments: status (string): One of ('present', 'missing', 'ongoing'). The status of the content. mtime (datetime): Time at which the content have been updated for the last time. Returns: The virtual status of the studied content, which is 'present' or 'missing'. Raises: ValueError: if the status is not one 'present', 'missing' or 'ongoing' """ if status in ('present', 'missing'): return status # If the status is 'ongoing' but there is still time, another worker # may still be on the task. 
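        # mtime is stored as timestamptz and comes back timezone-aware; strip
        # the tzinfo so it can be compared with the naive datetime.now() below.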
if status == 'ongoing': mtime = mtime.replace(tzinfo=None) elapsed = (datetime.now() - mtime).total_seconds() if elapsed <= self.config['archival_max_age']: return 'present' else: return 'missing' else: raise ValueError("status must be either 'present', 'missing' " "or 'ongoing'") @click.command() @click.argument('config-path', required=1) @click.option('--dbconn', default=DEFAULT_CONFIG['dbconn'][1], - help="Connection string for the database") + help="Connection string for the archiver database") +@click.option('--dbconn-storage', default=DEFAULT_CONFIG['dbconn_storage'][1], + help="Connection string for the storage database") @click.option('--async/--sync', default=DEFAULT_CONFIG['asynchronous'][1], help="Indicates if the archiver should run asynchronously") -def launch(config_path, dbconn, async): +def launch(config_path, dbconn, dbconn_storage, async): # The configuration have following priority : # command line > file config > default config cl_config = { 'dbconn': dbconn, + 'dbconn_storage': dbconn_storage, 'asynchronous': async } conf = config.read(config_path, DEFAULT_CONFIG) conf.update(cl_config) # Create connection data and run the archiver. - archiver = ArchiverDirector(conf['dbconn'], conf) + archiver = ArchiverDirector(conf['dbconn'], conf['dbconn_storage'], conf) logger.info("Starting an archival at", datetime.now()) archiver.run() if __name__ == '__main__': launch() diff --git a/swh/archiver/storage.py b/swh/archiver/storage.py new file mode 100644 index 0000000..028304c --- /dev/null +++ b/swh/archiver/storage.py @@ -0,0 +1,89 @@ +# Copyright (C) 2016 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import psycopg2 + +from ..common import db_transaction_generator +from ..db import Db +from ..exc import StorageDBError + + +class ArchiverStorage(): + """SWH Archiver storage proxy, encompassing DB + + """ + def __init__(self, db_conn): + """ + Args: + db_conn: either a libpq connection string, or a psycopg2 connection + + """ + try: + if isinstance(db_conn, psycopg2.extensions.connection): + self.db = Db(db_conn) + else: + self.db = Db.connect(db_conn) + except psycopg2.OperationalError as e: + raise StorageDBError(e) + + @db_transaction_generator + def archive_ls(self, cur=None): + """ Get all the archives registered on the server. + + Yields: + a tuple (server_id, server_url) for each archive server. + """ + yield from self.db.archive_ls(cur) + + @db_transaction_generator + def content_archive_ls(self, cur=None): + """ Get the archival status of the content + + Get an iterable over all the content that is referenced + in a backup server. + + Yields: + the sha1 of each content referenced at least one time + in the database of archiveal status. + + """ + yield from self.db.content_archive_ls(cur) + + @db_transaction_generator + def content_archive_get(self, content=None, archive=None, cur=None): + """ Get the archival status of a content in a specific server. + + Retreive from the database the archival status of the given content + in the given archive server. + + Args: + content: the sha1 of the content. May be None for any id. + archive: the database id of the server we're looking into + may be None for any server. + + Yields: + A tuple (content_id, server_id, archival status, mtime, tzinfo). 
+ """ + yield from self.db.content_archive_get(content, archive, cur) + + @db_transaction_generator + def content_archive_update(self, content_id, archive_id, + new_status=None, cur=None): + """ Update the status of a archive content and set it's mtime to now() + + Change the last modification time of an archived content and change + its status to the given one. + + Args: + content_id (string): The content id. + archive_id (string): The id of the concerned archive. + new_status (string): One of missing, ongoing or present, this + status will replace the previous one. If not given, the + function only changes the mtime of the content. + """ + yield from self.db.content_archive_update(content_id, + archive_id, + new_status, + cur) diff --git a/swh/archiver/tasks.py b/swh/archiver/tasks.py index 0b7ce61..439aae0 100644 --- a/swh/archiver/tasks.py +++ b/swh/archiver/tasks.py @@ -1,20 +1,20 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.scheduler.task import Task from .worker import ArchiverWorker class SWHArchiverTask(Task): """ Main task that archive a batch of content. """ task_queue = 'swh_storage_archive_worker' - def run(self, batch, master_storage_args, - slave_storages, config): - aw = ArchiverWorker(batch, master_storage_args, + def run(self, batch, archiver_args, master_storage_args, slave_storages, + config): + aw = ArchiverWorker(batch, archiver_args, master_storage_args, slave_storages, config) if aw.run(): self.log("Successful backup for a batch of size %s" % len(batch)) diff --git a/swh/archiver/tests/manual_test_archiver.py b/swh/archiver/tests/manual_test_archiver.py index 26d4f4c..8ab77bb 100644 --- a/swh/archiver/tests/manual_test_archiver.py +++ b/swh/archiver/tests/manual_test_archiver.py @@ -1,95 +1,96 @@ import string import random from swh.core import hashutil from swh.storage import Storage from swh.storage.db import cursor_to_bytes from swh.storage.archiver import ArchiverDirector def rs(size=6, chars=string.ascii_uppercase + string.ascii_lowercase): return ''.join(random.choice(chars) for _ in range(size)) def mc(data): data = bytes(data, 'utf8') content = hashutil.hashdata(data) content.update({'data': data}) return content def initialize_content_archive(db, sample_size, names=['Local']): """ Initialize the content_archive table with a sample. From the content table, get a sample of id, and fill the content_archive table with those id in order to create a test sample for the archiver. Args: db: The database of the storage. sample_size (int): The size of the sample to create. names: A list of archive names. Those archives must already exists. Archival status of the archives content will be erased on db. Returns: Tha amount of entry created. 
""" with db.transaction() as cur: cur.execute('DELETE FROM content_archive') with db.transaction() as cur: cur.execute('SELECT sha1 from content limit %d' % sample_size) ids = list(cursor_to_bytes(cur)) for id, in ids: tid = r'\x' + hashutil.hash_to_hex(id) with db.transaction() as cur: for name in names: s = """INSERT INTO content_archive VALUES('%s'::sha1, '%s', 'missing', now()) """ % (tid, name) cur.execute(s) print('Initialized database with', sample_size * len(names), 'items') return sample_size * len(names) def clean(): # Clean all with loc.db.transaction() as cur: cur.execute('delete from content_archive') cur.execute('delete from content') import os os.system("rm -r /tmp/swh/storage-dev/2/*") CONTENT_SIZE = 10 if __name__ == '__main__': random.seed(0) - # Local database - dbname = 'softwareheritage-dev' - user = 'qcampos' - cstring = 'dbname=%s user=%s' % (dbname, user) + + # Local databases + cstring_archiver = 'service=swh-archiver-dev' + cstring_storage = 'service=swh-dev' + # Archiver config config = { 'objstorage_path': '/tmp/swh/storage-dev/2', 'archival_max_age': 3600, 'batch_max_size': 10, 'retention_policy': 1, 'asynchronous': False } # Grand-palais's storage - loc = Storage(cstring, config['objstorage_path']) + loc = Storage(cstring_storage, config['objstorage_path']) # Add the content l = [mc(rs(100)) for _ in range(CONTENT_SIZE)] loc.content_add(l) initialize_content_archive(loc.db, CONTENT_SIZE, ['petit-palais']) # Launch the archiver - archiver = ArchiverDirector(cstring, config) + archiver = ArchiverDirector(cstring_archiver, cstring_storage, config) archiver.run() diff --git a/swh/archiver/tests/test_archiver.py b/swh/archiver/tests/test_archiver.py index c26ef86..9c44515 100644 --- a/swh/archiver/tests/test_archiver.py +++ b/swh/archiver/tests/test_archiver.py @@ -1,245 +1,277 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest import os from nose.tools import istest from nose.plugins.attrib import attr from datetime import datetime, timedelta from swh.core import hashutil -from swh.core.tests.db_testing import DbTestFixture +from swh.core.tests.db_testing import DbsTestFixture from server_testing import ServerTestFixture from swh.storage import Storage from swh.storage.archiver import ArchiverDirector, ArchiverWorker from swh.objstorage.exc import ObjNotFoundError from swh.objstorage.api.client import RemoteObjStorage from swh.objstorage.api.server import app TEST_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DATA_DIR = os.path.join(TEST_DIR, '../../../../swh-storage-testdata') @attr('db') -class TestArchiver(DbTestFixture, ServerTestFixture, +class TestArchiver(DbsTestFixture, ServerTestFixture, unittest.TestCase): """ Test the objstorage archiver. 
""" - TEST_DB_DUMP = os.path.join(TEST_DATA_DIR, 'dumps/swh.dump') + TEST_DB_NAMES = [ + 'softwareheritage-test', + 'softwareheritage-archiver-test', + ] + TEST_DB_DUMPS = [ + os.path.join(TEST_DATA_DIR, 'dumps/swh.dump'), + os.path.join(TEST_DATA_DIR, 'dumps/swh-archiver.dump'), + ] + TEST_DB_DUMP_TYPES = [ + 'pg_dump', + 'pg_dump', + ] def setUp(self): # Launch the backup server self.backup_objroot = tempfile.mkdtemp(prefix='remote') - self.config = {'storage_base': self.backup_objroot, - 'storage_slicing': '0:2/2:4/4:6'} + self.config = { + 'storage_base': self.backup_objroot, + 'storage_slicing': '0:2/2:4/4:6' + } self.app = app super().setUp() - # Launch a client to check objects presence + # Retrieve connection (depends on the order in TEST_DB_NAMES) + self.conn_storage = self.conns[0] # db connection to storage + self.conn = self.conns[1] # archiver db's connection + self.cursor = self.cursors[1] + # a reader storage to check content has been archived self.remote_objstorage = RemoteObjStorage(self.url()) # Create the local storage. self.objroot = tempfile.mkdtemp(prefix='local') - self.storage = Storage(self.conn, self.objroot) + # a writer storage to store content before archiving + self.storage = Storage(self.conn_storage, self.objroot) # Initializes and fill the tables. self.initialize_tables() # Create the archiver self.archiver = self.__create_director() self.storage_data = ('Local', 'http://localhost:%s/' % self.port) def tearDown(self): self.empty_tables() super().tearDown() def initialize_tables(self): """ Initializes the database with a sample of items. """ # Add an archive - self.cursor.execute("""INSERT INTO archives(id, url) - VALUES('Local', 'http://localhost:{}/') - """.format(self.port)) + self.cursor.execute("""INSERT INTO archive(id, url) + VALUES('Local', '{}') + """.format(self.url())) self.conn.commit() def empty_tables(self): # Remove all content self.cursor.execute('DELETE FROM content_archive') - self.cursor.execute('DELETE FROM archives') + self.cursor.execute('DELETE FROM archive where id=\'Local\'') self.conn.commit() def __add_content(self, content_data, status='missing', date='now()'): # Add the content content = hashutil.hashdata(content_data) content.update({'data': content_data}) self.storage.content_add([content]) # Then update database content_id = r'\x' + hashutil.hash_to_hex(content['sha1']) self.cursor.execute("""INSERT INTO content_archive VALUES('%s'::sha1, 'Local', '%s', %s) """ % (content_id, status, date)) return content['sha1'] def __get_missing(self): self.cursor.execute("""SELECT content_id FROM content_archive WHERE status='missing'""") return self.cursor.fetchall() def __create_director(self, batch_size=5000, archival_max_age=3600, retention_policy=1, asynchronous=False): config = { 'objstorage_path': self.objroot, 'batch_max_size': batch_size, 'archival_max_age': archival_max_age, 'retention_policy': retention_policy, 'asynchronous': asynchronous # Avoid depending on queue for tests. 
} - director = ArchiverDirector(self.conn, config) + director = ArchiverDirector(db_conn_archiver=self.conn, + db_conn_storage=self.conn_storage, + config=config) return director def __create_worker(self, batch={}, config={}): - mstorage_args = [self.archiver.master_storage.db.conn, - self.objroot] - slaves = [self.storage_data] + mstorage_args = [ + self.archiver.master_storage.db.conn, # master storage db + # connection + self.objroot # object storage path + ] if not config: config = self.archiver.config - return ArchiverWorker(batch, mstorage_args, slaves, config) + return ArchiverWorker(batch, + archiver_args=self.conn, + master_storage_args=mstorage_args, + slave_storages=[self.storage_data], + config=config) # Integration test @istest def archive_missing_content(self): """ Run archiver on a missing content should archive it. """ content_data = b'archive_missing_content' - id = self.__add_content(content_data) - # After the run, the content should be in the archive. + content_id = self.__add_content(content_data) + # before, the content should not be there + try: + self.remote_objstorage.content_get(content_id) + except: + pass self.archiver.run() - remote_data = self.remote_objstorage.content_get(id) + # now the content should be present on remote objstorage + remote_data = self.remote_objstorage.content_get(content_id) + # After the run, the content should be archived after the archiver run. self.assertEquals(content_data, remote_data) @istest def archive_present_content(self): """ A content that is not 'missing' shouldn't be archived. """ id = self.__add_content(b'archive_present_content', status='present') # After the run, the content should NOT be in the archive.* self.archiver.run() with self.assertRaises(ObjNotFoundError): self.remote_objstorage.content_get(id) @istest def archive_already_enough(self): """ A content missing with enough copies shouldn't be archived. """ id = self.__add_content(b'archive_alread_enough') director = self.__create_director(retention_policy=0) director.run() with self.assertRaises(ObjNotFoundError): self.remote_objstorage.content_get(id) # Unit test for ArchiverDirector def vstatus(self, status, mtime): return self.archiver.get_virtual_status(status, mtime) @istest def vstatus_present(self): self.assertEquals( self.vstatus('present', None), 'present' ) @istest def vstatus_missing(self): self.assertEquals( self.vstatus('missing', None), 'missing' ) @istest def vstatus_ongoing_remaining(self): current_time = datetime.now() self.assertEquals( self.vstatus('ongoing', current_time), 'present' ) @istest def vstatus_ongoing_elapsed(self): past_time = datetime.now() - timedelta( seconds=self.archiver.config['archival_max_age'] + 1 ) self.assertEquals( self.vstatus('ongoing', past_time), 'missing' ) # Unit tests for archive worker @istest def need_archival_missing(self): """ A content should still need archival when it is missing. """ id = self.__add_content(b'need_archival_missing', status='missing') id = r'\x' + hashutil.hash_to_hex(id) worker = self.__create_worker() self.assertEqual(worker.need_archival(id, self.storage_data), True) @istest def need_archival_present(self): """ A content should still need archival when it is missing """ id = self.__add_content(b'need_archival_missing', status='present') id = r'\x' + hashutil.hash_to_hex(id) worker = self.__create_worker() self.assertEqual(worker.need_archival(id, self.storage_data), False) @istest def need_archival_ongoing_remaining(self): """ An ongoing archival with remaining time shouldnt need archival. 
""" id = self.__add_content(b'need_archival_ongoing_remaining', status='ongoing', date="'%s'" % datetime.now()) id = r'\x' + hashutil.hash_to_hex(id) worker = self.__create_worker() self.assertEqual(worker.need_archival(id, self.storage_data), False) @istest def need_archival_ongoing_elasped(self): """ An ongoing archival with elapsed time should be scheduled again. """ id = self.__add_content( b'archive_ongoing_elapsed', status='ongoing', date="'%s'" % (datetime.now() - timedelta( seconds=self.archiver.config['archival_max_age'] + 1 )) ) id = r'\x' + hashutil.hash_to_hex(id) worker = self.__create_worker() self.assertEqual(worker.need_archival(id, self.storage_data), True) @istest def content_sorting_by_archiver(self): """ Check that the content is correctly sorted. """ batch = { 'id1': { 'present': [('slave1', 'slave1_url')], 'missing': [] }, 'id2': { 'present': [], 'missing': [('slave1', 'slave1_url')] } } worker = self.__create_worker(batch=batch) mapping = worker.sort_content_by_archive() self.assertNotIn('id1', mapping[('slave1', 'slave1_url')]) self.assertIn('id2', mapping[('slave1', 'slave1_url')]) diff --git a/swh/archiver/worker.py b/swh/archiver/worker.py index 8fda96b..a14476c 100644 --- a/swh/archiver/worker.py +++ b/swh/archiver/worker.py @@ -1,239 +1,244 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random import logging +from .storage import ArchiverStorage from .copier import ArchiverCopier from .. import get_storage from datetime import datetime logger = logging.getLogger() class ArchiverWorker(): """ Do the required backups on a given batch of contents. Process the content of a content batch in order to do the needed backups on the slaves servers. Attributes: batch: The content this worker has to archive, which is a dictionary that associates a content's sha1 id to the list of servers where the content is present or missing (see ArchiverDirector::get_unarchived_content). master_storage_args: The connection argument to initialize the master storage with the db connection url & the object storage path. slave_storages: A map that associates server_id to the remote server. config: Archiver_configuration. A dictionary that must contains the following keys. objstorage_path (string): the path of the objstorage of the master. batch_max_size (int): The number of content items that can be given to the same archiver worker. archival_max_age (int): Delay given to the worker to copy all the files in a given batch. retention_policy (int): Required number of copies for the content to be considered safe. asynchronous (boolean): Indicate whenever the archival should run in asynchronous mode or not. """ - def __init__(self, batch, master_storage_args, slave_storages, config): + def __init__(self, batch, archiver_args, master_storage_args, + slave_storages, config): """ Constructor of the ArchiverWorker class. Args: batch: A batch of content, which is a dictionary that associates a content's sha1 id to the list of servers where the content is present. + archiver_args: The archiver's arguments to establish connection to + db. master_storage_args: The master storage arguments. slave_storages: A map that associates server_id to the remote server. config: Archiver_configuration. A dictionary that must contains the following keys. 
objstorage_path (string): the path of the objstorage of the master. batch_max_size (int): The number of content items that can be given to the same archiver worker. archival_max_age (int): Delay given to the worker to copy all the files in a given batch. retention_policy (int): Required number of copies for the content to be considered safe. asynchronous (boolean): Indicate whenever the archival should run in asynchronous mode or not. """ self.batch = batch + self.archiver_storage = ArchiverStorage(archiver_args) self.master_storage = get_storage('local_storage', master_storage_args) self.slave_storages = slave_storages self.config = config def __choose_backup_servers(self, allowed_storage, backup_number): """ Choose the slave servers for archival. Choose the given amount of servers among those which don't already contain a copy of the content. Args: allowed_storage: servers when the content is not already present. backup_number (int): The number of servers we have to choose in order to fullfill the objective. """ # In case there is not enough backup servers to get all the backups # we need, just do our best. # TODO such situation can only be caused by an incorrect configuration # setting. Do a verification previously. backup_number = min(backup_number, len(allowed_storage)) # TODO Find a better (or a good) policy to choose the backup servers. # The random choice should be equivalently distributed between # servers for a great amount of data, but don't take care of servers # capacities. return random.sample(allowed_storage, backup_number) def __get_archival_status(self, content_id, server): """ Get the archival status of the required content. Attributes: content_id (string): Sha1 of the content. server: Tuple (archive_id, archive_url) of the archive server. Returns: A dictionary that contains all the required data : 'content_id', 'archive_id', 'status', and 'mtime' """ t, = list( - self.master_storage.db.content_archive_get(content_id, server[0]) + self.archiver_storage.content_archive_get(content_id, server[0]) ) return { 'content_id': t[0], 'archive_id': t[1], 'status': t[2], 'mtime': t[3] } def __content_archive_update(self, content_id, archive_id, new_status=None): """ Update the status of a archive content and set it's mtime to now() Change the last modification time of an archived content and change its status to the given one. Args: content_id (string): The content id. archive_id (string): The id of the concerned archive. new_status (string): One of missing, ongoing or present, this status will replace the previous one. If not given, the function only changes the mtime of the content. """ - self.master_storage.db.content_archive_update( + self.archiver_storage.content_archive_update( content_id, archive_id, new_status ) def need_archival(self, content, destination): """ Indicates whenever a content need archivage. Filter function that returns True if a given content still require to be archived. Args: content (str): Sha1 of a content. destination: Tuple (archive id, archive url). """ archival_status = self.__get_archival_status( content, destination ) status = archival_status['status'] mtime = archival_status['mtime'] # If the archive is already present, no need to backup. if status == 'present': return False # If the content is ongoing but still have time, there is # another worker working on this content. 
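        # An 'ongoing' copy is only trusted while it is younger than
        # 'archival_max_age'; past that delay it is assumed to have failed and
        # the content is scheduled for archival again.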
elif status == 'ongoing': mtime = mtime.replace(tzinfo=None) elapsed = (datetime.now() - mtime).total_seconds() if elapsed <= self.config['archival_max_age']: return False return True def sort_content_by_archive(self): """ Create a map {archive_server -> list of content) Create a mapping that associate to a archive server all the contents that needs to be archived in it by the current worker. The map is in the form of : { (archive_1, archive_1_url): [content1, content2, content_3] (archive_2, archive_2_url): [content1, content3] } Returns: The created mapping. """ slaves_copy = {} for content_id in self.batch: # Choose some servers to upload the content among the missing ones. server_data = self.batch[content_id] nb_present = len(server_data['present']) nb_backup = self.config['retention_policy'] - nb_present backup_servers = self.__choose_backup_servers( server_data['missing'], nb_backup ) # Fill the map destination -> content to upload for server in backup_servers: slaves_copy.setdefault(server, []).append(content_id) return slaves_copy def run(self): """ Do the task expected from the archiver worker. Process the content in the batch, ensure that the elements still need an archival, and spawn copiers to copy files in each destinations. """ # Get a map (archive -> [contents]) slaves_copy = self.sort_content_by_archive() # At this point, re-check the archival status in order to know if the # job have been done by another worker. for destination in slaves_copy: # list() is needed because filter's result will be consumed twice. slaves_copy[destination] = list(filter( lambda content_id: self.need_archival(content_id, destination), slaves_copy[destination] )) for content_id in slaves_copy[destination]: self.__content_archive_update(content_id, destination[0], new_status='ongoing') # Spawn a copier for each destination for destination in slaves_copy: try: self.run_copier(destination, slaves_copy[destination]) except: logger.error('Unable to copy a batch to %s' % destination) def run_copier(self, destination, contents): """ Run a copier in order to archive the given contents Upload the given contents to the given archive. If the process fail, the whole content is considered uncopied and remains 'ongoing', waiting to be rescheduled as there is a delay. Attributes: destination: Tuple (archive_id, archive_url) of the destination. contents: List of contents to archive. """ ac = ArchiverCopier(destination, contents, self.master_storage) if ac.run(): # Once the archival complete, update the database. for content_id in contents: self.__content_archive_update(content_id, destination[0], new_status='present')
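For reviewers who want to exercise the change locally, here is a minimal usage sketch of how the director is now wired up with the archiver database separated from the storage database. The connection strings, objstorage path and import path (swh.archiver.director) are illustrative assumptions taken from the development setup above, not part of the patch.

from swh.archiver.director import ArchiverDirector

# Illustrative configuration; keys mirror DEFAULT_CONFIG in director.py.
config = {
    'objstorage_path': '/tmp/swh-storage/objects',  # master objstorage path
    'batch_max_size': 50,       # contents handed to a single worker
    'archival_max_age': 3600,   # seconds before an 'ongoing' copy is retried
    'retention_policy': 2,      # required number of copies per content
    'asynchronous': False,      # process batches in-process instead of queueing Celery tasks
}

# One connection per database: the archiver database (archive and
# content_archive tables) and the main storage database.
director = ArchiverDirector(db_conn_archiver='service=swh-archiver-dev',
                            db_conn_storage='service=swh-dev',
                            config=config)
director.run()

This mirrors the setup used by swh/archiver/tests/manual_test_archiver.py above.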