diff --git a/sql/archiver/swh-archiver-data.sql b/sql/archiver/swh-archiver-data.sql
index e93e1f19..38364e09 100644
--- a/sql/archiver/swh-archiver-data.sql
+++ b/sql/archiver/swh-archiver-data.sql
@@ -1,2 +1,2 @@
-INSERT INTO archive(id, url)
-VALUES('banco', 'http://banco.softwareheritage.org:5003/');
+INSERT INTO archive(id) VALUES('banco');
+INSERT INTO archive(id) VALUES('azure');
diff --git a/sql/archiver/swh-archiver-schema.sql b/sql/archiver/swh-archiver-schema.sql
index d4e340a2..2e31787e 100644
--- a/sql/archiver/swh-archiver-schema.sql
+++ b/sql/archiver/swh-archiver-schema.sql
@@ -1,67 +1,59 @@
 -- In order to archive the content of the object storage, add
 -- some tables to keep trace of what have already been archived.

 create table dbversion
 (
   version     int primary key,
   release     timestamptz,
   description text
 );

 comment on table dbversion is 'Schema update tracking';

 INSERT INTO dbversion(version, release, description)
-VALUES(3, now(), 'Work In Progress');
-
-CREATE TYPE archive_id AS ENUM (
-  'uffizi',
-  'banco'
-);
+VALUES(4, now(), 'Work In Progress');

 CREATE TABLE archive (
-  id   archive_id PRIMARY KEY,
-  url  TEXT
+  id   text PRIMARY KEY
 );

 comment on table archive is 'Possible archives';
 comment on column archive.id is 'Short identifier for the archive';
-comment on column archive.url is 'Url identifying the archiver api';

 CREATE TYPE archive_status AS ENUM (
   'missing',
   'ongoing',
   'present',
   'corrupted'
 );

 comment on type archive_status is 'Status of a given archive';

 -- a SHA1 checksum (not necessarily originating from Git)
 CREATE DOMAIN sha1 AS bytea CHECK (LENGTH(VALUE) = 20);

 CREATE TABLE content_archive (
   content_id  sha1 primary key,
   copies      jsonb,
   num_present int default null
 );

 create index on content_archive(num_present);

 comment on table content_archive is 'Referencing the status and whereabouts of a content';
 comment on column content_archive.content_id is 'content identifier';
 comment on column content_archive.copies is 'map archive_id -> { "status": archive_status, "mtime": epoch timestamp }';
 comment on column content_archive.num_present is 'Number of copies marked as present (cache updated via trigger)';

 -- Keep the num_copies cache updated
 CREATE FUNCTION update_num_present() RETURNS TRIGGER AS $$
     BEGIN
         NEW.num_present := (select count(*) from jsonb_each(NEW.copies) where value->>'status' = 'present');
         RETURN new;
     END;
 $$ LANGUAGE PLPGSQL;

 CREATE TRIGGER update_num_present
 BEFORE INSERT OR UPDATE OF copies ON content_archive
   FOR EACH ROW
   EXECUTE PROCEDURE update_num_present();
-
diff --git a/sql/archiver/upgrades/004.sql b/sql/archiver/upgrades/004.sql
new file mode 100644
index 00000000..c6f9bcf1
--- /dev/null
+++ b/sql/archiver/upgrades/004.sql
@@ -0,0 +1,12 @@
+-- SWH DB schema upgrade
+-- from_version: 3
+-- to_version: 4
+-- description: Add azure instance
+
+INSERT INTO dbversion(version, release, description)
+VALUES(4, now(), 'Work In Progress');
+
+ALTER TABLE archive DROP COLUMN url;
+ALTER TABLE archive ALTER COLUMN id SET DATA TYPE TEXT;
+
+INSERT INTO archive(id) VALUES ('azure');
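Note on the schema above: copies is a jsonb map from archive id to {"status": ..., "mtime": ...}, and num_present is a cache column filled in by the update_num_present trigger rather than by clients. Below is a minimal sketch (not part of the patch) of how that interaction can be observed with psycopg2; the database name and the all-zero content id are placeholders.

# Sketch only: assumes an archiver database loaded with swh-archiver-schema.sql.
import json

import psycopg2

conn = psycopg2.connect(dbname='softwareheritage-archiver-dev')  # placeholder DSN
cur = conn.cursor()

copies = {
    'uffizi': {'status': 'present', 'mtime': 1467900000},
    'banco': {'status': 'ongoing', 'mtime': 1467900000},
    'azure': {'status': 'missing', 'mtime': 1467900000},
}
cur.execute("INSERT INTO content_archive(content_id, copies) VALUES (%s, %s)",
            (b'\x00' * 20, json.dumps(copies)))  # dummy 20-byte sha1

# num_present is computed by the BEFORE INSERT/UPDATE trigger, not by the client.
cur.execute("SELECT num_present FROM content_archive WHERE content_id = %s",
            (b'\x00' * 20,))
print(cur.fetchone()[0])  # 1: only 'uffizi' is marked 'present'

conn.rollback()  # keep the sample row out of the database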
diff --git a/swh/storage/archiver/db.py b/swh/storage/archiver/db.py
index a55929e6..d7985b81 100644
--- a/swh/storage/archiver/db.py
+++ b/swh/storage/archiver/db.py
@@ -1,205 +1,203 @@
 # Copyright (C) 2015-2016  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import time

 from swh.storage.db import BaseDb, cursor_to_bytes


 class ArchiverDb(BaseDb):
     """Proxy to the SWH's archiver DB

     """

     def archive_ls(self, cur=None):
         """ Get all the archives registered on the server.

         Yields:
             a tuple (server_id, server_url) for each archive server.
         """
         cur = self._cursor(cur)
-        cur.execute("""SELECT id, url
-                       FROM archive
-                    """)
+        cur.execute("SELECT * FROM archive")
         yield from cursor_to_bytes(cur)

     def content_archive_get(self, content_id, cur=None):
         """ Get the archival status of a content in a specific server.

         Retrieve from the database the archival status of the given content
         in the given archive server.

         Args:
             content_id: the sha1 of the content.

         Yields:
             A tuple (content_id, present_copies, ongoing_copies), where
             ongoing_copies is a dict mapping copy to mtime.
         """
         query = """SELECT content_id,
                           array(
                             SELECT key
                             FROM jsonb_each(copies)
                             WHERE value->>'status' = 'present'
                             ORDER BY key
                           ) AS present,
                           array(
                             SELECT key
                             FROM jsonb_each(copies)
                             WHERE value->>'status' = 'ongoing'
                             ORDER BY key
                           ) AS ongoing,
                           array(
                             SELECT value->'mtime'
                             FROM jsonb_each(copies)
                             WHERE value->>'status' = 'ongoing'
                             ORDER BY key
                           ) AS ongoing_mtime
                    FROM content_archive
                    WHERE content_id = %s
                    ORDER BY content_id
                    """
         cur = self._cursor(cur)
         cur.execute(query, (content_id,))
         content_id, present, ongoing, mtimes = cur.fetchone()
         return (content_id, present, dict(zip(ongoing, mtimes)))

     def content_archive_get_copies(self, last_content=None, limit=1000,
                                    cur=None):
         """Get the list of copies for `limit` contents starting after
            `last_content`.

         Args:
             last_content: sha1 of the last content retrieved. May be None
                           to start at the beginning.
             limit: number of contents to retrieve. Can be None to retrieve all
                    objects (will be slow).

         Yields:
             A tuple (content_id, present_copies, ongoing_copies), where
             ongoing_copies is a dict mapping copy to mtime.

         """
         query = """SELECT content_id,
                           array(
                             SELECT key
                             FROM jsonb_each(copies)
                             WHERE value->>'status' = 'present'
                             ORDER BY key
                           ) AS present,
                           array(
                             SELECT key
                             FROM jsonb_each(copies)
                             WHERE value->>'status' = 'ongoing'
                             ORDER BY key
                           ) AS ongoing,
                           array(
                             SELECT value->'mtime'
                             FROM jsonb_each(copies)
                             WHERE value->>'status' = 'ongoing'
                             ORDER BY key
                           ) AS ongoing_mtime
                    FROM content_archive
                    WHERE content_id > %s
                    ORDER BY content_id
                    LIMIT %s
                    """
         if last_content is None:
             last_content = b''
         cur = self._cursor(cur)
         cur.execute(query, (last_content, limit))
         for content_id, present, ongoing, mtimes in cursor_to_bytes(cur):
             yield (content_id, present, dict(zip(ongoing, mtimes)))

     def content_archive_get_unarchived_copies(
             self, retention_policy, last_content=None,
             limit=1000, cur=None):
         """ Get the list of copies for `limit` contents starting after
             `last_content`. Yields only copies with number of present
             smaller than `retention policy`.

         Args:
             last_content: sha1 of the last content retrieved. May be None
                           to start at the beginning.
             retention_policy: number of required present copies
             limit: number of contents to retrieve. Can be None to retrieve all
                    objects (will be slow).

         Yields:
             A tuple (content_id, present_copies, ongoing_copies), where
             ongoing_copies is a dict mapping copy to mtime.

""" query = """SELECT content_id, array( SELECT key FROM jsonb_each(copies) WHERE value->>'status' = 'present' ORDER BY key ) AS present, array( SELECT key FROM jsonb_each(copies) WHERE value->>'status' = 'ongoing' ORDER BY key ) AS ongoing, array( SELECT value->'mtime' FROM jsonb_each(copies) WHERE value->>'status' = 'ongoing' ORDER BY key ) AS ongoing_mtime FROM content_archive WHERE content_id > %s AND num_present < %s ORDER BY content_id LIMIT %s """ if last_content is None: last_content = b'' cur = self._cursor(cur) cur.execute(query, (last_content, retention_policy, limit)) for content_id, present, ongoing, mtimes in cursor_to_bytes(cur): yield (content_id, present, dict(zip(ongoing, mtimes))) def content_archive_update(self, content_id, archive_id, new_status=None, cur=None): """ Update the status of an archive content and set its mtime to Change the mtime of an archived content for the given archive and set it's mtime to the current time. Args: content_id (str): content sha1 archive_id (str): name of the archive new_status (str): one of 'missing', 'present' or 'ongoing'. this status will replace the previous one. If not given, the function only change the mtime of the content for the given archive. """ if new_status is not None: query = """UPDATE content_archive SET copies=jsonb_set( copies, '{%s}', '{"status":"%s", "mtime":%d}' ) WHERE content_id='%s' """ % (archive_id, new_status, int(time.time()), content_id) else: query = """ UPDATE content_archive SET copies=jsonb_set(copies, '{%s,mtime}', '%d') WHERE content_id='%s' """ % (archive_id, int(time.time())) cur = self._cursor(cur) cur.execute(query) diff --git a/swh/storage/tests/test_archiver.py b/swh/storage/tests/test_archiver.py index c4f88e6f..7eac329f 100644 --- a/swh/storage/tests/test_archiver.py +++ b/swh/storage/tests/test_archiver.py @@ -1,296 +1,282 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest import os import time import json from nose.tools import istest from nose.plugins.attrib import attr from swh.core import hashutil from swh.core.tests.db_testing import DbsTestFixture from server_testing import ServerTestFixture from swh.storage.archiver import ArchiverWithRetentionPolicyDirector from swh.storage.archiver import ArchiverWithRetentionPolicyWorker from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.objstorage.api.server import app TEST_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DATA_DIR = os.path.join(TEST_DIR, '../../../../swh-storage-testdata') @attr('db') class TestArchiver(DbsTestFixture, ServerTestFixture, unittest.TestCase): """ Test the objstorage archiver. 
""" TEST_DB_NAMES = [ 'softwareheritage-archiver-test', ] TEST_DB_DUMPS = [ os.path.join(TEST_DATA_DIR, 'dumps/swh-archiver.dump'), ] TEST_DB_DUMP_TYPES = [ 'pg_dump', ] def setUp(self): # Launch the backup server dest_root = tempfile.mkdtemp(prefix='remote') self.config = { 'storage_base': dest_root, 'storage_slicing': '0:2/2:4/4:6' } self.app = app super().setUp() # Retrieve connection (depends on the order in TEST_DB_NAMES) self.conn = self.conns[0] # archiver db's connection self.cursor = self.cursors[0] # Create source storage src_root = tempfile.mkdtemp() src_config = {'cls': 'pathslicing', 'args': {'root': src_root, 'slicing': '0:2/2:4/4:6'}} self.src_storage = get_objstorage(**src_config) # Create destination storage dest_config = {'cls': 'remote', 'args': {'base_url': self.url()}} self.dest_storage = get_objstorage(**dest_config) # Keep mapped the id to the storages self.storages = {'uffizi': self.src_storage, 'banco': self.dest_storage} # Override configurations src_archiver_conf = {'host': 'uffizi'} dest_archiver_conf = {'host': 'banco'} src_archiver_conf.update(src_config) dest_archiver_conf.update(dest_config) self.archiver_storages = [src_archiver_conf, dest_archiver_conf] self._override_director_config() self._override_worker_config() # Create the base archiver self.archiver = self._create_director() - # Initializes and fill the tables. - self.initialize_tables() - def tearDown(self): self.empty_tables() super().tearDown() - def initialize_tables(self): - """ Initializes the database with a sample of items. - """ - # Add an archive (update existing one for technical reason, - # altering enum cannot run in a transaction...) - self.cursor.execute("""UPDATE archive - SET url='{}' - WHERE id='banco' - """.format(self.url())) - self.conn.commit() - def empty_tables(self): # Remove all content self.cursor.execute('DELETE FROM content_archive') self.conn.commit() def _override_director_config(self, retention_policy=2): """ Override the default config of the Archiver director to allow the tests to use the *-test db instead of the default one as there is no configuration file for now. """ ArchiverWithRetentionPolicyDirector.parse_config_file = lambda obj, additional_configs: { # noqa 'dbconn': self.conn, 'batch_max_size': 5000, 'archival_max_age': 3600, 'retention_policy': retention_policy, 'asynchronous': False } def _override_worker_config(self): """ Override the default config of the Archiver worker to allow the tests to use the *-test db instead of the default one as there is no configuration file for now. """ ArchiverWithRetentionPolicyWorker.parse_config_file = lambda obj, additional_configs: { # noqa 'retention_policy': 2, 'archival_max_age': 3600, 'dbconn': self.conn, 'storages': self.archiver_storages } def _create_director(self): return ArchiverWithRetentionPolicyDirector() def _create_worker(self, batch={}): return ArchiverWithRetentionPolicyWorker(batch) def _add_content(self, storage_name, content_data): """ Add really a content to the given objstorage This put an empty status for the added content. """ # Add the content to the storage obj_id = self.storages[storage_name].add(content_data) db_obj_id = r'\x' + hashutil.hash_to_hex(obj_id) self.cursor.execute(""" INSERT INTO content_archive VALUES('%s', '{}') """ % (db_obj_id)) return obj_id def _update_status(self, obj_id, storage_name, status, date=None): """ Update the db status for the given id/storage_name. This does not create the content in the storage. 
""" db_obj_id = r'\x' + hashutil.hash_to_hex(obj_id) self.archiver.archiver_storage.content_archive_update( db_obj_id, storage_name, status ) def _add_dated_content(self, obj_id, copies={}): """ Fully erase the previous copies field for the given content id This does not alter the contents into the objstorages. """ db_obj_id = r'\x' + hashutil.hash_to_hex(obj_id) self.cursor.execute(""" UPDATE TABLE content_archive SET copies='%s' WHERE content_id='%s' """ % (json.dumps(copies), db_obj_id)) # Integration test @istest def archive_missing_content(self): """ Run archiver on a missing content should archive it. """ obj_data = b'archive_missing_content' obj_id = self._add_content('uffizi', obj_data) self._update_status(obj_id, 'uffizi', 'present') # Content is missing on banco (entry not present in the db) try: self.dest_storage.get(obj_id) except ObjNotFoundError: pass else: self.fail('Content should not be present before archival') self.archiver.run() # now the content should be present on remote objstorage remote_data = self.dest_storage.get(obj_id) self.assertEquals(obj_data, remote_data) @istest def archive_present_content(self): """ A content that is not 'missing' shouldn't be archived. """ obj_id = self._add_content('uffizi', b'archive_present_content') self._update_status(obj_id, 'uffizi', 'present') self._update_status(obj_id, 'banco', 'present') # After the run, the content should NOT be in the archive. # As the archiver believe it was already in. self.archiver.run() with self.assertRaises(ObjNotFoundError): self.dest_storage.get(obj_id) @istest def archive_already_enough(self): """ A content missing with enough copies shouldn't be archived. """ obj_id = self._add_content('uffizi', b'archive_alread_enough') self._update_status(obj_id, 'uffizi', 'present') self._override_director_config(retention_policy=1) director = self._create_director() # Obj is present in only one archive but only one copy is required. director.run() with self.assertRaises(ObjNotFoundError): self.dest_storage.get(obj_id) # Unit tests for archive worker def archival_elapsed(self, mtime): return self._create_worker()._is_archival_delay_elapsed(mtime) @istest def vstatus_ongoing_remaining(self): self.assertFalse(self.archival_elapsed(time.time())) @istest def vstatus_ongoing_elapsed(self): past_time = ( time.time() - self._create_worker().archival_max_age ) self.assertTrue(self.archival_elapsed(past_time)) def _status(self, status, mtime=None): """ Get a dict that match the copies structure """ return {'status': status, 'mtime': mtime or time.time()} @istest def need_archival_missing(self): """ A content should need archival when it is missing. 
""" status_copies = {'present': ['uffizi'], 'missing': ['banco']} worker = self._create_worker() self.assertEqual(worker.need_archival(status_copies), True) @istest def need_archival_present(self): """ A content present everywhere shouldn't need archival """ status_copies = {'present': ['uffizi', 'banco']} worker = self._create_worker() self.assertEqual(worker.need_archival(status_copies), False) def _compute_copies_status(self, status): """ A content with a given status should be detected correctly """ obj_id = self._add_content( 'banco', b'compute_copies_' + bytes(status, 'utf8')) self._update_status(obj_id, 'banco', status) worker = self._create_worker() self.assertIn('banco', worker.compute_copies( set(worker.objstorages), obj_id)[status]) @istest def compute_copies_present(self): """ A present content should be detected with correct status """ self._compute_copies_status('present') @istest def compute_copies_missing(self): """ A missing content should be detected with correct status """ self._compute_copies_status('missing') def _get_backups(self, present, missing): """ Return a list of the pair src/dest from the present and missing """ worker = self._create_worker() return list(worker.choose_backup_servers(present, missing)) @istest def choose_backup_servers(self): self.assertEqual(len(self._get_backups(['uffizi', 'banco'], [])), 0) self.assertEqual(len(self._get_backups(['uffizi'], ['banco'])), 1) # Even with more possible destinations, do not take more than the # retention_policy require self.assertEqual( len(self._get_backups(['uffizi'], ['banco', 's3'])), 1 )