No OneTemporary
Actions

Size

35 KB

Subscribers

None

View Options

	diff --git a/swh/storage/archiver/db.py b/swh/storage/archiver/db.py
	index b1156aef..a18aabee 100644
	--- a/swh/storage/archiver/db.py
	+++ b/swh/storage/archiver/db.py
	@@ -1,257 +1,260 @@
	# Copyright (C) 2015-2016 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information


	import time

	from swh.core import hashutil
	from swh.storage.db import BaseDb, cursor_to_bytes, stored_procedure


	class ArchiverDb(BaseDb):
	"""Proxy to the SWH's archiver DB

	"""

	def archive_ls(self, cur=None):
	    """List the archives registered on the server.

	    Args:
	        cur: optional cursor, handed to self._cursor().

	    Yields:
	        a tuple (server_id, server_url) for each archive server.
	    """
	    cursor = self._cursor(cur)
	    cursor.execute("SELECT * FROM archive")
	    for row in cursor_to_bytes(cursor):
	        yield row

	def content_archive_get(self, content_id, cur=None):
	    """Look up the archival status of a single content.

	    Args:
	        content_id: the sha1 of the content.
	        cur: optional cursor, handed to self._cursor().

	    Returns:
	        None when no row matches content_id. Otherwise a tuple
	        (content_id, present_copies, ongoing_copies), where
	        ongoing_copies is a dict mapping copy to mtime.
	    """
	    query = """SELECT content_id,
	array(
	SELECT key
	FROM jsonb_each(copies)
	WHERE value->>'status' = 'present'
	ORDER BY key
	) AS present,
	array(
	SELECT key
	FROM jsonb_each(copies)
	WHERE value->>'status' = 'ongoing'
	ORDER BY key
	) AS ongoing,
	array(
	SELECT value->'mtime'
	FROM jsonb_each(copies)
	WHERE value->>'status' = 'ongoing'
	ORDER BY key
	) AS ongoing_mtime
	FROM content_archive
	WHERE content_id = %s
	ORDER BY content_id
	"""
	    cursor = self._cursor(cur)
	    cursor.execute(query, (content_id,))
	    row = cursor.fetchone()
	    if row:
	        # Both 'ongoing' arrays are ordered by key, so zipping pairs
	        # each copy name with its own mtime.
	        found_id, present, ongoing, mtimes = row
	        return (found_id, present, dict(zip(ongoing, mtimes)))
	    return None

	def content_archive_get_copies(self, last_content=None, limit=1000,
	                               cur=None):
	    """Page through the copies of the contents located after `last_content`.

	    Args:
	        last_content: sha1 of the last content retrieved. None starts
	            from the beginning (it is replaced by an empty byte string).
	        limit: number of contents to retrieve; passed as the LIMIT of
	            the query.
	        cur: optional cursor, handed to self._cursor().

	    Yields:
	        A tuple (content_id, present_copies, ongoing_copies), where
	        ongoing_copies is a dict mapping copy to mtime.

	    """
	    query = """SELECT content_id,
	array(
	SELECT key
	FROM jsonb_each(copies)
	WHERE value->>'status' = 'present'
	ORDER BY key
	) AS present,
	array(
	SELECT key
	FROM jsonb_each(copies)
	WHERE value->>'status' = 'ongoing'
	ORDER BY key
	) AS ongoing,
	array(
	SELECT value->'mtime'
	FROM jsonb_each(copies)
	WHERE value->>'status' = 'ongoing'
	ORDER BY key
	) AS ongoing_mtime
	FROM content_archive
	WHERE content_id > %s
	ORDER BY content_id
	LIMIT %s
	"""
	    start_after = b'' if last_content is None else last_content

	    cursor = self._cursor(cur)
	    cursor.execute(query, (start_after, limit))
	    for row in cursor_to_bytes(cursor):
	        content_id, present, ongoing, mtimes = row
	        yield (content_id, present, dict(zip(ongoing, mtimes)))

	def content_archive_get_unarchived_copies(
	        self, retention_policy, last_content=None,
	        limit=1000, cur=None):
	    """Page through the copies of under-archived contents.

	    Same as paging over all contents after `last_content`, restricted to
	    rows whose num_present is smaller than `retention_policy`.

	    Args:
	        retention_policy: number of required present copies.
	        last_content: sha1 of the last content retrieved. None starts
	            from the beginning (it is replaced by an empty byte string).
	        limit: number of contents to retrieve; passed as the LIMIT of
	            the query.
	        cur: optional cursor, handed to self._cursor().

	    Yields:
	        A tuple (content_id, present_copies, ongoing_copies), where
	        ongoing_copies is a dict mapping copy to mtime.

	    """
	    query = """SELECT content_id,
	array(
	SELECT key
	FROM jsonb_each(copies)
	WHERE value->>'status' = 'present'
	ORDER BY key
	) AS present,
	array(
	SELECT key
	FROM jsonb_each(copies)
	WHERE value->>'status' = 'ongoing'
	ORDER BY key
	) AS ongoing,
	array(
	SELECT value->'mtime'
	FROM jsonb_each(copies)
	WHERE value->>'status' = 'ongoing'
	ORDER BY key
	) AS ongoing_mtime
	FROM content_archive
	WHERE content_id > %s AND num_present < %s
	ORDER BY content_id
	LIMIT %s
	"""
	    start_after = b'' if last_content is None else last_content

	    cursor = self._cursor(cur)
	    cursor.execute(query, (start_after, retention_policy, limit))
	    for row in cursor_to_bytes(cursor):
	        content_id, present, ongoing, mtimes = row
	        yield (content_id, present, dict(zip(ongoing, mtimes)))

	@stored_procedure('swh_mktemp_content_archive')
	def mktemp_content_archive(self, cur=None):
	"""Trigger the creation of the temporary table tmp_content_archive
	during the lifetime of the transaction.

	The Python body is empty: the work is presumably done by the
	stored_procedure decorator calling 'swh_mktemp_content_archive'
	(decorator not visible here -- confirm).

	Use from archiver.storage module:
	self.db.mktemp_content_archive()
	# copy data over to the temp table
	self.db.copy_to([{'colname': id0}, {'colname': id1}],
	'tmp_cache_content',
	['colname'], cur)

	NOTE(review): the example copies into 'tmp_cache_content' while the
	summary names 'tmp_content_archive'; verify which table is meant.

	"""
	pass

	def content_archive_get_missing(self, backend_name, cur=None):
	    """Yield the rows of swh_content_archive_missing() for backend_name.

	    Args:
	        backend_name: name of the backend passed to the SQL function.
	        cur: optional cursor, handed to self._cursor().

	    """
	    cursor = self._cursor(cur)
	    params = (backend_name,)
	    cursor.execute("select * from swh_content_archive_missing(%s)", params)
	    for row in cursor_to_bytes(cursor):
	        yield row

	def content_archive_get_unknown(self, cur=None):
	    """Yield the rows of swh_content_archive_unknown().

	    Args:
	        cur: optional cursor, handed to self._cursor().

	    """
	    cursor = self._cursor(cur)
	    cursor.execute('select * from swh_content_archive_unknown()')
	    for row in cursor_to_bytes(cursor):
	        yield row

	def content_archive_insert(self, content_id, source, status, cur=None):
	    """Insert a new content_archive row for content_id.

	    The row records one copy (`source`) with the given status and the
	    current time as mtime; num_present is always inserted as 1.

	    Args:
	        content_id: content concerned; bytes are rendered as a '\\x...'
	            hexadecimal literal, any other value is interpolated as is.
	        source: name of the source
	        status: the status of the content for that source
	        cur: optional cursor, handed to self._cursor().

	    """
	    if isinstance(content_id, bytes):
	        content_id = '\\x%s' % hashutil.hash_to_hex(content_id)

	    now = int(time.time())
	    template = """INSERT INTO content_archive(content_id, copies, num_present)
	VALUES('%s', '{"%s": {"status": "%s", "mtime": %d}}', 1)
	"""
	    statement = template % (content_id, source, status, now)
	    cursor = self._cursor(cur)
	    cursor.execute(statement)

	def content_archive_update(self, content_id, archive_id,
	                           new_status=None, cur=None):
	    """Update the status of an archived content and set its mtime to now.

	    When `new_status` is given, the whole entry of `archive_id` in the
	    copies column is replaced by that status and the current time. When
	    it is omitted, only the mtime of that entry is refreshed.

	    Args:
	        content_id: content sha1, either as bytes (rendered as a
	            '\\x...' hexadecimal literal) or as an already formatted
	            string.
	        archive_id (str): name of the archive
	        new_status (str): one of 'missing', 'present' or 'ongoing'.
	            this status will replace the previous one. If not given,
	            the function only change the mtime of the content for the
	            given archive.
	        cur: optional cursor, handed to self._cursor().
	    """
	    if isinstance(content_id, bytes):
	        content_id = '\\x%s' % hashutil.hash_to_hex(content_id)

	    now = int(time.time())
	    if new_status is not None:
	        query = """UPDATE content_archive
	                   SET copies=jsonb_set(
	                       copies, '{%s}',
	                       '{"status":"%s", "mtime":%d}'
	                   )
	                   WHERE content_id='%s'
	                """ % (archive_id, new_status, now, content_id)
	    else:
	        # The template has three placeholders. content_id used to be
	        # left out of the tuple, so this branch raised TypeError
	        # ("not enough arguments for format string") before reaching
	        # the database.
	        query = """UPDATE content_archive
	                   SET copies=jsonb_set(copies, '{%s,mtime}', '%d')
	                   WHERE content_id='%s'
	                """ % (archive_id, now, content_id)

	    cur = self._cursor(cur)
	    cur.execute(query)
	diff --git a/swh/storage/archiver/director.py b/swh/storage/archiver/director.py
	index b6dfebe3..facd033b 100644
	--- a/swh/storage/archiver/director.py
	+++ b/swh/storage/archiver/director.py
	@@ -1,229 +1,285 @@
	# Copyright (C) 2015-2016 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	import abc
	import click
	import sys

	from swh.core import config, utils, hashutil
	+from swh.objstorage import get_objstorage
	from swh.scheduler.celery_backend.config import app

	from . import tasks # noqa
	from .storage import ArchiverStorage


	class ArchiverDirectorBase(config.SWHConfig, metaclass=abc.ABCMeta):
	"""Abstract Director class

	An archiver director is in charge of dispatching batch of
	contents to archiver workers (for them to archive).

	Inherit from this class and provide:
	- ADDITIONAL_CONFIG: Some added configuration needed for the
	director to work
	- CONFIG_BASE_FILENAME: relative path to lookup for the
	configuration file
	- def get_contents_to_archive(self): Implementation method to read
	contents to archive

	"""
	DEFAULT_CONFIG = {
	'batch_max_size': ('int', 1500),
	'asynchronous': ('bool', True),

	'dbconn': ('str', 'dbname=softwareheritage-archiver-dev user=guest')
	}

	# Destined to be overridden by subclass
	ADDITIONAL_CONFIG = {}

	# We use the same configuration file as the worker
	CONFIG_BASE_FILENAME = 'archiver/worker'

	# The worker's task queue name to use
	TASK_NAME = None

	def __init__(self):
	    """Set up the archiver director.

	    Takes no argument: the configuration is read through
	    parse_config_file(), with ADDITIONAL_CONFIG passed as an additional
	    config, and an ArchiverStorage is created from its 'dbconn' entry.
	    """
	    super().__init__()
	    extra_configs = [self.ADDITIONAL_CONFIG]
	    self.config = self.parse_config_file(additional_configs=extra_configs)
	    dbconn = self.config['dbconn']
	    self.archiver_storage = ArchiverStorage(dbconn)

	def run(self):
	    """Run the archiver director.

	    Every batch produced by read_batch_contents() is handed to a worker,
	    asynchronously or synchronously depending on the 'asynchronous'
	    configuration entry.
	    """
	    dispatch = (self.run_async_worker if self.config['asynchronous']
	                else self.run_sync_worker)

	    for contents in self.read_batch_contents():
	        dispatch(contents)

	def run_async_worker(self, batch):
	    """Queue the batch on the task registered under TASK_NAME.

	    The task is looked up in app.tasks and scheduled through delay().
	    """
	    app.tasks[self.TASK_NAME].delay(batch=batch)

	def run_sync_worker(self, batch):
	    """Call the task registered under TASK_NAME directly on the batch.

	    The task is looked up in app.tasks and invoked in-process.
	    """
	    app.tasks[self.TASK_NAME](batch=batch)

	def read_batch_contents(self):
	    """Group the contents to archive into batches.

	    Contents come from get_contents_to_archive(). A batch is emitted as
	    soon as it holds 'batch_max_size' items, and a final, shorter batch
	    is emitted for the leftovers.

	    Yields:
	        lists of at most config['batch_max_size'] contents that need
	        more archive copies.
	    """
	    contents = []
	    for content in self.get_contents_to_archive():
	        contents.append(content)
	        # '>=' and not '>': the strict comparison let every batch grow
	        # to batch_max_size + 1 items.
	        if len(contents) >= self.config['batch_max_size']:
	            yield contents
	            contents = []
	    if contents:
	        yield contents

	@abc.abstractmethod
	def get_contents_to_archive(self):
	"""Retrieve generator of sha1 to archive

	Abstract: each director subclass provides its own source of
	contents. read_batch_contents() iterates over the result and groups
	the items into batches.

	Yields:
	sha1 to archive

	"""
	pass


	class ArchiverWithRetentionPolicyDirector(ArchiverDirectorBase):
	"""Process the files in order to know which one is needed as backup.

	The archiver director processes the files in the local storage in order
	to know which one needs archival and it delegates this task to
	archiver workers.
	"""

	ADDITIONAL_CONFIG = {
	'retention_policy': ('int', 2),
	}

	TASK_NAME = 'swh.storage.archiver.tasks.SWHArchiverWithRetentionPolicyTask'

	def get_contents_to_archive(self):
	    """Walk the archiver storage for contents lacking copies.

	    The storage is queried page after page through
	    content_archive_get_unarchived_copies(), each page starting after
	    the last content id already yielded, until a page comes back empty.

	    Yields:
	        the content_id of each (content_id, present_copies,
	        ongoing_copies) row returned by the storage.

	    """
	    fetch_page = self.archiver_storage.content_archive_get_unarchived_copies
	    last_content = None
	    while True:
	        page = list(fetch_page(
	            last_content=last_content,
	            retention_policy=self.config['retention_policy']))
	        if not page:
	            return
	        for row in page:
	            content_id, _present, _ongoing = row
	            last_content = content_id
	            yield content_id


	def read_sha1_from_stdin():
	    """Read hexadecimal sha1 from stdin, one per line.

	    Blank lines (for instance a trailing empty line) are skipped.
	    Previously they were converted like any other line and produced a
	    bogus content id built from an empty string.

	    Yields:
	        a dict {'content_id': <sha1>} per non-blank line, where the
	        sha1 is the result of hashutil.hex_to_hash() on the line with
	        trailing whitespace stripped.

	    """
	    for line in sys.stdin:
	        sha1 = line.rstrip()
	        if not sha1:
	            continue
	        yield {'content_id': hashutil.hex_to_hash(sha1)}


	class ArchiverStdinToBackendDirector(ArchiverDirectorBase):
	"""A cloud archiver director in charge of reading contents and send
	them in batch in the cloud.

	- The archiver director processes the files in the local storage in
	- order to know which one needs archival and it delegates this task
	- to archiver workers.
	+ The archiver director, in order:
	+ - Reads sha1 to send to a specific backend.
	+ - Checks if those sha1 are known in the archiver. If they are not,
	+ add them
	+ - if the sha1 are missing, they are sent for the worker to archive
	+
	+ If the flag force_copy is set, this will force the copy to be sent
	+ for archive even though it has already been done.

	"""
	ADDITIONAL_CONFIG = {
	'destination': ('str', 'azure'),
	'force_copy': ('bool', False),
	+ 'source': ('str', 'uffizi'),
	+ 'storages': ('list[dict]',
	+ [
	+ {'host': 'uffizi',
	+ 'cls': 'pathslicing',
	+ 'args': {'root': '/tmp/softwareheritage/objects',
	+ 'slicing': '0:2/2:4/4:6'}},
	+ {'host': 'banco',
	+ 'cls': 'remote',
	+ 'args': {'base_url': 'http://banco:5003/'}}
	+ ])
	}

	CONFIG_BASE_FILENAME = 'archiver/worker-to-backend'

	TASK_NAME = 'swh.storage.archiver.tasks.SWHArchiverToBackendTask'

	def __init__(self):
	    """Read destination, force_copy, storages and source from the config.

	    One objstorage is created per entry of the 'storages' configuration
	    and stored in self.objstorages under the entry's 'host'.
	    """
	    super().__init__()
	    self.destination = self.config['destination']
	    self.force_copy = self.config['force_copy']

	    objstorages = {}
	    for storage in self.config.get('storages', []):
	        host = storage['host']
	        objstorages[host] = get_objstorage(storage['cls'], storage['args'])
	    self.objstorages = objstorages

	    # Name of the objstorage used as the source of the contents
	    self.source = self.config['source']
	+
	def _add_unknown_content_ids(self, content_ids, source_objstorage):
	    """Register in the archiver db the content ids it does not know yet.

	    Ids reported by content_archive_get_unknown() are inserted as
	    'present' on self.source, but only when they are found in
	    source_objstorage. Each unknown id is also printed.

	    Args:
	        content_ids: List of dict with one key content_id

	        source_objstorage (ObjStorage): objstorage to check if
	            content_id is there

	    """
	    storage = self.archiver_storage
	    for unknown_id in storage.content_archive_get_unknown(content_ids):
	        print('unknown', unknown_id)
	        if unknown_id in source_objstorage:
	            storage.content_archive_insert(
	                unknown_id, self.source, 'present')

	def get_contents_to_archive(self):
	    """Yield the contents read from stdin that must be sent for archival.

	    The sha1 read from stdin are processed in groups of
	    'batch_max_size'. For every group, ids unknown to the archiver db
	    are registered first. Then:

	    - with force_copy, the status of each content on the destination is
	      forced to 'missing' and its content_id is yielded;
	    - otherwise only what content_archive_get_missing() reports for the
	      destination is yielded.
	    """
	    batches = utils.grouper(read_sha1_from_stdin(),
	                            self.config['batch_max_size'])

	    source_objstorage = self.objstorages[self.source]
	    force_copy = self.force_copy

	    for batch in batches:
	        content_ids = list(batch)

	        if force_copy:
	            if not content_ids:
	                continue

	            # Register ids the archiver table does not know yet
	            self._add_unknown_content_ids(content_ids, source_objstorage)

	            print('Send %s contents to archive' % len(content_ids))

	            for content in content_ids:
	                content_id = content['content_id']
	                # Mark the destination copy as missing so it is copied again
	                self.archiver_storage.content_archive_update(
	                    content_id, self.destination, 'missing')
	                yield content_id
	        else:
	            # Register ids the archiver table does not know yet
	            self._add_unknown_content_ids(content_ids, source_objstorage)

	            # Keep only what the destination does not hold yet
	            content_ids = list(
	                self.archiver_storage.content_archive_get_missing(
	                    content_ids=content_ids,
	                    backend_name=self.destination))

	            if not content_ids:
	                continue

	            print('Send %s contents to archive' % len(content_ids))

	            yield from content_ids


	@click.command()
	@click.option('--direct', is_flag=True,
	              help="""The archiver sends content for backup to
	one storage.""")
	def launch(direct):
	    # No docstring on purpose: click would display it as the command help.
	    # --direct selects the stdin-to-backend director, the default is the
	    # retention-policy director.
	    director_cls = (ArchiverStdinToBackendDirector if direct
	                    else ArchiverWithRetentionPolicyDirector)
	    director_cls().run()

	if __name__ == '__main__':
	launch()
	diff --git a/swh/storage/archiver/worker.py b/swh/storage/archiver/worker.py
	index 07129f96..85957419 100644
	--- a/swh/storage/archiver/worker.py
	+++ b/swh/storage/archiver/worker.py
	@@ -1,413 +1,370 @@
	# Copyright (C) 2015 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	import abc
	import logging
	import random
	import time

	from collections import defaultdict

	from swh.objstorage import get_objstorage

	from swh.core import hashutil, config

	from swh.objstorage.exc import Error, ObjNotFoundError

	from .storage import ArchiverStorage
	from .copier import ArchiverCopier


	logger = logging.getLogger('archiver.worker')


	class BaseArchiveWorker(config.SWHConfig, metaclass=abc.ABCMeta):
	"""Base archive worker.

	Inherit from this class and override:
	- ADDITIONAL_CONFIG: Some added configuration needed for the
	director to work
	- CONFIG_BASE_FILENAME: relative path to lookup for the
	configuration file
	- def need_archival(self, content_data): Determine if a content
	needs archival or not
	- def choose_backup_servers(self, present, missing): Choose
	which backup server to send copies to

	"""
	DEFAULT_CONFIG = {
	'dbconn': ('str', 'dbname=softwareheritage-archiver-dev'),
	- 'source': ('str', 'uffizi'),
	'storages': ('list[dict]',
	[
	{'host': 'uffizi',
	'cls': 'pathslicing',
	'args': {'root': '/tmp/softwareheritage/objects',
	'slicing': '0:2/2:4/4:6'}},
	{'host': 'banco',
	'cls': 'remote',
	'args': {'base_url': 'http://banco:5003/'}}
	])
	}

	ADDITIONAL_CONFIG = {}

	CONFIG_BASE_FILENAME = 'archiver/worker'

	objstorages = {}

	def __init__(self, batch):
	    """Set up the worker for the given batch.

	    Args:
	        batch: the contents this worker will process in run().

	    The configuration is read through parse_config_file(), an
	    ArchiverStorage is created from its 'dbconn' entry and one
	    objstorage is created per entry of its 'storages' list.
	    """
	    super().__init__()
	    extra_configs = [self.ADDITIONAL_CONFIG]
	    self.config = self.parse_config_file(additional_configs=extra_configs)
	    self.batch = batch
	    self.archiver_db = ArchiverStorage(self.config['dbconn'])

	    objstorages = {}
	    for storage in self.config.get('storages', []):
	        host = storage['host']
	        objstorages[host] = get_objstorage(storage['cls'], storage['args'])
	    self.objstorages = objstorages
	    # Set of the objstorage names, used to compute missing copies
	    self.set_objstorages = set(objstorages)

	def run(self):
	    """Do the task expected from the archiver worker.

	    For each content of self.batch, look up its copies in the archiver
	    db, skip it when it is unknown, needs no archival or has no present
	    copy, and otherwise group it by the (source, destination) pairs
	    chosen by the worker's policy. One copier is then run per pair.

	    """
	    transfers = defaultdict(list)
	    for obj_id in self.batch:
	        # Dict with 'present', 'missing' and 'ongoing' copies, or a
	        # false value when the archiver db does not know the content.
	        copies = self.compute_copies(self.set_objstorages, obj_id)
	        if not copies:  # could not happen if using .director module
	            logger.warning(
	                'Unknown content %s' % hashutil.hash_to_hex(obj_id))
	            continue

	        if not self.need_archival(copies):
	            continue

	        present = copies.get('present', [])
	        missing = copies.get('missing', [])
	        if len(present) == 0:
	            logger.critical(
	                'Lost content %s' % hashutil.hash_to_hex(obj_id))
	            continue

	        # Pick the (source, destination) pairs for this content.
	        for pair in self.choose_backup_servers(present, missing):
	            transfers[pair].append(obj_id)

	    # One copier per (source, destination) pair.
	    for pair, content_ids in transfers.items():
	        src, dest = pair
	        self.run_copier(src, dest, content_ids)

	- def compute_fallback_copies(self, source, set_objstorages, content_id):
	- """Compute fallback copies for content_id.
	-
	- Args:
	- source: the objstorage where the content_id is supposedly present
	- set_objstorages: the complete set of objstorages
	- content_id: the content concerned
	-
	- Returns:
	- A dictionary with keys 'present' and 'missing' that are
	- mapped to lists of copies ids depending on whenever the
	- content is present or missing on the copy.
	-
	- There is also the key 'ongoing' which is associated with a
	- dict that map to a copy name the mtime of the ongoing
	- status update.
	-
	- """
	- if content_id not in self.objstorages[source]:
	- return None
	-
	- # insert a new entry about of the content_id's presence for that source
	- self.archiver_db.content_archive_insert(
	- content_id=content_id, source=self.source, status='present')
	-
	- # Now compute the fallback copies
	- set_present = {self.source}
	- set_missing = set_objstorages - set_present
	- return {
	- 'present': set_present,
	- 'missing': set_missing,
	- 'ongoing': {}
	- }
	-
	def compute_copies(self, set_objstorages, content_id):
	    """From a content_id, return present, missing and ongoing copies.

	    Args:
	        set_objstorages (set): names of all the objstorages
	        content_id: the content concerned

	    Returns:
	        None when the archiver db has no entry for content_id.
	        Otherwise a dictionary with:

	        - 'present': set of the copies holding the content
	        - 'missing': set of the objstorages that are neither present
	          nor ongoing
	        - 'ongoing': the ongoing copies as returned by the archiver db
	          (a dict mapping copy name to mtime)

	    """
	    status = self.archiver_db.content_archive_get(content_id)
	    if not status:
	        return None

	    _, present, ongoing = status
	    set_present = set(present)
	    accounted_for = set_present | set(ongoing)
	    return {
	        'present': set_present,
	        'missing': set_objstorages - accounted_for,
	        'ongoing': ongoing
	    }

	def run_copier(self, source, destination, content_ids):
	    """Run a copier in order to archive the given contents.

	    Contents found in error on the source are removed from content_ids
	    (the list is modified in place) and their status on the source is
	    updated to the detected error. The remaining contents are copied
	    from the source to the destination; only when the copier reports
	    success are they marked 'present' on the destination.

	    Args:
	        source (str): source storage's identifier
	        destination (str): destination storage's identifier
	        content_ids ([sha1]): list of content ids to archive.

	    """
	    # Contents in error on the source cannot be copied: drop them and
	    # record their real state.
	    errors = self.get_contents_error(content_ids, source)
	    for content_id, real_status in errors.items():
	        content_ids.remove(content_id)
	        self.content_archive_update(
	            content_id, archive_id=source, new_status=real_status)

	    # Copy whatever is left.
	    copier = ArchiverCopier(
	        source=self.objstorages[source],
	        destination=self.objstorages[destination],
	        content_ids=content_ids)

	    if not copier.run():
	        return

	    # The copy succeeded: record the new copies in the database.
	    for content_id in content_ids:
	        self.content_archive_update(
	            content_id, archive_id=destination, new_status='present')

	def get_contents_error(self, content_ids, source_storage):
	    """Indicates what is the error associated to a content when needed

	    Check the given content on the given storage. If an error is detected,
	    it will be reported through the returned dict and logged.

	    Args:
	        content_ids ([sha1]): list of content ids to check
	        source_storage (str): the source storage holding the
	            contents to check.

	    Returns:
	        a dict that map {content_id -> error_status} for each content_id
	        with an error. The `error_status` result may be 'missing' or
	        'corrupted'.

	    """
	    content_status = {}
	    storage = self.objstorages[source_storage]
	    for content_id in content_ids:
	        try:
	            storage.check(content_id)
	        except ObjNotFoundError:
	            # Handled before the generic Error: if ObjNotFoundError
	            # derives from Error (presumably, as in swh.objstorage.exc;
	            # confirm), the broader handler placed first would report
	            # every missing content as 'corrupted'.
	            content_status[content_id] = 'missing'
	            logger.error('%s missing!' % hashutil.hash_to_hex(content_id))
	        except Error:
	            content_status[content_id] = 'corrupted'
	            logger.error('%s corrupted!' % hashutil.hash_to_hex(
	                content_id))

	    return content_status

	def content_archive_update(self, content_id, archive_id, new_status=None):
	    """Update the status of an archived content through the archiver db.

	    The content id is rendered as a '\\x' prefixed hexadecimal string
	    before being forwarded to archiver_db.content_archive_update().

	    Args:
	        content_id: The content id, as accepted by hashutil.hash_to_hex.
	        archive_id (str): The id of the concerned archive.
	        new_status (str): One of missing, ongoing or present, this
	            status will replace the previous one. If not given, it is
	            forwarded as None.
	    """
	    hex_id = hashutil.hash_to_hex(content_id)
	    db_obj_id = r'\x' + hex_id
	    self.archiver_db.content_archive_update(db_obj_id, archive_id, new_status)

	@abc.abstractmethod
	def need_archival(self, content_data):
	"""Indicate if the content needs to be archived.

	Abstract: run() calls it with the dict built by compute_copies() and
	skips the content when the result is false.

	Args:
	content_data (dict): dict with the keys 'present', 'missing' and
	'ongoing' describing the copies of the content.

	Returns:
	True if there is not enough copies, False otherwise.

	"""
	pass

	@abc.abstractmethod
	def choose_backup_servers(self, present, missing):
	"""Choose and yield the required amount of couple source/destination

	For each required copy, choose a unique destination server
	among the missing copies and a source server among the
	presents.

	Abstract: run() groups the contents to copy by the pairs yielded
	here.

	Args:
	present: copies where the content is present.
	missing: copies where the content is missing.

	Yields:
	tuple (source (str), destination (str)) for each required copy.

	"""
	pass


	class ArchiverWithRetentionPolicyWorker(BaseArchiveWorker):
	""" Do the required backups on a given batch of contents.

	Process the content of a content batch in order to do the needed backups on
	the slaves servers.
	"""

	ADDITIONAL_CONFIG = {
	'retention_policy': ('int', 2),
	'archival_max_age': ('int', 3600),
	}

	def __init__(self, batch):
	    """Constructor of the ArchiverWorker class.

	    Args:
	        batch: list of object's sha1 that potentially need archival.

	    Raises:
	        ValueError: when fewer objstorages are configured than the
	            retention policy requires.
	    """
	    super().__init__(batch)
	    self.retention_policy = self.config['retention_policy']
	    self.archival_max_age = self.config['archival_max_age']

	    nb_servers = len(self.objstorages)
	    if nb_servers < self.retention_policy:
	        raise ValueError('Retention policy is too high for the number of '
	                         'provided servers')

	def need_archival(self, content_data):
	    """Indicate if the content need to be archived.

	    Present copies are counted, plus the ongoing copies whose archival
	    delay has not elapsed yet (they are expected to complete).

	    Args:
	        content_data (dict): dict with the copies of the content under
	            'present' (collection of copy ids) and 'ongoing' (dict
	            mapping copy id to the mtime of the ongoing status).

	    Returns:
	        True if there is not enough copies, False otherwise.
	    """
	    nb_presents = len(content_data.get('present', []))
	    for mtime in content_data.get('ongoing', {}).values():
	        # The helper is named _is_archival_delay_elapsed; the previous
	        # misspelled call raised AttributeError as soon as a content
	        # had an ongoing copy.
	        if not self._is_archival_delay_elapsed(mtime):
	            nb_presents += 1
	    return nb_presents < self.retention_policy

	def _is_archival_delay_elapsed(self, start_time):
	    """Tell whether an archival started at start_time took too long.

	    Args:
	        start_time (float): time at which the archival started.

	    Returns:
	        True if more than self.archival_max_age elapsed since
	        start_time, False otherwise
	    """
	    return (time.time() - start_time) > self.archival_max_age

	def choose_backup_servers(self, present, missing):
	    """Choose and yield the required amount of couple source/destination

	    For each required copy, choose a unique destination server
	    among the missing copies and a source server among the
	    presents.

	    Each destination server is unique. The source server may be used
	    multiple times.

	    The number of copies is capped by the number of missing servers and
	    never negative. random.sample() used to raise ValueError, aborting
	    the whole batch, when more copies were required than servers were
	    missing (e.g. the only other server holds a stale 'ongoing' copy)
	    or when the content already had enough present copies.

	    Args:
	        present: copies where the content is present.
	        missing: copies where the content is missing.

	    Yields:
	        tuple (source, destination) for each copy to perform. Nothing
	        is yielded when there is no source or no destination.

	    """
	    # Transform from set to list to allow random selections
	    missing = list(missing)
	    present = list(present)
	    if not present:
	        # No source to copy from
	        return

	    nb_required = self.retention_policy - len(present)
	    nb_copies = max(0, min(nb_required, len(missing)))
	    destinations = random.sample(missing, nb_copies)
	    sources = [random.choice(present) for _ in destinations]
	    yield from zip(sources, destinations)


	class ArchiverToBackendWorker(BaseArchiveWorker):
	"""Worker that send copies over from a source to another backend.

	Process the content of a content batch from source objstorage to
	destination objstorage.

	"""

	CONFIG_BASE_FILENAME = 'archiver/worker-to-backend'

	def __init__(self, batch):
	    """Constructor of the ArchiverWorkerToBackend class.

	    Args:
	        batch: list of object's sha1 that potentially need archival.

	    The destination backend is read from the 'destination'
	    configuration entry.

	    """
	    super().__init__(batch)
	    destination = self.config['destination']
	    self.destination = destination

	def need_archival(self, content_data):
	    """Indicate if the content needs to be archived.

	    Args:
	        content_data (dict): dict describing the copies of the content;
	            only its 'missing' entry is read here.

	    Returns:
	        True if self.destination is among the missing copies,
	        False otherwise

	    """
	    missing = content_data.get('missing', {})
	    return self.destination in missing

	def choose_backup_servers(self, present, missing):
	    """Yield a single (source, destination) pair.

	    The source is picked at random among the present copies and the
	    destination is always self.destination; `missing` is not used.
	    """
	    source = random.choice(list(present))
	    yield (source, self.destination)

File Metadata

Mime Type: text/x-diff
Expires: Fri, Jul 4, 3:22 PM (5 d, 8 h ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3236628

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions