director.py
# Copyright (C) 2015-2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import click

from datetime import datetime

from swh.core import hashutil, config
from swh.objstorage import PathSlicingObjStorage
from swh.objstorage.api.client import RemoteObjStorage
from swh.scheduler.celery_backend.config import app

from . import tasks  # NOQA
from .storage import ArchiverStorage


DEFAULT_CONFIG = {
    'objstorage_type': ('str', 'local_storage'),
    'objstorage_path': ('str', '/tmp/swh-storage/objects'),
    'objstorage_slicing': ('str', '0:2/2:4/4:6'),
    'objstorage_url': ('str', 'http://localhost:5003/'),

    'batch_max_size': ('int', 50),
    'archival_max_age': ('int', 3600),
    'retention_policy': ('int', 2),
    'asynchronous': ('bool', True),

    'dbconn': ('str', 'dbname=softwareheritage-archiver-dev user=guest')
}
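
# For illustration: config.read() (used in launch() below) collapses the
# (type, default) tuples above into plain values, so the director ends up
# working with a dict along the lines of
#   {'objstorage_type': 'remote_objstorage',
#    'objstorage_url': 'http://localhost:5003/',
#    'batch_max_size': 50, 'archival_max_age': 3600,
#    'retention_policy': 2, 'asynchronous': True,
#    'dbconn': 'dbname=softwareheritage-archiver-dev user=guest'}
# after merging the defaults, the configuration file and the command line
# (example values only).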

task_name = 'swh.storage.archiver.tasks.SWHArchiverTask'

logger = logging.getLogger()


class ArchiverDirector():
    """Process the files in order to know which ones need a backup.

    The archiver director processes the files in the local storage in order
    to know which ones need archival, and delegates this task to archiver
    workers.

    Attributes:
        master_objstorage: the local storage of the master server.
        master_objstorage_args (dict): arguments of the master objstorage
            initialization.
        archiver_storage: a wrapper for archiver db operations.
        db_conn_archiver: Either a libpq connection string,
            or a psycopg2 connection for the archiver db.
        slave_objstorages: Iterable of remote objstorages of the slave
            servers used for backup.
        config: Archiver configuration. A dictionary that must contain
            the following keys:

            objstorage_type (str): type of objstorage used (local_storage
                or remote_storage).

                If the storage is local, the following keys must be present:
                    objstorage_path (str): master's objstorage path
                    objstorage_slicing (str): master's objstorage slicing

                Otherwise, if it is a remote objstorage, the key must be:
                    objstorage_url (str): url of the remote objstorage

            batch_max_size (int): The number of content items that can be
                given to the same archiver worker.
            archival_max_age (int): Delay given to the worker to copy all
                the files in a given batch.
            retention_policy (int): Required number of copies for the
                content to be considered safe.
            asynchronous (boolean): Indicate whether the archival should
                run in asynchronous mode or not.
    """

    def __init__(self, db_conn_archiver, config):
        """ Constructor of the archiver director.

        Args:
            db_conn_archiver: Either a libpq connection string,
                or a psycopg2 connection for the archiver db.
            config: Archiver configuration. A dictionary that must contain
                the following keys:

                objstorage_type (str): type of objstorage used
                    (local_objstorage or remote_objstorage).

                    If the storage is local, the following keys must be
                    present:
                        objstorage_path (str): master's objstorage path
                        objstorage_slicing (str): master's objstorage slicing

                    Otherwise, if it is a remote objstorage, the key must be:
                        objstorage_url (str): url of the remote objstorage

                batch_max_size (int): The number of content items that can
                    be given to the same archiver worker.
                archival_max_age (int): Delay given to the worker to copy
                    all the files in a given batch.
                retention_policy (int): Required number of copies for the
                    content to be considered safe.
                asynchronous (boolean): Indicate whether the archival should
                    run in asynchronous mode or not.
        """
        # Get the slave storages
        self.db_conn_archiver = db_conn_archiver
        self.archiver_storage = ArchiverStorage(db_conn_archiver)
        self.slave_objstorages = {
            id: url
            for id, url in self.archiver_storage.archive_ls()
        }
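        # Illustrative shape of the mapping built above: archive_ls() yields
        # (id, url) pairs from the archiver database, so slave_objstorages
        # looks like {'slave1': 'http://slave1:5003/'} (example values only).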

        # Check that there are enough backup servers for the retention policy
        if config['retention_policy'] > len(self.slave_objstorages) + 1:
            raise ValueError(
                "Can't have a retention policy of %d with %d backup servers"
                % (config['retention_policy'], len(self.slave_objstorages))
            )

        # Get the master storage that contains content to be archived
        if config['objstorage_type'] == 'local_objstorage':
            master_objstorage_args = {
                'root': config['objstorage_path'],
                'slicing': config['objstorage_slicing']
            }
            master_objstorage = PathSlicingObjStorage(
                **master_objstorage_args
            )
        elif config['objstorage_type'] == 'remote_objstorage':
            master_objstorage_args = {'base_url': config['objstorage_url']}
            master_objstorage = RemoteObjStorage(**master_objstorage_args)
        else:
            raise ValueError(
                'Unknown objstorage class `%s`' % config['objstorage_type']
            )
        self.master_objstorage = master_objstorage
        self.master_objstorage_args = master_objstorage_args

        # Keep the full configuration
        self.config = config

    def run(self):
        """ Run the archiver director.

        The archiver director will check all the contents of the archiver
        database and do the required backup jobs.
        """
        if self.config['asynchronous']:
            run_fn = self.run_async_worker
        else:
            run_fn = self.run_sync_worker
        for batch in self.get_unarchived_content():
            run_fn(batch)

    def run_async_worker(self, batch):
        """ Produce a worker that will be added to the task queue.
        """
        task = app.tasks[task_name]
        task.delay(batch,
                   archiver_args=self.db_conn_archiver,
                   master_objstorage_args=self.master_objstorage_args,
                   slave_objstorages=self.slave_objstorages,
                   config=self.config)
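
    # Note on the two variants: task.delay() above only enqueues the batch
    # on the Celery broker for an archiver worker to pick up, while calling
    # the task object directly below runs it synchronously in this process.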

    def run_sync_worker(self, batch):
        """ Run a worker synchronously on the given batch.
        """
        task = app.tasks[task_name]
        task(batch,
             archiver_args=self.db_conn_archiver,
             master_objstorage_args=self.master_objstorage_args,
             slave_objstorages=self.slave_objstorages,
             config=self.config)

    def get_unarchived_content(self):
        """ Get contents that need to be archived.

        Yields:
            A batch of contents. Batches are dictionaries that associate
            a content id with data about the servers that contain it or not.

            {'id1':
                {'present': [('slave1', 'slave1_url')],
                 'missing': [('slave2', 'slave2_url'),
                             ('slave3', 'slave3_url')]
                },
             'id2':
                {'present': [],
                 'missing': [
                     ('slave1', 'slave1_url'),
                     ('slave2', 'slave2_url'),
                     ('slave3', 'slave3_url')
                 ]}
            }

            Where the keys (idX) are sha1 of the contents and
            (slaveX, slaveX_url) are ids and urls of the storage slaves.

            At least all the contents that do not have enough copies on the
            backup servers are distributed into these batches.
        """
        # Get the data about each content referenced into the archiver.
        missing_copy = {}
        for content_id in self.archiver_storage.content_archive_ls():
            db_content_id = '\\x' + hashutil.hash_to_hex(content_id[0])
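            # The leading '\x' prefix produces the textual form of a
            # PostgreSQL bytea value ('\x' followed by hex digits), matching
            # how the archiver db keys its content rows (used by
            # content_archive_get below).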

            # Fetch the data about archival status of the content
            backups = self.archiver_storage.content_archive_get(
                content=db_content_id
            )
            for _content_id, server_id, status, mtime in backups:
                virtual_status = self.get_virtual_status(status, mtime)
                server_data = (server_id, self.slave_objstorages[server_id])

                missing_copy.setdefault(
                    db_content_id,
                    {'present': [], 'missing': []}
                ).setdefault(virtual_status, []).append(server_data)

                # Check the content before archival.
                try:
                    self.master_objstorage.check(content_id[0])
                except Exception as e:
                    # Exception can be Error or ObjNotFoundError.
                    logger.error(e)
                    # TODO Do something to restore the content?

                if len(missing_copy) >= self.config['batch_max_size']:
                    yield missing_copy
                    missing_copy = {}

        if len(missing_copy) > 0:
            yield missing_copy

    def get_virtual_status(self, status, mtime):
        """ Compute the virtual presence of a content.

        If the status is ongoing but the time is not elapsed, the archiver
        considers the copy will be present in the future, and so counts it
        as present.
        However, if the time is elapsed, the copy may have failed, so the
        content is considered as missing.

        Arguments:
            status (string): One of ('present', 'missing', 'ongoing'). The
                status of the content.
            mtime (datetime): Time at which the content has been updated for
                the last time.

        Returns:
            The virtual status of the studied content, which is 'present' or
            'missing'.

        Raises:
            ValueError: if the status is not one of 'present', 'missing'
                or 'ongoing'
        """
        if status in ('present', 'missing'):
            return status

        # If the status is 'ongoing' but there is still time, another worker
        # may still be on the task.
        if status == 'ongoing':
            mtime = mtime.replace(tzinfo=None)
            elapsed = (datetime.now() - mtime).total_seconds()
            if elapsed <= self.config['archival_max_age']:
                return 'present'
            else:
                return 'missing'
        else:
            raise ValueError("status must be either 'present', 'missing' "
                             "or 'ongoing'")


@click.command()
@click.argument('config-path', required=1)
@click.option('--dbconn', default=DEFAULT_CONFIG['dbconn'][1],
              help="Connection string for the archiver database")
@click.option('--async/--sync', default=DEFAULT_CONFIG['asynchronous'][1],
              help="Indicates if the archiver should run asynchronously")
def launch(config_path, dbconn, async):
    # The configuration has the following priority:
    # command line > file config > default config
    cl_config = {
        'dbconn': dbconn,
        'asynchronous': async
    }
    conf = config.read(config_path, DEFAULT_CONFIG)
    conf.update(cl_config)
    # Create connection data and run the archiver.
    archiver = ArchiverDirector(conf['dbconn'], conf)
    logger.info("Starting an archival at %s", datetime.now())
    archiver.run()


if __name__ == '__main__':
    launch()
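

# Example invocation (illustrative only; the configuration path is
# hypothetical):
#
#   python3 -m swh.storage.archiver.director /path/to/archiver-config --sync
#
# The positional argument is the configuration file read by config.read(),
# and --sync/--async overrides the 'asynchronous' default from
# DEFAULT_CONFIG.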