Changeset View
Changeset View
Standalone View
Standalone View
swh/objstorage/replayer/cli.py
# Copyright (C) 2016-2020 The Software Heritage developers | # Copyright (C) 2016-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
# WARNING: do not import unnecessary things here to keep cli startup time under | # WARNING: do not import unnecessary things here to keep cli startup time under | ||||
# control | # control | ||||
import logging | import logging | ||||
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines | def content_replay(ctx, stop_after_objects, exclude_sha1_file, check_dst): | ||||
objstorage. It must contain a concatenation of all (sha1) hashes, | objstorage. It must contain a concatenation of all (sha1) hashes, | ||||
and it must be sorted. | and it must be sorted. | ||||
This file will not be fully loaded into memory at any given time, | This file will not be fully loaded into memory at any given time, | ||||
so it can be arbitrarily large. | so it can be arbitrarily large. | ||||
``--check-dst`` sets whether the replayer should check in the destination | ``--check-dst`` sets whether the replayer should check in the destination | ||||
ObjStorage before copying an object. You can turn that off if you know | ObjStorage before copying an object. You can turn that off if you know | ||||
you're copying to an empty ObjStorage. | you're copying to an empty ObjStorage. | ||||
The expected configuration file should have 3 sections: | |||||
- objstorage: the source object storage from which to retrieve objects to | |||||
copy; this objstorage can (and should) be a read-only objstorage, | |||||
https://docs.softwareheritage.org/devel/apidoc/swh.objstorage.html | |||||
- objstorage_dst: the destination objstorage into which objects will be | |||||
written, | |||||
- journal_client: the configuration of the kafka journal from which the | |||||
`content` topic will be consumed to get the list of content objects to | |||||
copy from the source objstorage to the destination one. | |||||
https://docs.softwareheritage.org/devel/apidoc/swh.journal.client.html | |||||
In addition to these 3 mandatory sections, an optional 'replayer' section | |||||
can be provided with an 'error_reporter' config entry specifying the | |||||
Redis connection parameters that will be used to report objects that | |||||
could not be copied, e.g.:: | |||||
objstorage: | |||||
[...] | |||||
objstorage_dst: | |||||
[...] | |||||
journal_client: | |||||
[...] | |||||
replayer: | |||||
error_reporter: | |||||
host: redis.local | |||||
port: 6379 | |||||
vlorentz: is that indented enough for Sphinx? | |||||
Done Inline ActionsShould be yes (not checked the result however) douardda: Should be yes (not checked the result however) | |||||
Done Inline Actionsdouardda: {F5268237} | |||||
db: 1 | |||||
""" | """ | ||||
import functools | import functools | ||||
import mmap | import mmap | ||||
from swh.journal.client import get_journal_client | from swh.journal.client import get_journal_client | ||||
from swh.model.model import SHA1_SIZE | from swh.model.model import SHA1_SIZE | ||||
from swh.objstorage.factory import get_objstorage | from swh.objstorage.factory import get_objstorage | ||||
from swh.objstorage.replayer.replay import ( | from swh.objstorage.replayer.replay import ( | ||||
Show All 23 Lines | if exclude_sha1_file: | ||||
nb_excluded_hashes = int(map_.size() / SHA1_SIZE) | nb_excluded_hashes = int(map_.size() / SHA1_SIZE) | ||||
def exclude_fn(obj): | def exclude_fn(obj): | ||||
return is_hash_in_bytearray(obj["sha1"], map_, nb_excluded_hashes) | return is_hash_in_bytearray(obj["sha1"], map_, nb_excluded_hashes) | ||||
else: | else: | ||||
exclude_fn = None | exclude_fn = None | ||||
journal_cfg = conf["journal_client"] | journal_cfg = conf.pop("journal_client") | ||||
journal_cfg.setdefault("cls", "kafka") | replayer_cfg = conf.pop("replayer", {}) | ||||
if "error_reporter" in journal_cfg: | if "error_reporter" in replayer_cfg: | ||||
from redis import Redis | from redis import Redis | ||||
from swh.objstorage.replayer import replay | from swh.objstorage.replayer import replay | ||||
replay.REPORTER = Redis(**journal_cfg.pop("error_reporter")).set | |||||
replay.REPORTER = Redis(**replayer_cfg.get("error_reporter")).set | |||||
client = get_journal_client( | client = get_journal_client( | ||||
**journal_cfg, stop_after_objects=stop_after_objects, object_types=("content",), | **journal_cfg, stop_after_objects=stop_after_objects, object_types=("content",), | ||||
) | ) | ||||
worker_fn = functools.partial( | worker_fn = functools.partial( | ||||
process_replay_objects_content, | process_replay_objects_content, | ||||
src=objstorage_src, | src=objstorage_src, | ||||
dst=objstorage_dst, | dst=objstorage_dst, | ||||
Show All 26 Lines |
is that indented enough for Sphinx?