Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/rehash.py
# Copyright (C) 2017-2020 The Software Heritage developers | # Copyright (C) 2017-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | from collections import defaultdict | ||||
import itertools | import itertools | ||||
import logging | import logging | ||||
from typing import Any, Dict, Generator, List, Optional, Tuple | from typing import Any, Dict, Generator, List, Optional, Tuple | ||||
from swh.core import utils | from swh.core import utils | ||||
from swh.core.config import SWHConfig | from swh.core.config import load_from_envvar | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.model import Content | from swh.model.model import Content | ||||
from swh.objstorage.exc import ObjNotFoundError | from swh.objstorage.exc import ObjNotFoundError | ||||
from swh.objstorage.factory import get_objstorage | from swh.objstorage.factory import get_objstorage | ||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
DEFAULT_CONFIG: Dict[str, Any] = { | |||||
"storage": {"cls": "memory"}, | |||||
"objstorage": {"cls": "memory"}, | |||||
# the set of checksums that should be computed. | |||||
# Examples: 'sha1_git', 'blake2b512', 'blake2s256' | |||||
"compute_checksums": [], | |||||
# whether checksums that already exist in the DB should be | |||||
# recomputed/updated or left untouched | |||||
"recompute_checksums": False, | |||||
# Number of contents to retrieve blobs at the same time | |||||
"batch_size_retrieve_content": 10, | |||||
# Number of contents to update at the same time | |||||
"batch_size_update": 100, | |||||
} | |||||
class RecomputeChecksums(SWHConfig): | |||||
class RecomputeChecksums: | |||||
"""Class in charge of (re)computing content's hashes. | """Class in charge of (re)computing content's hashes. | ||||
Hashes to compute are defined across 2 configuration options: | Hashes to compute are defined across 2 configuration options: | ||||
compute_checksums ([str]) | compute_checksums ([str]) | ||||
list of hash algorithms that | list of hash algorithms that | ||||
py:func:`swh.model.hashutil.MultiHash.from_data` function should | py:func:`swh.model.hashutil.MultiHash.from_data` function should | ||||
be able to deal with. For variable-length checksums, a desired | be able to deal with. For variable-length checksums, a desired | ||||
checksum length should also be provided. Their format is | checksum length should also be provided. Their format is | ||||
<algorithm's name>:<variable-length> e.g: blake2:512 | <algorithm's name>:<variable-length> e.g: blake2:512 | ||||
recompute_checksums (bool) | recompute_checksums (bool) | ||||
a boolean to notify that we also want to recompute potential existing | a boolean to notify that we also want to recompute potential existing | ||||
hashes specified in compute_checksums. Default to False. | hashes specified in compute_checksums. Default to False. | ||||
""" | """ | ||||
DEFAULT_CONFIG = { | |||||
# The storage to read from or update metadata to | |||||
"storage": ( | |||||
"dict", | |||||
{"cls": "remote", "args": {"url": "http://localhost:5002/"},}, | |||||
), | |||||
# The objstorage to read contents' data from | |||||
"objstorage": ( | |||||
"dict", | |||||
{ | |||||
"cls": "pathslicing", | |||||
"args": { | |||||
"root": "/srv/softwareheritage/objects", | |||||
"slicing": "0:2/2:4/4:6", | |||||
}, | |||||
}, | |||||
), | |||||
# the set of checksums that should be computed. | |||||
# Examples: 'sha1_git', 'blake2b512', 'blake2s256' | |||||
"compute_checksums": ("list[str]", []), | |||||
# whether checksums that already exist in the DB should be | |||||
# recomputed/updated or left untouched | |||||
"recompute_checksums": ("bool", False), | |||||
# Number of contents to retrieve blobs at the same time | |||||
"batch_size_retrieve_content": ("int", 10), | |||||
# Number of contents to update at the same time | |||||
"batch_size_update": ("int", 100), | |||||
} | |||||
CONFIG_BASE_FILENAME = "indexer/rehash" | |||||
def __init__(self) -> None: | def __init__(self) -> None: | ||||
self.config = self.parse_config_file() | self.config = load_from_envvar(DEFAULT_CONFIG) | ||||
self.storage = get_storage(**self.config["storage"]) | self.storage = get_storage(**self.config["storage"]) | ||||
self.objstorage = get_objstorage(**self.config["objstorage"]) | self.objstorage = get_objstorage(**self.config["objstorage"]) | ||||
self.compute_checksums = self.config["compute_checksums"] | self.compute_checksums = self.config["compute_checksums"] | ||||
self.recompute_checksums = self.config["recompute_checksums"] | self.recompute_checksums = self.config["recompute_checksums"] | ||||
self.batch_size_retrieve_content = self.config["batch_size_retrieve_content"] | self.batch_size_retrieve_content = self.config["batch_size_retrieve_content"] | ||||
self.batch_size_update = self.config["batch_size_update"] | self.batch_size_update = self.config["batch_size_update"] | ||||
self.log = logging.getLogger("swh.indexer.rehash") | self.log = logging.getLogger("swh.indexer.rehash") | ||||
▲ Show 20 Lines • Show All 113 Lines • Show Last 20 Lines |