Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/rehash.py
# Copyright (C) 2017-2020 The Software Heritage developers | # Copyright (C) 2017-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | from collections import defaultdict | ||||
import itertools | import itertools | ||||
import logging | import logging | ||||
from typing import Any, Dict, Generator, List, Optional, Tuple | from typing import Any, Dict, Generator, List, Optional, Tuple, cast | ||||
import sentry_sdk | import sentry_sdk | ||||
from swh.core import utils | from swh.core import utils | ||||
from swh.core.config import load_from_envvar | from swh.core.config import load_from_envvar | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.model import Content | from swh.model.model import Content | ||||
from swh.objstorage.exc import ObjNotFoundError | from swh.objstorage.exc import ObjNotFoundError | ||||
from swh.objstorage.factory import get_objstorage | from swh.objstorage.factory import get_objstorage | ||||
from swh.objstorage.interface import ObjId | |||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
DEFAULT_CONFIG: Dict[str, Any] = { | DEFAULT_CONFIG: Dict[str, Any] = { | ||||
"storage": {"cls": "memory"}, | "storage": {"cls": "memory"}, | ||||
"objstorage": {"cls": "memory"}, | "objstorage": {"cls": "memory"}, | ||||
# the set of checksums that should be computed. | # the set of checksums that should be computed. | ||||
# Examples: 'sha1_git', 'blake2b512', 'blake2s256' | # Examples: 'sha1_git', 'blake2b512', 'blake2s256' | ||||
"compute_checksums": [], | "compute_checksums": [], | ||||
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines | ) -> Generator[Tuple[Dict[str, Any], List[Any]], Any, None]: | ||||
content_metadata: List[Optional[Content]] = self.storage.content_get( | content_metadata: List[Optional[Content]] = self.storage.content_get( | ||||
sha1s | sha1s | ||||
) | ) | ||||
except Exception: | except Exception: | ||||
self.log.exception("Problem when reading contents metadata.") | self.log.exception("Problem when reading contents metadata.") | ||||
sentry_sdk.capture_exception() | sentry_sdk.capture_exception() | ||||
continue | continue | ||||
for sha1, content_model in zip(sha1s, content_metadata): | for content in content_metadata: | ||||
if not content_model: | if not content: | ||||
continue | continue | ||||
content: Dict = content_model.to_dict() | |||||
# Recompute checksums provided in compute_checksums options | # Recompute checksums provided in compute_checksums options | ||||
if self.recompute_checksums: | if self.recompute_checksums: | ||||
checksums_to_compute = list(self.compute_checksums) | checksums_to_compute = list(self.compute_checksums) | ||||
else: | else: | ||||
# Compute checksums provided in compute_checksums | # Compute checksums provided in compute_checksums | ||||
# options not already defined for that content | # options not already defined for that content | ||||
checksums_to_compute = [ | checksums_to_compute = [ | ||||
h for h in self.compute_checksums if not content.get(h) | h for h in self.compute_checksums if not content.get_hash(h) | ||||
] | ] | ||||
if not checksums_to_compute: # Nothing to recompute | if not checksums_to_compute: # Nothing to recompute | ||||
continue | continue | ||||
try: | try: | ||||
raw_content = self.objstorage.get(sha1) | raw_content = self.objstorage.get(cast(ObjId, content.hashes())) | ||||
except ObjNotFoundError: | except ObjNotFoundError: | ||||
self.log.warning("Content %s not found in objstorage!", sha1) | self.log.warning( | ||||
"Content %s not found in objstorage!", content.hashes() | |||||
) | |||||
continue | continue | ||||
content_hashes = hashutil.MultiHash.from_data( | content_hashes = hashutil.MultiHash.from_data( | ||||
raw_content, hash_names=checksums_to_compute | raw_content, hash_names=checksums_to_compute | ||||
).digest() | ).digest() | ||||
content.update(content_hashes) | content_dict = content.to_dict() | ||||
yield content, checksums_to_compute | content_dict.update(content_hashes) | ||||
yield content_dict, checksums_to_compute | |||||
def run(self, contents: List[Dict[str, Any]]) -> Dict: | def run(self, contents: List[Dict[str, Any]]) -> Dict: | ||||
"""Given a list of content: | """Given a list of content: | ||||
- (re)compute a given set of checksums on contents available in our | - (re)compute a given set of checksums on contents available in our | ||||
object storage | object storage | ||||
- update those contents with the new metadata | - update those contents with the new metadata | ||||
Show All 36 Lines |