Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/rehash.py
# Copyright (C) 2017-2020 The Software Heritage developers | # Copyright (C) 2017-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
import itertools | import itertools | ||||
from collections import defaultdict | from collections import defaultdict | ||||
from typing import Dict, Any, Tuple, List, Generator | from typing import Any, Dict, Generator, List, Optional, Tuple | ||||
from swh.core import utils | from swh.core import utils | ||||
from swh.core.config import SWHConfig | from swh.core.config import SWHConfig | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.model import Content | |||||
from swh.objstorage import get_objstorage | from swh.objstorage import get_objstorage | ||||
from swh.objstorage.exc import ObjNotFoundError | from swh.objstorage.exc import ObjNotFoundError | ||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
class RecomputeChecksums(SWHConfig): | class RecomputeChecksums(SWHConfig): | ||||
"""Class in charge of (re)computing content's hashes. | """Class in charge of (re)computing content's hashes. | ||||
▲ Show 20 Lines • Show All 82 Lines • ▼ Show 20 Lines | ) -> Generator[Tuple[Dict[str, Any], List[Any]], Any, None]: | ||||
Yields: | Yields: | ||||
tuple: tuple of (content to update, list of checksums computed) | tuple: tuple of (content to update, list of checksums computed) | ||||
""" | """ | ||||
content_ids = self._read_content_ids(all_contents) | content_ids = self._read_content_ids(all_contents) | ||||
for contents in utils.grouper(content_ids, self.batch_size_retrieve_content): | for contents in utils.grouper(content_ids, self.batch_size_retrieve_content): | ||||
contents_iter = itertools.tee(contents, 2) | contents_iter = itertools.tee(contents, 2) | ||||
try: | try: | ||||
content_metadata: Dict[ | sha1s = [s for s in contents_iter[0]] | ||||
bytes, List[Dict] | content_metadata: List[Optional[Content]] = self.storage.content_get( | ||||
] = self.storage.content_get_metadata( # noqa | sha1s | ||||
[s for s in contents_iter[0]] | |||||
) | ) | ||||
except Exception: | except Exception: | ||||
self.log.exception("Problem when reading contents metadata.") | self.log.exception("Problem when reading contents metadata.") | ||||
continue | continue | ||||
for sha1, content_dicts in content_metadata.items(): | for sha1, content_model in zip(sha1s, content_metadata): | ||||
if not content_dicts: | if not content_model: | ||||
continue | continue | ||||
content: Dict = content_dicts[0] | content: Dict = content_model.to_dict() | ||||
# Recompute checksums provided in compute_checksums options | # Recompute checksums provided in compute_checksums options | ||||
if self.recompute_checksums: | if self.recompute_checksums: | ||||
checksums_to_compute = list(self.compute_checksums) | checksums_to_compute = list(self.compute_checksums) | ||||
else: | else: | ||||
# Compute checksums provided in compute_checksums | # Compute checksums provided in compute_checksums | ||||
# options not already defined for that content | # options not already defined for that content | ||||
checksums_to_compute = [ | checksums_to_compute = [ | ||||
h for h in self.compute_checksums if not content.get(h) | h for h in self.compute_checksums if not content.get(h) | ||||
▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines |