# Copyright (C) 2017  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from collections import defaultdict

from swh.model import hashutil
from swh.core import utils
from swh.core.config import SWHConfig
from swh.storage import get_storage


class RecomputeChecksums(SWHConfig):
    """Class in charge of (re)computing content's hashes.

    Hashes to compute are defined across 2 configuration options:

    - compute_checksums ([str]): list of hash algorithms that
      swh.model.hashutil.hash_data function should be able to deal
      with. For variable-length checksums, a desired checksum length
      should also be provided, e.g.: blake2:512

    - recompute_checksums (bool): a boolean to notify that we also
      want to recompute potential existing hashes specified in
      compute_checksums. Defaults to False.

    """
    DEFAULT_CONFIG = {
        'storage': ('dict', {
            'cls': 'remote',
            'args': {
                'url': 'http://localhost:5002/'
            },
        }),
        # the set of checksums that should be computed. For
        # variable-length checksums a desired checksum length should also
        # be provided. Examples: 'sha1_git', 'sha3:224', 'blake2:512', 'sha512'
        'compute_checksums': (
            'list[str]', []),
        # whether checksums that already exist in the DB should be
        # recomputed/updated or left untouched
        'recompute_checksums': ('bool', False),
        # Number of contents to retrieve blobs at the same time
        'batch_size_retrieve_content': ('int', 10),
        # Number of contents to update at the same time
        'batch_size_update': ('int', 100),
    }

    CONFIG_BASE_FILENAME = 'storage/rehash'

    def __init__(self):
        self.config = self.parse_config_file()
        self.storage = get_storage(**self.config['storage'])
        self.compute_checksums = self.config['compute_checksums']
        self.recompute_checksums = self.config[
            'recompute_checksums']
        self.batch_size_retrieve_content = self.config[
            'batch_size_retrieve_content']
        self.batch_size_update = self.config[
            'batch_size_update']

        if not self.compute_checksums:
            raise ValueError('Checksums list should not be empty.')

    def _read_content_ids(self, contents):
        """Read the content identifiers (sha1, as bytes) from the contents.

        Args:
            contents ([dict]): content dicts, each holding a 'sha1' key
              (hex str or bytes)

        Yields:
            bytes: the sha1 identifier of each content

        """
        for c in contents:
            h = c['sha1']
            # sha1 may come in as a hex string; storage expects bytes
            if isinstance(h, str):
                h = hashutil.hash_to_bytes(h)

            yield h

    def get_new_contents_metadata(self, all_contents):
        """Retrieve raw contents and compute new checksums on the
           contents. Unknown or corrupted contents are skipped.

        Args:
            all_contents ([dict]): List of contents as dictionary with
              the necessary primary keys (at least 'sha1')

        Yields:
            tuple of: content to update, list of checksums computed

        """
        for contents in utils.grouper(all_contents,
                                      self.batch_size_retrieve_content):
            contents = self.storage.content_get_metadata(
                self._read_content_ids(contents))
            for content in contents:
                # Retrieve content's data
                raw_contents = list(self.storage.content_get(
                    [content['sha1']]))
                raw_content = raw_contents[0]
                if not raw_content:
                    # unknown content: skip it
                    continue

                raw_content = raw_content['data']

                if self.recompute_checksums:
                    # Recompute all checksums provided in the
                    # compute_checksums option
                    checksums_to_compute = list(self.compute_checksums)
                else:
                    # Only compute the checksums from compute_checksums
                    # that are not already defined for that content
                    checksums_to_compute = [h for h in self.compute_checksums
                                            if not content.get(h)]

                if not checksums_to_compute:  # Nothing to recompute
                    continue

                # Actually compute the checksums for that content
                updated_content = hashutil.hash_data(
                    raw_content, algorithms=checksums_to_compute)
                content.update(updated_content)
                yield content, checksums_to_compute

    def run(self, contents):
        """Given a list of content (dict):
            - (re)compute a given set of checksums on contents
              available in our object storage
            - update those contents with the new metadata

        Args:
            - contents ([dict]): contents as dictionary with the
              necessary keys; each dictionary must at least hold the
              'sha1' key identifying the content.

        """
        for data in utils.grouper(
                self.get_new_contents_metadata(contents),
                self.batch_size_update):

            # Group the contents by the exact set of checksums that were
            # (re)computed for them, so each storage update carries a
            # homogeneous 'keys' list. Use a tuple as the (hashable)
            # grouping key instead of a fragile join/split round-trip.
            groups = defaultdict(list)
            for content, keys_to_update in data:
                groups[tuple(keys_to_update)].append(content)

            for keys_to_update, group_contents in groups.items():
                self.storage.content_update(group_contents,
                                            keys=list(keys_to_update))