Page MenuHomeSoftware Heritage

D186.id605.diff
No OneTemporary

D186.id605.diff

diff --git a/swh/indexer/rehash.py b/swh/indexer/rehash.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/rehash.py
@@ -0,0 +1,140 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections import defaultdict
+
+from swh.model import hashutil
+from swh.core import utils
+from swh.core.config import SWHConfig
+from swh.storage import get_storage
+
+
class RecomputeChecksums(SWHConfig):
    """Class in charge of (re)computing content's hashes.

    Hashes to compute are defined across 2 configuration options:

    - compute_checksums ([str]): list of hash algorithms that
      swh.model.hashutil.hash_data function should be able to deal
      with. For variable-length checksums, a desired checksum length
      should also be provided. Their format is the algorithm's name, a
      ':' and then the length, e.g. blake2:512

    - recompute_checksums (bool): a boolean to notify that we also
      want to recompute potential existing hashes specified in
      compute_checksums. Defaults to False.

    """
    DEFAULT_CONFIG = {
        'storage': ('dict', {
            'cls': 'remote',
            'args': {
                'url': 'http://localhost:5002/'
            },
        }),
        # the set of checksums that should be computed. For
        # variable-length checksums a desired checksum length should also
        # be provided. Examples: 'sha1_git', 'sha3:224', 'blake2:512', 'sha512'
        'compute_checksums': (
            'list[str]', []),
        # whether checksums that already exist in the DB should be
        # recomputed/updated or left untouched
        'recompute_checksums': ('bool', False),
        # Keys identifying one content in the storage; run()'s input dicts
        # must carry them.  Fix: this option was referenced (self.primary_key)
        # but never defined in the original, raising AttributeError.
        'primary_key': ('list[str]', ['sha1']),
        # Number of contents to retrieve blobs at the same time
        'batch_size_retrieve_content': ('int', 10),
        # Number of contents to update at the same time
        'batch_size_update': ('int', 100),
    }

    CONFIG_BASE_FILENAME = 'storage/rehash'

    def __init__(self):
        self.config = self.parse_config_file()
        self.storage = get_storage(**self.config['storage'])
        self.compute_checksums = self.config['compute_checksums']
        # Fix: the original wrapped this boolean option in set(...), which
        # raises TypeError ('bool' object is not iterable) at construction.
        self.recompute_checksums = self.config['recompute_checksums']
        self.primary_key = self.config['primary_key']
        self.batch_size_retrieve_content = self.config[
            'batch_size_retrieve_content']
        self.batch_size_update = self.config[
            'batch_size_update']

        for key in self.primary_key:
            if key not in hashutil.ALGORITHMS:
                raise ValueError('Primary key should be in %s' %
                                 hashutil.ALGORITHMS)

        if not self.compute_checksums:
            raise ValueError('Checksums list should not be empty.')

    def get_new_contents_metadata(self, all_contents):
        """Retrieve raw contents and compute new checksums on the
        contents. Unknown or corrupted contents are skipped.

        Args:
            all_contents ([dict]): List of contents as dictionary with
                the necessary primary keys

        Yields:
            tuple of: content to update (dict), list of checksums computed

        """
        for contents in utils.grouper(all_contents,
                                      self.batch_size_retrieve_content):
            contents = self.storage.content_get_metadata(
                (c['sha1'] for c in contents))

            for content in contents:
                # Retrieve the content's raw data; missing or corrupted
                # contents yield a falsy entry and are skipped.
                raw_contents = list(self.storage.content_get(
                    [content['sha1']]))
                raw_content = raw_contents[0]
                if not raw_content:
                    continue

                raw_content = raw_content['data']

                if self.recompute_checksums:
                    # Recompute all checksums provided in the
                    # compute_checksums option
                    checksums_to_compute = self.compute_checksums
                else:
                    # Compute only the checksums provided in the
                    # compute_checksums option not already defined for
                    # that content
                    checksums_to_compute = [h for h in self.compute_checksums
                                            if not content.get(h)]

                if not checksums_to_compute:  # Nothing to recompute
                    continue

                # Actually compute the checksums for that content.
                # Fix: hashutil.hash_data's keyword is 'algorithms', not
                # 'algo' (NOTE(review): confirm against the pinned
                # swh.model version).
                updated_content = hashutil.hash_data(
                    raw_content, algorithms=checksums_to_compute)
                content.update(updated_content)
                yield content, checksums_to_compute

    def run(self, contents):
        """Given a list of content:

        - (re)compute a given set of checksums on contents available in
          our object storage
        - update those contents with the new metadata

        Args:
            contents ([dict]): contents as dictionaries; the keys
                present must include the ones defined in the
                'primary_key' option.

        """
        # Fix: the original called self.get_new_contents_metadata(self,
        # contents), passing self twice (TypeError).
        for data in utils.grouper(
                self.get_new_contents_metadata(contents),
                self.batch_size_update):

            groups = defaultdict(list)
            for content, keys_to_update in data:
                # Fix: a list is unhashable and cannot key a dict; use an
                # (order-preserving) tuple to group contents sharing the
                # same set of columns to update.
                groups[tuple(keys_to_update)].append(content)

            for keys_to_update, contents in groups.items():
                self.storage.content_update(contents,
                                            keys=list(keys_to_update))
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016 The Software Heritage developers
+# Copyright (C) 2016-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -11,6 +11,7 @@
from .language import ContentLanguageIndexer
from .ctags import CtagsIndexer
from .fossology_license import ContentFossologyLicenseIndexer
+from .rehash import RecomputeChecksums
class SWHOrchestratorAllContentsTask(Task):
@@ -73,3 +74,13 @@
def run(self, *args, **kwargs):
    # Thin Celery-task entry point: delegate all arguments to a freshly
    # constructed fossology license indexer.
    ContentFossologyLicenseIndexer().run(*args, **kwargs)
+
+
class SWHRecomputeChecksums(Task):
    """Celery task recomputing existing content hashes and computing
    missing ones, as configured (cf. RecomputeChecksums).

    """
    task_queue = 'swh_indexer_recompute_content_hashes'

    def run(self, *args, **kwargs):
        # Thin wrapper: build the rehash worker (it loads its own
        # configuration) and forward all task arguments to it.
        RecomputeChecksums().run(*args, **kwargs)

File Metadata

Mime Type
text/plain
Expires
Jan 30 2025, 9:52 AM (6 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222622

Event Timeline