D186.id605.diff
D186: Recompute class to trigger an add/update hash checksums in storage
diff --git a/swh/indexer/rehash.py b/swh/indexer/rehash.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/rehash.py
@@ -0,0 +1,140 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections import defaultdict
+
+from swh.model import hashutil
+from swh.core import utils
+from swh.core.config import SWHConfig
+from swh.storage import get_storage
+
+
+class RecomputeChecksums(SWHConfig):
+ """Class in charge of (re)computing content's hashes.
+
+ Hashes to compute are defined across 2 configuration options:
+
+ - compute_checksums ([str]): list of hash algorithms that
+ swh.model.hashutil.hashdata function should be able to deal
+ with. For variable-length checksums, a desired checksum length
+ should also be provided. Their format is the algorithm's name, a
+ ':' and then the length e.g, blake2:512
+
+ - recompute_checksums (bool): a boolean to notify that we also
+ want to recompute potential existing hashes specified in
+ compute_checksums. Default to False.
+
+ """
+    DEFAULT_CONFIG = {
+        'storage': ('dict', {
+            'cls': 'remote',
+            'args': {
+                'url': 'http://localhost:5002/'
+            },
+        }),
+        # the primary key(s) used to look contents up in storage; the
+        # current implementation identifies contents by their sha1
+        'primary_key': ('list[str]', ['sha1']),
+        # the set of checksums that should be computed. For
+        # variable-length checksums, a desired checksum length should
+        # also be provided. Examples: 'sha1_git', 'sha3:224',
+        # 'blake2:512', 'sha512'
+        'compute_checksums': ('list[str]', []),
+        # whether checksums that already exist in the DB should be
+        # recomputed/updated or left untouched
+        'recompute_checksums': ('bool', False),
+        # number of contents whose blobs are retrieved at the same time
+        'batch_size_retrieve_content': ('int', 10),
+        # number of contents updated at the same time
+        'batch_size_update': ('int', 100),
+    }
+
+    CONFIG_BASE_FILENAME = 'storage/rehash'
+
+    def __init__(self):
+        self.config = self.parse_config_file()
+        self.storage = get_storage(**self.config['storage'])
+        self.compute_checksums = self.config['compute_checksums']
+        # recompute_checksums is a plain boolean flag (cf. DEFAULT_CONFIG)
+        self.recompute_checksums = self.config['recompute_checksums']
+        self.primary_key = self.config['primary_key']
+        self.batch_size_retrieve_content = self.config[
+            'batch_size_retrieve_content']
+        self.batch_size_update = self.config['batch_size_update']
+
+        for key in self.primary_key:
+            if key not in hashutil.ALGORITHMS:
+                raise ValueError('Primary key should be in %s' %
+                                 hashutil.ALGORITHMS)
+
+        if not self.compute_checksums:
+            raise ValueError('Checksums list should not be empty.')
+
+    def get_new_contents_metadata(self, all_contents):
+        """Retrieve raw contents and compute new checksums on them.
+        Unknown or corrupted contents are skipped. The checksums to
+        compute come from the instance configuration.
+
+        Args:
+            all_contents ([dict]): list of contents as dictionaries
+                holding the necessary primary keys
+
+        Yields:
+            tuple: the content to update, and the list of checksums
+                that were computed for it
+
+        """
+        for contents in utils.grouper(all_contents,
+                                      self.batch_size_retrieve_content):
+            contents = self.storage.content_get_metadata(
+                (c['sha1'] for c in contents))
+
+            for content in contents:
+                # Retrieve the content's raw data
+                raw_contents = list(self.storage.content_get(
+                    [content['sha1']]))
+                raw_content = raw_contents[0] if raw_contents else None
+                if not raw_content:
+                    continue
+
+                raw_content = raw_content['data']
+
+                if self.recompute_checksums:
+                    # recompute every checksum listed in compute_checksums
+                    checksums_to_compute = self.compute_checksums
+                else:
+                    # only compute the checksums from compute_checksums
+                    # that are not already defined for that content
+                    checksums_to_compute = [h for h in self.compute_checksums
+                                            if not content.get(h)]
+
+                if not checksums_to_compute:  # nothing to (re)compute
+                    continue
+
+                # Actually compute the checksums for that content
+                updated_content = hashutil.hash_data(
+                    raw_content, algorithms=checksums_to_compute)
+                content.update(updated_content)
+                yield content, checksums_to_compute
+
+    def run(self, contents):
+        """Given a list of contents:
+
+        - (re)compute the configured set of checksums on the contents
+          available in our object storage
+        - update those contents with the new metadata
+
+        Args:
+            contents ([dict]): contents as dictionaries; the keys
+                present in each dictionary should be the ones defined
+                in the 'primary_key' option.
+
+        """
+        for data in utils.grouper(
+                self.get_new_contents_metadata(contents),
+                self.batch_size_update):
+
+            groups = defaultdict(list)
+            for content, keys_to_update in data:
+                # lists are unhashable, so group on a tuple of the keys
+                groups[tuple(keys_to_update)].append(content)
+
+            for keys_to_update, contents in groups.items():
+                self.storage.content_update(contents,
+                                            keys=list(keys_to_update))
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016 The Software Heritage developers
+# Copyright (C) 2016-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,6 +11,7 @@
 from .language import ContentLanguageIndexer
 from .ctags import CtagsIndexer
 from .fossology_license import ContentFossologyLicenseIndexer
+from .rehash import RecomputeChecksums
 
 
 class SWHOrchestratorAllContentsTask(Task):
@@ -73,3 +74,13 @@
 
     def run(self, *args, **kwargs):
         ContentFossologyLicenseIndexer().run(*args, **kwargs)
+
+
+class SWHRecomputeChecksums(Task):
+    """Task that recomputes existing content hashes and computes new ones.
+
+    """
+    task_queue = 'swh_indexer_recompute_content_hashes'
+
+    def run(self, *args, **kwargs):
+        RecomputeChecksums().run(*args, **kwargs)
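
For reviewers, a small illustration of the "algorithm:length" format that the compute_checksums option documents (e.g. blake2:512). The parse_checksum_spec helper below is hypothetical and not part of this diff; it only shows how such an entry splits into an algorithm name and an optional digest length.

# Hypothetical helper, not part of D186: illustrate the
# "algorithm[:length]" format accepted by the compute_checksums option.
def parse_checksum_spec(spec):
    """Split 'blake2:512' into ('blake2', 512); plain names have no length."""
    if ':' in spec:
        algo, length = spec.split(':', 1)
        return algo, int(length)
    return spec, None

assert parse_checksum_spec('blake2:512') == ('blake2', 512)
assert parse_checksum_spec('sha1_git') == ('sha1_git', None)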
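
And a minimal usage sketch of the new class, also not part of the diff: it assumes a 'storage/rehash' configuration file exists, that it points at a reachable storage service, and that compute_checksums is non-empty; the sha1 values below are placeholders for contents already present in the object storage.

# Minimal usage sketch (assumption-laden, not part of D186).
from swh.indexer.rehash import RecomputeChecksums

# Placeholder identifiers for contents assumed to exist in storage.
contents = [
    {'sha1': bytes.fromhex('34973274ccef6ab4dfaaf86599792fa9c3fe4689')},
    {'sha1': bytes.fromhex('d3486ae9136e7856bc42212385ea797094475802')},
]

# Reads the 'storage/rehash' configuration, fetches each content's data,
# computes the configured checksums and updates the contents in storage.
RecomputeChecksums().run(contents)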