Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/cassandra/storage.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import base64 | import base64 | ||||
import collections | |||||
import datetime | import datetime | ||||
import itertools | import itertools | ||||
import operator | import operator | ||||
import random | import random | ||||
import re | import re | ||||
from typing import ( | from typing import ( | ||||
Any, | Any, | ||||
Callable, | Callable, | ||||
▲ Show 20 Lines • Show All 395 Lines • ▼ Show 20 Lines | ) -> Iterable[bytes]: | ||||
if contents_with_missing_hashes: | if contents_with_missing_hashes: | ||||
# For these, we need the expensive index lookups + main table. | # For these, we need the expensive index lookups + main table. | ||||
# Get all contents in the database that match (at least) one of the | # Get all contents in the database that match (at least) one of the | ||||
# requested contents, concurrently. | # requested contents, concurrently. | ||||
found_contents = self._content_find_many(contents_with_missing_hashes) | found_contents = self._content_find_many(contents_with_missing_hashes) | ||||
for missing_content in contents_with_missing_hashes: | # Bucket the known contents by hash | ||||
found_contents_by_hash: Dict[str, Dict[str, list]] = { | |||||
algo: collections.defaultdict(list) for algo in DEFAULT_ALGORITHMS | |||||
} | |||||
for found_content in found_contents: | for found_content in found_contents: | ||||
for algo in DEFAULT_ALGORITHMS: | |||||
found_contents_by_hash[algo][found_content.get_hash(algo)].append( | |||||
found_content | |||||
) | |||||
# For each of the requested contents, check if they are in the | |||||
# 'found_contents' set (via 'found_contents_by_hash' for efficient access, | |||||
# since we need to check using dict inclusion instead of hash+equality) | |||||
for missing_content in contents_with_missing_hashes: | |||||
# Pick any of the algorithms provided in missing_content | |||||
algo = next(algo for (algo, hash_) in missing_content.items() if hash_) | |||||
# Get the list of found_contents that match this hash in the | |||||
# missing_content. (its length is at most 1, unless there is a | |||||
# collision) | |||||
found_contents_with_same_hash = found_contents_by_hash[algo][ | |||||
missing_content[algo] | |||||
] | |||||
# Check if there is a found_content that matches all hashes in the | |||||
# missing_content. | |||||
# This is functionally equivalent to 'for found_content in | |||||
# found_contents', but runs almost in constant time (it is linear | |||||
# in the number of hash collisions) instead of linear. | |||||
# This allows this function to run in linear time overall instead of | |||||
# quadratic. | |||||
for found_content in found_contents_with_same_hash: | |||||
# check if the found_content.hashes() dictionary contains a superset | # check if the found_content.hashes() dictionary contains a superset | ||||
# of the (key, value) pairs in missing_content | # of the (key, value) pairs in missing_content | ||||
if missing_content.items() <= found_content.hashes().items(): | if missing_content.items() <= found_content.hashes().items(): | ||||
# Found! | # Found! | ||||
break | break | ||||
else: | else: | ||||
# Not found | # Not found | ||||
yield missing_content[key_hash] | yield missing_content[key_hash] | ||||
▲ Show 20 Lines • Show All 1,299 Lines • Show Last 20 Lines |