Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/tests/test_storage.py
# Copyright (C) 2015-2019 The Software Heritage developers | # Copyright (C) 2015-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import copy | import copy | ||||
from contextlib import contextmanager | from contextlib import contextmanager | ||||
import datetime | import datetime | ||||
import itertools | import itertools | ||||
import math | |||||
import queue | import queue | ||||
import random | import random | ||||
import threading | import threading | ||||
from collections import defaultdict | from collections import defaultdict | ||||
from datetime import timedelta | from datetime import timedelta | ||||
from unittest.mock import Mock | from unittest.mock import Mock | ||||
import psycopg2 | import psycopg2 | ||||
import pytest | import pytest | ||||
from hypothesis import given, strategies, settings, HealthCheck | from hypothesis import given, strategies, settings, HealthCheck | ||||
from typing import ClassVar, Optional | from typing import ClassVar, Optional | ||||
from swh.model import from_disk, identifiers | from swh.model import from_disk, identifiers | ||||
from swh.model.model import SHA1_SIZE | |||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
from swh.model.hypothesis_strategies import objects | from swh.model.hypothesis_strategies import objects | ||||
from swh.storage import HashCollision | from swh.storage import HashCollision | ||||
from swh.storage.converters import origin_url_to_sha1 as sha1 | from swh.storage.converters import origin_url_to_sha1 as sha1 | ||||
from .storage_data import data | from .storage_data import data | ||||
Show All 31 Lines | |||||
def cmpdir(directory):
    """Return the ``(type, dir_id)`` pair of a directory entry.

    NOTE(review): presumably meant as a sort key for directory listings;
    confirm against the callers.
    """
    entry_type = directory['type']
    entry_dir_id = directory['dir_id']
    return entry_type, entry_dir_id
def short_revision(revision):
    """Reduce a revision dict to the two-item list ``[id, parents]``."""
    summary = [revision[key] for key in ('id', 'parents')]
    return summary
def assert_contents_ok(expected_contents, actual_contents,
                       keys_to_check=frozenset({'sha1', 'data'})):
    """Assert that two collections of contents agree on a given set of keys.

    For every key, the set of values found in ``expected_contents`` must
    equal the set of values found in ``actual_contents``. Ordering and
    duplicates are therefore ignored, and a content lacking the key
    contributes ``None`` to the compared set.

    Args:
        expected_contents: iterable of content dicts used as reference
        actual_contents: iterable of content dicts to check
        keys_to_check: iterable of dict keys to compare

    Raises:
        AssertionError: on the first key whose value sets differ; the key
            itself is the assertion message.

    """
    # Materialize both sides once: a generator would otherwise be exhausted
    # after the first key, making every later key compare two empty sets.
    expected_contents = list(expected_contents)
    actual_contents = list(actual_contents)

    for k in keys_to_check:
        expected_values = {c.get(k) for c in expected_contents}
        actual_values = {c.get(k) for c in actual_contents}
        assert actual_values == expected_values, k
class TestStorage: | class TestStorage: | ||||
"""Main class for Storage testing. | """Main class for Storage testing. | ||||
This class is used as-is to test local storage (see TestLocalStorage | This class is used as-is to test local storage (see TestLocalStorage | ||||
below) and remote storage (see TestRemoteStorage in | below) and remote storage (see TestRemoteStorage in | ||||
test_remote_storage.py. | test_remote_storage.py. | ||||
We need to have the two classes inherit from this base class | We need to have the two classes inherit from this base class | ||||
▲ Show 20 Lines • Show All 271 Lines • ▼ Show 20 Lines | def test_content_missing_per_sha1(self, swh_storage): | ||||
missing_cont = data.missing_cont | missing_cont = data.missing_cont | ||||
swh_storage.content_add([cont2]) | swh_storage.content_add([cont2]) | ||||
# when | # when | ||||
gen = swh_storage.content_missing_per_sha1([cont2['sha1'], | gen = swh_storage.content_missing_per_sha1([cont2['sha1'], | ||||
missing_cont['sha1']]) | missing_cont['sha1']]) | ||||
# then | # then | ||||
assert list(gen) == [missing_cont['sha1']] | assert list(gen) == [missing_cont['sha1']] | ||||
def test_content_get_partition(self, swh_storage, swh_contents):
    """content_get_partition over 16 partitions: each one is returned as a
    single page and together they hold every non-absent content"""
    present = [content for content in swh_contents
               if content['status'] != 'absent']

    collected = []
    for partition_id in range(16):
        page = swh_storage.content_get_partition(partition_id, 16)
        # no explicit limit is given and no follow-up page is expected
        assert page['next_page_token'] is None
        collected.extend(page['contents'])

    # compared only once every partition has been gathered
    assert_contents_ok(present, collected, ['sha1'])
def test_content_get_partition_full(self, swh_storage, swh_contents):
    """content_get_partition with one single partition returns every
    non-absent content in one page"""
    present = [content for content in swh_contents
               if content['status'] != 'absent']

    # partition 0 out of 1
    page = swh_storage.content_get_partition(0, 1)

    assert page['next_page_token'] is None
    assert_contents_ok(present, page['contents'], ['sha1'])
def test_content_get_partition_empty(self, swh_storage, swh_contents):
    """content_get_partition for an empty partition returns nothing"""
    first_sha1 = min(content['sha1'] for content in swh_contents)
    first_sha1 = int.from_bytes(first_sha1, 'big')

    # Position of the highest set bit, i.e. floor(log2(first_sha1)).
    # int.bit_length() is exact, whereas math.log2() goes through a float
    # and can round up for integers this large.
    highest_bit = first_sha1.bit_length() - 1

    # A power of 2 large enough that first_sha1 is not in the first
    # partition. NOTE(review): this assumes partitions split the sha1 space
    # evenly; the extra "+ 1" halves the first partition once more than
    # strictly needed, as a safety margin.
    nb_partitions = 1 << (SHA1_SIZE*8 - highest_bit + 1)

    actual_result = swh_storage.content_get_partition(0, nb_partitions)

    assert actual_result['next_page_token'] is None
    assert len(actual_result['contents']) == 0
def test_content_get_partition_limit_none(self, swh_storage):
    """content_get_partition called with limit=None raises a ValueError"""
    expected_args = ('Development error: limit should not be None',)

    with pytest.raises(ValueError) as exc_info:
        swh_storage.content_get_partition(1, 16, limit=None)

    # checked after the block: nothing placed after the raising call inside
    # the ``with`` body would run
    assert exc_info.value.args == expected_args
def test_generate_content_get_partition_pagination(
        self, swh_storage, swh_contents):
    """content_get_partition with limit=3: following next_page_token
    through each of 4 partitions yields every non-absent content"""
    present = [content for content in swh_contents
               if content['status'] != 'absent']

    collected = []
    for partition_id in range(4):
        token = None  # every partition is read from its first page
        has_more = True
        while has_more:
            page = swh_storage.content_get_partition(
                partition_id, 4, limit=3, page_token=token)
            collected.extend(page['contents'])
            token = page['next_page_token']
            # a None token marks the last page of the partition
            has_more = token is not None

    assert_contents_ok(present, collected, ['sha1'])
def test_content_get_metadata(self, swh_storage): | def test_content_get_metadata(self, swh_storage): | ||||
cont1 = data.cont | cont1 = data.cont | ||||
cont2 = data.cont2 | cont2 = data.cont2 | ||||
swh_storage.content_add([cont1, cont2]) | swh_storage.content_add([cont1, cont2]) | ||||
actual_md = list(swh_storage.content_get_metadata( | actual_md = list(swh_storage.content_get_metadata( | ||||
[cont1['sha1'], cont2['sha1']])) | [cont1['sha1'], cont2['sha1']])) | ||||
▲ Show 20 Lines • Show All 2,689 Lines • ▼ Show 20 Lines | def test_origin_metadata_get_by_provider_type(self, swh_storage): | ||||
}] | }] | ||||
# then | # then | ||||
assert len(m_by_provider) == 1 | assert len(m_by_provider) == 1 | ||||
assert m_by_provider == expected_results | assert m_by_provider == expected_results | ||||
class TestStorageGeneratedData: | class TestStorageGeneratedData: | ||||
def assert_contents_ok(self, expected_contents, actual_contents,
                       keys_to_check=frozenset({'sha1', 'data'})):
    """Assert that two collections of contents agree on a given set of keys.

    For every key, the set of values found in ``expected_contents`` must
    equal the set of values found in ``actual_contents``. Ordering and
    duplicates are therefore ignored, and a content lacking the key
    contributes ``None`` to the compared set.

    Args:
        expected_contents: iterable of content dicts used as reference
        actual_contents: iterable of content dicts to check
        keys_to_check: iterable of dict keys to compare

    Raises:
        AssertionError: on the first key whose value sets differ; the key
            itself is the assertion message.

    """
    # Materialize both sides once: a generator would otherwise be exhausted
    # after the first key, making every later key compare two empty sets.
    expected_contents = list(expected_contents)
    actual_contents = list(actual_contents)

    for k in keys_to_check:
        expected_values = {c.get(k) for c in expected_contents}
        actual_values = {c.get(k) for c in actual_contents}
        assert actual_values == expected_values, k
def test_generate_content_get(self, swh_storage, swh_contents):
    """content_get returns an entry, matching on sha1 and data, for every
    non-absent content requested"""
    present = [content for content in swh_contents
               if content['status'] != 'absent']

    # the sha1s requested from the storage
    wanted_sha1s = [content['sha1'] for content in present]

    fetched = list(swh_storage.content_get(wanted_sha1s))

    assert None not in fetched
    assert_contents_ok(present, fetched)
def test_generate_content_get_metadata(self, swh_storage, swh_contents):
    """content_get_metadata returns one entry per requested sha1, matching
    the non-absent contents on length, status and hashes"""
    present = [content for content in swh_contents
               if content['status'] != 'absent']
    wanted_sha1s = [content['sha1'] for content in present]

    fetched = list(swh_storage.content_get_metadata(wanted_sha1s))

    assert len(fetched) == len(wanted_sha1s)

    # only these fields are compared
    compared_keys = {'length', 'status',
                     'sha1', 'sha1_git', 'sha256', 'blake2s256'}
    assert_contents_ok(present, fetched, keys_to_check=compared_keys)
def test_generate_content_get_range(self, swh_storage, swh_contents):
    """content_get_range returns complete range"""
    present = [content for content in swh_contents
               if content['status'] != 'absent']
    ordered_sha1s = sorted(content['sha1'] for content in present)

    # bounds taken two entries in from each end of the sorted sha1s
    start, end = ordered_sha1s[2], ordered_sha1s[-2]

    result = swh_storage.content_get_range(start, end)
    assert result['next'] is None
    fetched = result['contents']

    in_range = [content for content in present
                if start <= content['sha1'] <= end]
    if not in_range:
        assert fetched == []
    else:
        assert_contents_ok(in_range, fetched, ['sha1'])
def test_generate_content_get_range_full(self, swh_storage, swh_contents):
    """content_get_range for a full range returns all available contents"""
    present_contents = [c for c in swh_contents
                        if c['status'] != 'absent']

    # sha1s are raw 20-byte digests, so the full range goes from twenty
    # 0x00 bytes to twenty 0xff bytes. The former b'0' * 40 / b'f' * 40
    # bounds were 40 ASCII characters, which only covered digests whose
    # first byte lies between 0x30 and 0x66.
    start = b'\x00' * 20
    end = b'\xff' * 20

    actual_result = swh_storage.content_get_range(start, end)
    assert actual_result['next'] is None

    actual_contents = actual_result['contents']
    # every present content lies within the full range
    expected_contents = present_contents

    if expected_contents:
        assert_contents_ok(
            expected_contents, actual_contents, ['sha1'])
    else:
        assert actual_contents == []
def test_generate_content_get_range_empty(self, swh_storage, swh_contents): | def test_generate_content_get_range_empty(self, swh_storage, swh_contents): | ||||
"""content_get_range for an empty range returns nothing""" | """content_get_range for an empty range returns nothing""" | ||||
start = b'0' * 40 | start = b'0' * 40 | ||||
end = b'f' * 40 | end = b'f' * 40 | ||||
Show All 21 Lines | def test_generate_content_get_range_no_limit( | ||||
actual_result = swh_storage.content_get_range(start, end) | actual_result = swh_storage.content_get_range(start, end) | ||||
actual_contents = actual_result['contents'] | actual_contents = actual_result['contents'] | ||||
assert actual_result['next'] is None | assert actual_result['next'] is None | ||||
assert len(actual_contents) == len(get_sha1s) | assert len(actual_contents) == len(get_sha1s) | ||||
expected_contents = [c for c in swh_contents | expected_contents = [c for c in swh_contents | ||||
if c['status'] != 'absent'] | if c['status'] != 'absent'] | ||||
self.assert_contents_ok( | assert_contents_ok( | ||||
expected_contents, actual_contents, ['sha1']) | expected_contents, actual_contents, ['sha1']) | ||||
def test_generate_content_get_range_limit(self, swh_storage, swh_contents):
    """content_get_range paginates results if limit exceeded"""
    by_sha1 = {content['sha1']: content for content in swh_contents}

    # sorted sha1s of the contents expected back from the storage
    present_sha1s = sorted(content['sha1'] for content in swh_contents
                           if content['status'] != 'absent')
    first_sha1, last_sha1 = present_sha1s[0], present_sha1s[-1]

    # ask for one content fewer than the range holds
    page_size = len(present_sha1s) - 1
    first_page = swh_storage.content_get_range(
        first_sha1, last_sha1, limit=page_size)
    first_contents = first_page['contents']

    assert first_page['next'] == last_sha1
    assert len(first_contents) == page_size
    assert_contents_ok(
        [by_sha1[sha1] for sha1 in present_sha1s[:-1]],
        first_contents, ['sha1'])

    # the left-over content is then fetched on its own
    second_page = swh_storage.content_get_range(
        start=last_sha1, end=last_sha1)
    assert second_page['next'] is None

    second_contents = second_page['contents']
    assert len(second_contents) == 1
    assert_contents_ok(
        [by_sha1[last_sha1]], second_contents, ['sha1'])
def test_origin_get_range_from_zero(self, swh_storage, swh_origins): | def test_origin_get_range_from_zero(self, swh_storage, swh_origins): | ||||
actual_origins = list( | actual_origins = list( | ||||
swh_storage.origin_get_range(origin_from=0, | swh_storage.origin_get_range(origin_from=0, | ||||
origin_count=0)) | origin_count=0)) | ||||
assert len(actual_origins) == 0 | assert len(actual_origins) == 0 | ||||
▲ Show 20 Lines • Show All 350 Lines • Show Last 20 Lines |