Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/tests/test_utils.py
# Copyright (C) 2017-2018 The Software Heritage developers | # Copyright (C) 2017-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import hashlib | |||||
import random | |||||
from swh.objstorage.exc import ObjNotFoundError | from swh.objstorage.exc import ObjNotFoundError | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes, hash_to_hex | ||||
from swh.indexer.storage import INDEXER_CFG_KEY | from swh.indexer.storage import INDEXER_CFG_KEY | ||||
BASE_TEST_CONFIG = { | BASE_TEST_CONFIG = { | ||||
'storage': { | 'storage': { | ||||
'cls': 'memory', | 'cls': 'memory', | ||||
'args': { | 'args': { | ||||
}, | }, | ||||
▲ Show 20 Lines • Show All 464 Lines • ▼ Show 20 Lines | for (snap_id, snap_branches) in SNAPSHOTS.items(): | ||||
'id': snap_id, | 'id': snap_id, | ||||
'branches': snap_branches | 'branches': snap_branches | ||||
}) | }) | ||||
storage.revision_add(REVISIONS) | storage.revision_add(REVISIONS) | ||||
storage.directory_add([{ | storage.directory_add([{ | ||||
'id': DIRECTORY_ID, | 'id': DIRECTORY_ID, | ||||
'entries': DIRECTORY, | 'entries': DIRECTORY, | ||||
}]) | }]) | ||||
for (obj_id, content) in OBJ_STORAGE_DATA.items(): | |||||
if hasattr(hashlib, 'blake2s'): | |||||
blake2s256 = hashlib.blake2s(content, digest_size=32).digest() | |||||
else: | |||||
# fallback for Python <3.6 | |||||
blake2s256 = bytes([random.randint(0, 255) for _ in range(32)]) | |||||
storage.content_add([{ | |||||
'data': content, | |||||
'length': len(content), | |||||
'status': 'visible', | |||||
'sha1': hash_to_bytes(obj_id), | |||||
'sha1_git': hash_to_bytes(obj_id), | |||||
'sha256': hashlib.sha256(content).digest(), | |||||
'blake2s256': blake2s256 | |||||
}]) | |||||
class MockStorage(): | class MockStorage(): | ||||
"""Mock a real swh-storage storage to simplify reading indexers' | """Mock a real swh-storage storage to simplify reading indexers' | ||||
outputs. | outputs. | ||||
""" | """ | ||||
def origin_get(self, id_): | def origin_get(self, id_): | ||||
▲ Show 20 Lines • Show All 155 Lines • ▼ Show 20 Lines | class CommonIndexerWithErrorsTest: | ||||
def test_wrong_unknown_configuration_tool_range(self): | def test_wrong_unknown_configuration_tool_range(self): | ||||
"""Range Indexer with unknown configuration tool fails check""" | """Range Indexer with unknown configuration tool fails check""" | ||||
if self.RangeIndexer is not None: | if self.RangeIndexer is not None: | ||||
with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): | with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): | ||||
self.RangeIndexer() | self.RangeIndexer() | ||||
class CommonContentIndexerTest: | class CommonContentIndexerTest: | ||||
def assert_results_ok(self, actual_results, expected_results=None): | def get_indexer_results(self, ids): | ||||
"""Override this for indexers that don't have a mock storage.""" | |||||
return self.indexer.idx_storage.state | |||||
def assert_results_ok(self, sha1s, expected_results=None): | |||||
sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) | |||||
for sha1 in sha1s] | |||||
actual_results = self.get_indexer_results(sha1s) | |||||
if expected_results is None: | if expected_results is None: | ||||
expected_results = self.expected_results | expected_results = self.expected_results | ||||
for indexed_data in actual_results: | for indexed_data in actual_results: | ||||
_id = indexed_data['id'] | _id = indexed_data['id'] | ||||
self.assertEqual(indexed_data, expected_results[_id]) | self.assertEqual(indexed_data, expected_results[_id]) | ||||
_tool_id = indexed_data['indexer_configuration_id'] | _tool_id = indexed_data['indexer_configuration_id'] | ||||
self.assertEqual(_tool_id, self.indexer.tool['id']) | self.assertEqual(_tool_id, self.indexer.tool['id']) | ||||
def test_index(self): | def test_index(self): | ||||
"""Known sha1 have their data indexed | """Known sha1 have their data indexed | ||||
""" | """ | ||||
sha1s = [self.id0, self.id1, self.id2] | sha1s = [self.id0, self.id1, self.id2] | ||||
# when | # when | ||||
self.indexer.run(sha1s, policy_update='update-dups') | self.indexer.run(sha1s, policy_update='update-dups') | ||||
actual_results = self.indexer.idx_storage.state | self.assert_results_ok(sha1s) | ||||
self.assertTrue(self.indexer.idx_storage.conflict_update) | |||||
self.assert_results_ok(actual_results) | |||||
# 2nd pass | # 2nd pass | ||||
self.indexer.run(sha1s, policy_update='ignore-dups') | self.indexer.run(sha1s, policy_update='ignore-dups') | ||||
self.assertFalse(self.indexer.idx_storage.conflict_update) | self.assert_results_ok(sha1s) | ||||
self.assert_results_ok(actual_results) | |||||
def test_index_one_unknown_sha1(self): | def test_index_one_unknown_sha1(self): | ||||
"""Unknown sha1 are not indexed""" | """Unknown sha1 are not indexed""" | ||||
sha1s = [self.id1, | sha1s = [self.id1, | ||||
'799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown | '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown | ||||
'800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown | '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown | ||||
# when | # when | ||||
self.indexer.run(sha1s, policy_update='update-dups') | self.indexer.run(sha1s, policy_update='update-dups') | ||||
actual_results = self.indexer.idx_storage.state | |||||
# then | # then | ||||
expected_results = { | expected_results = { | ||||
k: v for k, v in self.expected_results.items() if k in sha1s | k: v for k, v in self.expected_results.items() if k in sha1s | ||||
} | } | ||||
self.assert_results_ok(actual_results, expected_results) | self.assert_results_ok(sha1s, expected_results) | ||||
class CommonContentIndexerRangeTest: | class CommonContentIndexerRangeTest: | ||||
"""Allows to factorize tests on range indexer. | """Allows to factorize tests on range indexer. | ||||
""" | """ | ||||
def setUp(self): | |||||
self.contents = sorted(OBJ_STORAGE_DATA) | |||||
def assert_results_ok(self, start, end, actual_results, | def assert_results_ok(self, start, end, actual_results, | ||||
expected_results=None): | expected_results=None): | ||||
if expected_results is None: | if expected_results is None: | ||||
expected_results = self.expected_results | expected_results = self.expected_results | ||||
actual_results = list(actual_results) | |||||
for indexed_data in actual_results: | for indexed_data in actual_results: | ||||
_id = indexed_data['id'] | _id = indexed_data['id'] | ||||
self.assertEqual(indexed_data, expected_results[_id]) | assert isinstance(_id, bytes) | ||||
self.assertTrue(start <= _id and _id <= end) | indexed_data = indexed_data.copy() | ||||
indexed_data['id'] = hash_to_hex(indexed_data['id']) | |||||
self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)]) | |||||
self.assertTrue(start <= _id <= end) | |||||
_tool_id = indexed_data['indexer_configuration_id'] | _tool_id = indexed_data['indexer_configuration_id'] | ||||
self.assertEqual(_tool_id, self.indexer.tool['id']) | self.assertEqual(_tool_id, self.indexer.tool['id']) | ||||
def test__index_contents(self): | def test__index_contents(self): | ||||
"""Indexing contents without existing data results in indexed data | """Indexing contents without existing data results in indexed data | ||||
""" | """ | ||||
start, end = [self.contents[0], self.contents[2]] # output hex ids | _start, _end = [self.contents[0], self.contents[2]] # output hex ids | ||||
start, end = map(hashutil.hash_to_bytes, (_start, _end)) | |||||
# given | # given | ||||
actual_results = list(self.indexer._index_contents( | actual_results = list(self.indexer._index_contents( | ||||
start, end, indexed={})) | start, end, indexed={})) | ||||
self.assert_results_ok(start, end, actual_results) | self.assert_results_ok(start, end, actual_results) | ||||
def test__index_contents_with_indexed_data(self): | def test__index_contents_with_indexed_data(self): | ||||
"""Indexing contents with existing data results in less indexed data | """Indexing contents with existing data results in less indexed data | ||||
""" | """ | ||||
start, end = [self.contents[0], self.contents[2]] # output hex ids | _start, _end = [self.contents[0], self.contents[2]] # output hex ids | ||||
start, end = map(hashutil.hash_to_bytes, (_start, _end)) | |||||
data_indexed = [self.id0, self.id2] | data_indexed = [self.id0, self.id2] | ||||
# given | # given | ||||
actual_results = self.indexer._index_contents( | actual_results = self.indexer._index_contents( | ||||
start, end, indexed=set(data_indexed)) | start, end, indexed=set(map(hash_to_bytes, data_indexed))) | ||||
# craft the expected results | # craft the expected results | ||||
expected_results = self.expected_results.copy() | expected_results = self.expected_results.copy() | ||||
for already_indexed_key in data_indexed: | for already_indexed_key in data_indexed: | ||||
expected_results.pop(already_indexed_key) | expected_results.pop(already_indexed_key) | ||||
self.assert_results_ok( | self.assert_results_ok( | ||||
start, end, actual_results, expected_results) | start, end, actual_results, expected_results) | ||||
def test_generate_content_get(self): | def test_generate_content_get(self): | ||||
"""Optimal indexing should result in indexed data | """Optimal indexing should result in indexed data | ||||
""" | """ | ||||
start, end = [self.contents[0], self.contents[2]] # output hex ids | _start, _end = [self.contents[0], self.contents[2]] # output hex ids | ||||
start, end = map(hashutil.hash_to_bytes, (_start, _end)) | |||||
# given | # given | ||||
actual_results = self.indexer.run(start, end) | actual_results = self.indexer.run(start, end) | ||||
# then | # then | ||||
self.assertTrue(actual_results) | self.assertTrue(actual_results) | ||||
def test_generate_content_get_input_as_bytes(self): | def test_generate_content_get_input_as_bytes(self): | ||||
Show All 10 Lines | def test_generate_content_get_input_as_bytes(self): | ||||
start, end, skip_existing=False) | start, end, skip_existing=False) | ||||
# no already indexed data so same result as prior test | # no already indexed data so same result as prior test | ||||
# then | # then | ||||
self.assertTrue(actual_results) | self.assertTrue(actual_results) | ||||
def test_generate_content_get_no_result(self): | def test_generate_content_get_no_result(self): | ||||
"""No result indexed returns False""" | """No result indexed returns False""" | ||||
start, end = ['0000000000000000000000000000000000000000', | _start, _end = ['0000000000000000000000000000000000000000', | ||||
'0000000000000000000000000000000000000001'] | '0000000000000000000000000000000000000001'] | ||||
start, end = map(hashutil.hash_to_bytes, (_start, _end)) | |||||
# given | # given | ||||
actual_results = self.indexer.run( | actual_results = self.indexer.run( | ||||
start, end, incremental=False) | start, end, incremental=False) | ||||
# then | # then | ||||
self.assertFalse(actual_results) | self.assertFalse(actual_results) | ||||
Show All 11 Lines |