test_tenacious.py
No OneTemporary
Actions

Size

13 KB

Subscribers

None

test_tenacious.py
View Options

	# Copyright (C) 2020-2021 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	from collections import Counter
	from contextlib import contextmanager
	from unittest.mock import patch

	import attr
	import pytest

	from swh.model import model
	from swh.model.tests.swh_model_data import TEST_OBJECTS
	from swh.storage import get_storage
	from swh.storage.in_memory import InMemoryStorage
	from swh.storage.proxies.tenacious import TenaciousProxyStorage
	from swh.storage.tests.storage_data import StorageData
	from swh.storage.tests.storage_tests import (
	TestStorageGeneratedData as _TestStorageGeneratedData,
	)
	from swh.storage.tests.storage_tests import TestStorage as _TestStorage # noqa
	from swh.storage.utils import now

	# Sample data set shared by this module (also used to build ``testdata``).
	data = StorageData()

	# Attributes of ``data`` indexed by the name of the object type they hold.
	collections = dict(
	    origin=data.origins,
	    content=data.contents,
	    skipped_content=data.skipped_contents,
	    revision=data.revisions,
	    directory=data.directories,
	    release=data.releases,
	    snapshot=data.snapshots,
	)
	# generic storage tests (using imported TestStorage* classes)


	@pytest.fixture
	def swh_storage_backend_config2():
	    """Yield a ``"memory"`` storage configuration with a ``"memory"`` journal writer."""
	    journal_writer_config = {"cls": "memory"}
	    backend_config = {"cls": "memory", "journal_writer": journal_writer_config}
	    yield backend_config


	@pytest.fixture
	def swh_storage():
	    """Build a ``tenacious`` -> ``memory`` storage pipeline for the generic tests.

	    The journal writer of the wrapped storage is also exposed as
	    ``journal_writer`` on the returned (outermost) storage object.
	    """
	    tenacious_step = {"cls": "tenacious"}
	    memory_step = {"cls": "memory", "journal_writer": {"cls": "memory"}}

	    storage = get_storage(cls="pipeline", steps=[tenacious_step, memory_step])
	    # make the inner storage's journal writer reachable from the proxy
	    storage.journal_writer = storage.storage.journal_writer
	    return storage


	class TestTenaciousStorage(_TestStorage):
	    """Generic storage tests inherited from the imported ``TestStorage`` class.

	    The overrides below only disable inherited tests; each skip reason
	    explains why.
	    """

	    @pytest.mark.skip(
	        reason=(
	            'The "person" table of the pgsql is a legacy thing, and not '
	            "supported by the cassandra/in-memory backend."
	        )
	    )
	    def test_person_fullname_unicity(self):
	        pass

	    @pytest.mark.skip(reason="No collision with the tenacious storage")
	    def test_content_add_collision(self, swh_storage, sample_data):
	        pass

	    @pytest.mark.skip(reason="No collision with the tenacious storage")
	    def test_content_add_metadata_collision(self, swh_storage, sample_data):
	        pass

	    @pytest.mark.skip(reason="content_update is not implemented")
	    def test_content_update(self):
	        pass

	    @pytest.mark.skip(reason="Not supported by Cassandra/InMemory storage")
	    def test_origin_count(self):
	        pass


	class TestTenaciousStorageGeneratedData(_TestStorageGeneratedData):
	    """Generated-data storage tests inherited from ``TestStorageGeneratedData``.

	    All the ``origin_count`` tests are skipped, with the reason given on
	    each marker.
	    """

	    @pytest.mark.skip(reason="Not supported by Cassandra/InMemory")
	    def test_origin_count(self):
	        pass

	    @pytest.mark.skip(reason="Not supported by Cassandra/InMemory")
	    def test_origin_count_with_visit_no_visits(self):
	        pass

	    @pytest.mark.skip(reason="Not supported by Cassandra/InMemory")
	    def test_origin_count_with_visit_with_visits_and_snapshot(self):
	        pass

	    @pytest.mark.skip(reason="Not supported by Cassandra/InMemory")
	    def test_origin_count_with_visit_with_visits_no_snapshot(self):
	        pass


	# specific tests for the tenacious behavior


	def get_tenacious_storage(**config):
	    """Return a ``validate`` -> ``tenacious`` -> ``memory`` storage pipeline.

	    Args:
	        **config: extra settings merged into the ``tenacious`` step
	            configuration (e.g. ``error_rate_limit``).

	    Returns:
	        Whatever ``get_storage`` builds for that pipeline configuration.
	    """
	    steps = [
	        {"cls": "validate"},
	        {"cls": "tenacious", **config},
	        {"cls": "memory"},
	    ]
	    return get_storage(cls="pipeline", steps=steps)


	@contextmanager
	def disabled_validators():
	    """Context manager that turns attrs validators off for the ``with`` body.

	    The switch is global (``attr.set_run_validators``). The state found on
	    entry is restored on exit, including when the body raises, so a failing
	    test cannot leave validators disabled for the tests that follow and
	    nested uses do not re-enable validators too early.
	    """
	    previous = attr.get_run_validators()
	    attr.set_run_validators(False)
	    try:
	        yield
	    finally:
	        # always undo the global switch, even if the body raised
	        attr.set_run_validators(previous)


	def popid(d):
	    """Remove the ``"id"`` entry from ``d`` in place and return ``d`` itself.

	    Raises:
	        KeyError: if ``d`` has no ``"id"`` key.
	    """
	    del d["id"]
	    return d


	# Parameter sets for the tenacious tests. Each one holds, in order:
	# (object_type, add_func_name, objects, bad1, bad2), where bad1 and bad2
	# are the objects LimitedInMemoryStorage is set up to refuse.
	testdata = [
	    pytest.param(
	        "content",
	        "content_add",
	        list(TEST_OBJECTS["content"]),
	        attr.evolve(model.Content.from_data(data=b"too big"), length=1000),
	        attr.evolve(model.Content.from_data(data=b"to fail"), length=1000),
	        id="content",
	    ),
	    pytest.param(
	        "content",
	        "content_add_metadata",
	        [attr.evolve(cnt, ctime=now()) for cnt in TEST_OBJECTS["content"]],
	        attr.evolve(
	            model.Content.from_data(data=b"too big"), length=1000, ctime=now()
	        ),
	        attr.evolve(
	            model.Content.from_data(data=b"to fail"), length=1000, ctime=now()
	        ),
	        id="content_metadata",
	    ),
	    pytest.param(
	        "skipped_content",
	        "skipped_content_add",
	        list(TEST_OBJECTS["skipped_content"]),
	        attr.evolve(
	            model.SkippedContent.from_data(data=b"too big", reason="too big"),
	            length=1000,
	        ),
	        attr.evolve(
	            model.SkippedContent.from_data(data=b"to fail", reason="to fail"),
	            length=1000,
	        ),
	        id="skipped_content",
	    ),
	    # The remaining object types all follow the same pattern: the objects
	    # come from TEST_OBJECTS and the two bad objects from ``data``.
	    *(
	        pytest.param(
	            type_name,
	            add_name,
	            list(TEST_OBJECTS[type_name]),
	            first_bad,
	            second_bad,
	            id=type_name,
	        )
	        for type_name, add_name, first_bad, second_bad in (
	            ("directory", "directory_add", data.directory, data.directory2),
	            ("revision", "revision_add", data.revision, data.revision2),
	            ("release", "release_add", data.release, data.release2),
	            ("snapshot", "snapshot_add", data.snapshot, data.complete_snapshot),
	            ("origin", "origin_add", data.origin, data.origin2),
	        )
	    ),
	]


	class LimitedInMemoryStorage(InMemoryStorage):
	    """InMemoryStorage that refuses some objects and counts ``*_add`` calls.

	    Every overridden ``*_add`` method goes through ``_maybe_add``, which
	    increments ``add_calls`` for the object type and raises ``ValueError``
	    when the batch contains one of the ``forbidden`` objects.
	    """

	    # forbidden are 'bad1' and 'bad2' arguments of `testdata`
	    forbidden = [x[0][3] for x in testdata] + [x[0][4] for x in testdata]

	    def __init__(self, *args, **kw):
	        """Forward all constructor arguments to ``InMemoryStorage`` unchanged."""
	        # Created before delegating: the parent constructor may call reset(),
	        # which needs add_calls to exist -- TODO confirm against InMemoryStorage.
	        self.add_calls = Counter()
	        super().__init__(*args, **kw)

	    def reset(self):
	        """Reset the parent storage and clear the add-call counters."""
	        super().reset()
	        self.add_calls.clear()

	    def content_add(self, contents):
	        return self._maybe_add(super().content_add, "content", contents)

	    def content_add_metadata(self, contents):
	        # counted under "content", like content_add
	        return self._maybe_add(super().content_add_metadata, "content", contents)

	    def skipped_content_add(self, skipped_contents):
	        return self._maybe_add(
	            super().skipped_content_add, "skipped_content", skipped_contents
	        )

	    def origin_add(self, origins):
	        return self._maybe_add(super().origin_add, "origin", origins)

	    def directory_add(self, directories):
	        return self._maybe_add(super().directory_add, "directory", directories)

	    def revision_add(self, revisions):
	        return self._maybe_add(super().revision_add, "revision", revisions)

	    def release_add(self, releases):
	        return self._maybe_add(super().release_add, "release", releases)

	    def snapshot_add(self, snapshots):
	        return self._maybe_add(super().snapshot_add, "snapshot", snapshots)

	    def _maybe_add(self, add_func, object_type, objects):
	        """Count the call, then add ``objects`` unless one of them is forbidden.

	        Args:
	            add_func: the parent-class add method to delegate to.
	            object_type: key under which the call is counted in ``add_calls``.
	            objects: iterable of objects to insert.

	        Returns:
	            The result of ``add_func(objects)``.

	        Raises:
	            ValueError: if any object is in ``forbidden``; the second argument
	                of the exception lists the ``unique_key()`` of each such object.
	        """
	        self.add_calls[object_type] += 1
	        # Materialize once: the batch is scanned here and then handed to
	        # add_func, which would receive nothing from a one-shot iterator.
	        objects = list(objects)
	        rejected = [c for c in objects if c in self.forbidden]
	        if rejected:
	            raise ValueError(
	                f"{object_type} is forbidden",
	                [c.unique_key() for c in rejected],
	            )
	        return add_func(objects)


	@patch("swh.storage.in_memory.InMemoryStorage", LimitedInMemoryStorage)
	@pytest.mark.parametrize("object_type, add_func_name, objects, bad1, bad2", testdata)
	def test_tenacious_proxy_storage(object_type, add_func_name, objects, bad1, bad2):
	    """Check add summaries and backend call counts for batches with bad objects.

	    ``bad1`` and ``bad2`` are refused by ``LimitedInMemoryStorage``; they are
	    placed at various positions of the batch, and each scenario checks the
	    ``<type>:add`` / ``<type>:add:errors`` counters of the returned summary
	    as well as the number of add calls that reached the backend.
	    """
	    storage = get_tenacious_storage()
	    tenacious = storage.storage
	    in_memory = tenacious.storage
	    assert isinstance(tenacious, TenaciousProxyStorage)
	    assert isinstance(in_memory, LimitedInMemoryStorage)

	    size = len(objects)
	    half, third = size // 2, size // 3

	    add_func = getattr(storage, add_func_name)
	    added_key = f"{object_type}:add"
	    errors_key = f"{object_type}:add:errors"

	    def check(batch, added, errors, calls, exact=False):
	        # Insert `batch`, check the summary and the backend call counter
	        # (exactly or as a lower bound), then reset both storage layers.
	        summary = add_func(batch)
	        assert summary.get(added_key, 0) == added
	        assert summary.get(errors_key, 0) == errors
	        nb_calls = storage.add_calls[object_type]
	        if exact:
	            assert nb_calls == calls
	        else:
	            assert nb_calls >= calls
	        in_memory.reset()
	        tenacious.reset()

	    # Note: when checking the LimitedInMemoryStorage.add_calls counter, it's
	    # hard to guess the exact number of calls in the end (it depends on the
	    # size of the batch and the position of the bad objects in it). So we
	    # only check a lower limit of the form (n + m), where n is the minimum
	    # expected number of additions (due to the batch being split), and m
	    # accounts for bad objects being tried (individually) several (3) times
	    # before giving up. So for one bad object, m is 3; for 2 bad objects,
	    # m is 6, etc.

	    # no bad object at all
	    check(objects, added=size, errors=0, calls=1 + 0, exact=True)

	    # bad1 is the last element
	    check(objects + [bad1], added=size, errors=1, calls=2 + 3)

	    # bad1 and bad2 are the last elements
	    check(objects + [bad1, bad2], added=size, errors=2, calls=3 + 6)

	    # bad1 is the first element
	    check([bad1] + objects, added=size, errors=1, calls=2 + 3)

	    # bad1 and bad2 are the first elements
	    check([bad1, bad2] + objects, added=size, errors=2, calls=3 + 6)

	    # bad1 is in the middle of the list of inserted elements
	    check(
	        objects[:half] + [bad1] + objects[half:],
	        added=size,
	        errors=1,
	        calls=3 + 3,
	    )

	    # bad1 and bad2 are together in the middle of the list of inserted elements
	    check(
	        objects[:half] + [bad1, bad2] + objects[half:],
	        added=size,
	        errors=2,
	        calls=3 + 6,
	    )

	    # bad1 and bad2 are spread in the middle of the list of inserted elements
	    check(
	        objects[:third]
	        + [bad1]
	        + objects[third : 2 * third]
	        + [bad2]
	        + objects[2 * third :],
	        added=size,
	        errors=2,
	        calls=3 + 6,
	    )

	    # bad1 is the only element
	    check([bad1], added=0, errors=1, calls=0 + 3, exact=True)

	    # bad1 and bad2 are the only elements
	    check([bad1, bad2], added=0, errors=2, calls=1 + 6, exact=True)


	@patch("swh.storage.in_memory.InMemoryStorage", LimitedInMemoryStorage)
	@pytest.mark.parametrize("object_type, add_func_name, objects, bad1, bad2", testdata)
	def test_tenacious_proxy_storage_rate_limit(
	object_type, add_func_name, objects, bad1, bad2
	):
	# Exercise the tenacious proxy configured with
	# error_rate_limit={"errors": 1, "window_size": 3}: batches with at most one
	# bad object are expected to be inserted normally, while two consecutive bad
	# objects are expected to raise RuntimeError("Too many insertion errors").
	# bad1 and bad2 are objects that LimitedInMemoryStorage refuses to add.
	storage = get_tenacious_storage(error_rate_limit={"errors": 1, "window_size": 3})
	tenacious = storage.storage
	in_memory = tenacious.storage
	assert isinstance(tenacious, TenaciousProxyStorage)
	assert isinstance(in_memory, LimitedInMemoryStorage)

	size = len(objects)

	add_func = getattr(storage, add_func_name)

	# with no insertion failure, no impact
	s = add_func(objects)
	assert s.get(f"{object_type}:add", 0) == size
	assert s.get(f"{object_type}:add:errors", 0) == 0
	# reset both layers so each scenario starts from a clean state
	in_memory.reset()
	tenacious.reset()

	# with one insertion failure, no impact
	s = add_func([bad1] + objects)
	assert s.get(f"{object_type}:add", 0) == size
	assert s.get(f"{object_type}:add:errors", 0) == 1
	in_memory.reset()
	tenacious.reset()

	# same with the single bad object in the middle of the batch
	s = add_func(objects[: size // 2] + [bad1] + objects[size // 2 :])
	assert s.get(f"{object_type}:add", 0) == size
	assert s.get(f"{object_type}:add:errors", 0) == 1
	in_memory.reset()
	tenacious.reset()

	# with two consecutive insertion failures, exception is raised
	with pytest.raises(RuntimeError, match="Too many insertion errors"):
	add_func([bad1, bad2] + objects)
	in_memory.reset()
	tenacious.reset()

	# NOTE(review): indentation is lost in this copy of the file, so the extent
	# of the `if size > 2:` body cannot be read from the text. The last scenario
	# presumably belongs to it as well (for size <= 2, size // 3 is 0 and bad1
	# and bad2 end up adjacent) -- confirm against the upstream file.
	if size > 2:
	# with two consecutive insertion failures, exception is raised
	# (errors not at the beginning)
	with pytest.raises(RuntimeError, match="Too many insertion errors"):
	add_func(objects[: size // 2] + [bad1, bad2] + objects[size // 2 :])
	in_memory.reset()
	tenacious.reset()

	# with two non-consecutive insertion failures, no impact
	# (errors are far enough to not reach the rate limit)
	s = add_func(
	objects[: size // 3]
	+ [bad1]
	+ objects[size // 3 : 2 * (size // 3)]
	+ [bad2]
	+ objects[2 * (size // 3) :]
	)
	assert s.get(f"{object_type}:add", 0) == size
	assert s.get(f"{object_type}:add:errors", 0) == 2
	in_memory.reset()
	tenacious.reset()

File Metadata

Mime Type: text/x-python
Expires: Thu, Jul 3, 10:17 AM (2 w, 5 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3283968

test_tenacious.pyNo OneTemporaryActions

test_tenacious.pyView Options

File Metadata

Event Timeline

test_tenacious.py
No OneTemporary
Actions

test_tenacious.py
View Options