diff --git a/swh/objstorage/backends/winery/objstorage.py b/swh/objstorage/backends/winery/objstorage.py --- a/swh/objstorage/backends/winery/objstorage.py +++ b/swh/objstorage/backends/winery/objstorage.py @@ -12,6 +12,7 @@ from .roshard import ROShard from .rwshard import RWShard from .sharedbase import SharedBase +from .stats import Stats logger = logging.getLogger(__name__) @@ -90,6 +91,7 @@ class Packer: def __init__(self, shard, **kwargs): + self.stats = Stats(kwargs.get("output_dir")) self.args = kwargs self.shard = shard self.init() @@ -106,6 +108,9 @@ self.ro.create(self.rw.count()) for obj_id, content in self.rw.all(): self.ro.add(content, obj_id) + if self.stats.stats_active: + self.stats.stats_read(obj_id, content) + self.stats.stats_write(obj_id, content) self.ro.save() base = SharedBase(**self.args) base.shard_packing_ends(self.shard) diff --git a/swh/objstorage/backends/winery/roshard.py b/swh/objstorage/backends/winery/roshard.py --- a/swh/objstorage/backends/winery/roshard.py +++ b/swh/objstorage/backends/winery/roshard.py @@ -21,7 +21,7 @@ self.args = kwargs self.rbd = sh.sudo.bake("rbd", f"--pool={self.name}") self.ceph = sh.sudo.bake("ceph") - self.image_size = self.args["shard_max_size"] * 2 + self.image_size = int((self.args["shard_max_size"] * 2) / (1024 * 1024)) def image_list(self): try: diff --git a/swh/objstorage/backends/winery/stats.py b/swh/objstorage/backends/winery/stats.py new file mode 100644 --- /dev/null +++ b/swh/objstorage/backends/winery/stats.py @@ -0,0 +1,87 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging +import os +import time + +logger = logging.getLogger(__name__) + + +class Stats: + def __init__(self, d): + if d is None: + self._stats_active = False + return + + self._stats_active = True + if not os.path.exists(d): + os.makedirs(d) + self._stats_filename = f"{d}/{os.getpid()}.csv" + self._stats_fd = open(self.stats_filename, "w") + self._stats_fd.write( + # time in seconds since epoch + "time," + # total number of objects written at this point in time + "object_write_count," + # total number of bytes written at this point in time + "bytes_write," + # total number of objects read at this point in time + "object_read_count," + # total number of bytes read at this point in time + "bytes_read" + "\n" + ) + self._stats_fd.flush() + self._stats_last_write = time.monotonic() + self._stats_flush_interval = 5 + self._stats = { + "object_write_count": 0, + "bytes_write": 0, + "object_read_count": 0, + "bytes_read": 0, + } + + @property + def stats_active(self): + return self._stats_active + + @property + def stats_filename(self): + return self._stats_filename + + def __del__(self): + if self.stats_active and not self._stats_fd.closed: + self._stats_print() + self._stats_fd.close() + + def _stats_print(self): + ll = ",".join( + str(self._stats[x]) + for x in [ + "object_write_count", + "bytes_write", + "object_read_count", + "bytes_read", + ] + ) + self._stats_fd.write(f"{int(time.monotonic())},{ll}\n") + self._stats_fd.flush() + + def _stats_maybe_print(self): + now = time.monotonic() + if now - self._stats_last_write > self._stats_flush_interval: + self._stats_print() + self._stats_last_write = now + + def stats_read(self, key, content): + self._stats["object_read_count"] += 1 + self._stats["bytes_read"] += len(key) + len(content) + self._stats_maybe_print() + + def stats_write(self, key, content): + self._stats["object_write_count"] += 1 + self._stats["bytes_write"] += len(key) + len(content) + self._stats_maybe_print() diff --git a/swh/objstorage/tests/conftest.py b/swh/objstorage/tests/conftest.py --- a/swh/objstorage/tests/conftest.py +++ b/swh/objstorage/tests/conftest.py @@ -3,6 +3,11 @@ def pytest_addoption(parser): + parser.addoption( + "--winery-bench-output-directory", + help="Directory in which the performance results are stored", + default="/tmp/winery", + ) parser.addoption( "--winery-bench-rw-workers", type=int, diff --git a/swh/objstorage/tests/test_objstorage_winery.py b/swh/objstorage/tests/test_objstorage_winery.py --- a/swh/objstorage/tests/test_objstorage_winery.py +++ b/swh/objstorage/tests/test_objstorage_winery.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import os import time import pytest @@ -11,6 +12,7 @@ from swh.objstorage import exc from swh.objstorage.backends.winery.database import DatabaseAdmin from swh.objstorage.backends.winery.objstorage import Packer, pack +from swh.objstorage.backends.winery.stats import Stats from swh.objstorage.backends.winery.throttler import ( BandwidthCalculator, IOThrottler, @@ -206,6 +208,7 @@ f":@{postgresql.info.host}:{postgresql.info.port}" ) kwargs = { + "output_dir": pytestconfig.getoption("--winery-bench-output-directory"), "rw_workers": pytestconfig.getoption("--winery-bench-rw-workers"), "ro_workers": pytestconfig.getoption("--winery-bench-ro-workers"), "shard_max_size": pytestconfig.getoption("--winery-shard-max-size"), @@ -218,7 +221,8 @@ "throttle_read": pytestconfig.getoption("--winery-bench-throttle-read"), "throttle_write": pytestconfig.getoption("--winery-bench-throttle-write"), } - assert await Bench(kwargs).run() == kwargs["rw_workers"] + kwargs["ro_workers"] + count = await Bench(kwargs).run() + assert count > 0 @pytest.mark.asyncio @@ -372,3 +376,21 @@ assert t.throttle_add(writer, key, content) is True assert t.throttle_get(reader, key) == content + + +def test_winery_stats(tmpdir): + s = Stats(None) + assert s.stats_active is False + s = Stats(tmpdir / "stats") + assert s.stats_active is True + assert os.path.exists(s.stats_filename) + size = os.path.getsize(s.stats_filename) + s._stats_flush_interval = 0 + k = "KEY" + v = "CONTENT" + s.stats_read(k, v) + s.stats_write(k, v) + s.stats_read(k, v) + s.stats_write(k, v) + s.__del__() + assert os.path.getsize(s.stats_filename) > size diff --git a/swh/objstorage/tests/winery_benchmark.py b/swh/objstorage/tests/winery_benchmark.py --- a/swh/objstorage/tests/winery_benchmark.py +++ b/swh/objstorage/tests/winery_benchmark.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,6 +10,9 @@ import random import time +import psycopg2 + +from swh.objstorage.backends.winery.stats import Stats from swh.objstorage.factory import get_objstorage logger = logging.getLogger(__name__) @@ -19,8 +22,9 @@ return Worker(args).run(kind) -class Worker(object): +class Worker: def __init__(self, args): + self.stats = Stats(args.get("output_dir")) self.args = args def run(self, kind): @@ -28,6 +32,14 @@ return kind def ro(self): + try: + self._ro() + except psycopg2.OperationalError: + # It may happen when the database is dropped, just + # conclude the read loop gracefully and move on + pass + + def _ro(self): self.storage = get_objstorage( cls="winery", readonly=True, @@ -36,6 +48,7 @@ shard_max_size=self.args["shard_max_size"], throttle_read=self.args["throttle_read"], throttle_write=self.args["throttle_write"], + output_dir=self.args.get("output_dir"), ) with self.storage.winery.base.db.cursor() as c: while True: @@ -52,7 +65,10 @@ start = time.time() for row in c: obj_id = row[0].tobytes() - assert self.storage.get(obj_id) is not None + content = self.storage.get(obj_id) + assert content is not None + if self.stats.stats_active: + self.stats.stats_read(obj_id, content) elapsed = time.time() - start logger.info(f"Worker(ro, {os.getpid()}): finished ({elapsed:.2f}s)") @@ -78,6 +94,7 @@ shard_max_size=self.args["shard_max_size"], throttle_read=self.args["throttle_read"], throttle_write=self.args["throttle_write"], + output_dir=self.args.get("output_dir"), ) self.payloads_define() random_content = open("/dev/urandom", "rb") @@ -86,7 +103,9 @@ count = 0 while len(self.storage.winery.packers) == 0: content = random_content.read(random.choice(self.payloads)) - self.storage.add(content=content) + obj_id = self.storage.add(content=content) + if self.stats.stats_active: + self.stats.stats_write(obj_id, content) count += 1 logger.info(f"Worker(rw, {os.getpid()}): packing {count} objects") packer = self.storage.winery.packers[0] diff --git a/winery-test-environment/README.md b/winery-test-environment/README.md --- a/winery-test-environment/README.md +++ b/winery-test-environment/README.md @@ -65,6 +65,20 @@ * ssh -i context/cluster_key -F context/ssh-config ceph1 +# Run the benchmarks + +The `tox -e winery` command is used to run the benchmarks with the desired parameters. Upon completion the raw data can be found in the `winery-test-environment/context/stats` directory and is displayed on the standard output as well as rendered in a graph, if a display is available (see the `winery-test-environment/render-stats.py` for the details). + +### Example + +* tox -e winery -- -s --log-cli-level=INFO -vvv -k test_winery_bench_real --winery-bench-duration 30 --winery-shard-max-size $((10 * 1024 * 1024)) --winery-bench-ro-worker-max-request 2000 + +### Get all benchmark flags + +Run the following command and look for flags that start with `--winery-bench-` + +* tox -e winery -- --help + # Destroy ## libvirt diff --git a/winery-test-environment/remote-tox.sh b/winery-test-environment/remote-tox.sh --- a/winery-test-environment/remote-tox.sh +++ b/winery-test-environment/remote-tox.sh @@ -16,16 +16,28 @@ fi } -function copy() { +function copy_to() { RSYNC_RSH="$SSH" rsync -av --exclude=.mypy_cache --exclude=.coverage --exclude=.eggs --exclude=swh.objstorage.egg-info --exclude=winery-test-environment/context --exclude=.tox --exclude='*~' --exclude=__pycache__ --exclude='*.py[co]' $(git rev-parse --show-toplevel)/ debian@ceph1:/home/debian/swh-objstorage/ } +function copy_from() { + RSYNC_RSH="$SSH" rsync -av --delete debian@ceph1:/tmp/winery/ ${DIR}/context/stats/ +} + +function render() { + python ${DIR}/render-stats.py ${DIR}/context/stats/ +} + function run() { sanity_check || return 1 - copy || return 1 + copy_to || return 1 $SSH -t debian@ceph1 bash -c "'cd swh-objstorage ; ../venv/bin/tox -e py3 -- -k test_winery $*'" || return 1 + + copy_from || return 1 + + render || return 1 } run "$@" diff --git a/winery-test-environment/render-stats.py b/winery-test-environment/render-stats.py new file mode 100644 --- /dev/null +++ b/winery-test-environment/render-stats.py @@ -0,0 +1,59 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import sys + +from matplotlib import pyplot as plt +from matplotlib.ticker import FormatStrFormatter +import pandas as pd + + +def human(size, unit): + if size < 1024: + return f"{int(size)} {unit}/s" + elif size / 1024 < 1024: + return f"{round(size/1024, 1)} K{unit}/s" + elif size / (1024 * 1024) < 1024: + return f"{round(size / (1024 * 1024), 1)} M{unit}/s" + elif size / (1024 * 1024 * 1024) < 1024: + return f"{round(size / (1024 * 1024 * 1024), 1)} G{unit}/s" + + +def read_stats(stats): + dfs = [] + files = os.listdir(stats) + for file in files: + f = f"{stats}/{file}" + if not os.path.isfile(f): + continue + dfs.append(pd.read_csv(f)) + df = pd.concat(dfs) + df.set_index("time") + return df.sort_values(by=["time"]) + + +def main(stats): + df = read_stats(stats) + print(df) + t = df["time"].to_numpy() + sec = t[-1] - t[0] + a = df.sum() / sec + print(f'Bytes write {human(a["bytes_write"], "B")}') + print(f'Objects write {human(a["object_write_count"], "object")}') + print(f'Bytes read {human(a["bytes_read"], "B")}') + print(f'Objects read {human(a["object_read_count"], "object")}') + + df["date"] = pd.to_datetime(df["time"], unit="s") + + p = df.plot(x="time", y=["bytes_write", "bytes_read"]) + p.set_xlabel("Time") + p.yaxis.set_major_formatter(FormatStrFormatter("%.0f")) + p.set_ylabel("B/s") + plt.show() + + +if __name__ == "__main__": + main(sys.argv[1]) diff --git a/winery-test-environment/requirements.txt b/winery-test-environment/requirements.txt --- a/winery-test-environment/requirements.txt +++ b/winery-test-environment/requirements.txt @@ -1,2 +1,5 @@ ansible mitogen +pandas +matplotlib +PyQt5