Page MenuHomeSoftware Heritage

D7030.id25490.diff
No OneTemporary

D7030.id25490.diff

diff --git a/swh/objstorage/backends/winery/objstorage.py b/swh/objstorage/backends/winery/objstorage.py
--- a/swh/objstorage/backends/winery/objstorage.py
+++ b/swh/objstorage/backends/winery/objstorage.py
@@ -12,6 +12,7 @@
from .roshard import ROShard
from .rwshard import RWShard
from .sharedbase import SharedBase
+from .stats import Stats
logger = logging.getLogger(__name__)
@@ -90,6 +91,7 @@
class Packer:
def __init__(self, shard, **kwargs):
+ self.stats = Stats(kwargs.get("output_dir"))
self.args = kwargs
self.shard = shard
self.init()
@@ -106,6 +108,9 @@
self.ro.create(self.rw.count())
for obj_id, content in self.rw.all():
self.ro.add(content, obj_id)
+ if self.stats.stats_active:
+ self.stats.stats_read(obj_id, content)
+ self.stats.stats_write(obj_id, content)
self.ro.save()
base = SharedBase(**self.args)
base.shard_packing_ends(self.shard)
diff --git a/swh/objstorage/backends/winery/roshard.py b/swh/objstorage/backends/winery/roshard.py
--- a/swh/objstorage/backends/winery/roshard.py
+++ b/swh/objstorage/backends/winery/roshard.py
@@ -21,7 +21,7 @@
self.args = kwargs
self.rbd = sh.sudo.bake("rbd", f"--pool={self.name}")
self.ceph = sh.sudo.bake("ceph")
- self.image_size = self.args["shard_max_size"] * 2
+ self.image_size = int((self.args["shard_max_size"] * 2) / (1024 * 1024))
def image_list(self):
try:
diff --git a/swh/objstorage/backends/winery/stats.py b/swh/objstorage/backends/winery/stats.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/backends/winery/stats.py
@@ -0,0 +1,87 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+import os
+import time
+
+logger = logging.getLogger(__name__)
+
+
+class Stats:
+    """Record cumulative read/write counters of a process in a CSV file.
+
+    When the directory ``d`` is None the instance is inactive: no file is
+    created and no counter exists, callers check ``stats_active`` first.
+    Otherwise ``{d}/{pid}.csv`` is created with a header line and the
+    running totals are appended at most once per flush interval.
+    """
+
+    # CSV columns following "time": totals accumulated since creation
+    _STATS_COLUMNS = (
+        "object_write_count",
+        "bytes_write",
+        "object_read_count",
+        "bytes_read",
+    )
+
+    def __init__(self, d):
+        self._stats_active = d is not None
+        if not self._stats_active:
+            return
+
+        # set before opening the file so that __del__ is safe if open() fails
+        self._stats = dict.fromkeys(self._STATS_COLUMNS, 0)
+        self._stats_flush_interval = 5
+        self._stats_last_write = time.monotonic()
+        # exist_ok: a concurrent worker may create the directory first
+        os.makedirs(d, exist_ok=True)
+        self._stats_filename = f"{d}/{os.getpid()}.csv"
+        self._stats_fd = open(self.stats_filename, "w")
+        self._stats_fd.write("time," + ",".join(self._STATS_COLUMNS) + "\n")
+        self._stats_fd.flush()
+
+    @property
+    def stats_active(self):
+        """True when a directory was given and statistics are recorded."""
+        return self._stats_active
+
+    @property
+    def stats_filename(self):
+        """Path of the CSV file (only defined when statistics are active)."""
+        return self._stats_filename
+
+    def __del__(self):
+        # _stats_fd is missing when inactive or when __init__ failed early
+        fd = getattr(self, "_stats_fd", None)
+        if fd is not None and not fd.closed:
+            self._stats_print()
+            fd.close()
+
+    def _stats_print(self):
+        """Append one line: seconds since epoch followed by the totals."""
+        ll = ",".join(str(self._stats[x]) for x in self._STATS_COLUMNS)
+        # wall clock, not monotonic: the column is documented as epoch time
+        self._stats_fd.write(f"{int(time.time())},{ll}\n")
+        self._stats_fd.flush()
+
+    def _stats_maybe_print(self):
+        """Call _stats_print if the flush interval elapsed since the last."""
+        now = time.monotonic()
+        if now - self._stats_last_write > self._stats_flush_interval:
+            self._stats_print()
+            self._stats_last_write = now
+
+    def stats_read(self, key, content):
+        """Account for one object read of len(key) + len(content) bytes."""
+        self._stats["object_read_count"] += 1
+        self._stats["bytes_read"] += len(key) + len(content)
+        self._stats_maybe_print()
+
+    def stats_write(self, key, content):
+        """Account for one object written of len(key) + len(content) bytes."""
+        self._stats["object_write_count"] += 1
+        self._stats["bytes_write"] += len(key) + len(content)
+        self._stats_maybe_print()
diff --git a/swh/objstorage/tests/conftest.py b/swh/objstorage/tests/conftest.py
--- a/swh/objstorage/tests/conftest.py
+++ b/swh/objstorage/tests/conftest.py
@@ -3,6 +3,11 @@
def pytest_addoption(parser):
+ parser.addoption(
+ "--winery-bench-output-directory",
+ help="Directory in which the performance results are stored",
+ default="/tmp/winery",
+ )
parser.addoption(
"--winery-bench-rw-workers",
type=int,
diff --git a/swh/objstorage/tests/test_objstorage_winery.py b/swh/objstorage/tests/test_objstorage_winery.py
--- a/swh/objstorage/tests/test_objstorage_winery.py
+++ b/swh/objstorage/tests/test_objstorage_winery.py
@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import os
import time
import pytest
@@ -11,6 +12,7 @@
from swh.objstorage import exc
from swh.objstorage.backends.winery.database import DatabaseAdmin
from swh.objstorage.backends.winery.objstorage import Packer, pack
+from swh.objstorage.backends.winery.stats import Stats
from swh.objstorage.backends.winery.throttler import (
BandwidthCalculator,
IOThrottler,
@@ -206,6 +208,7 @@
f":@{postgresql.info.host}:{postgresql.info.port}"
)
kwargs = {
+ "output_dir": pytestconfig.getoption("--winery-bench-output-directory"),
"rw_workers": pytestconfig.getoption("--winery-bench-rw-workers"),
"ro_workers": pytestconfig.getoption("--winery-bench-ro-workers"),
"shard_max_size": pytestconfig.getoption("--winery-shard-max-size"),
@@ -218,7 +221,8 @@
"throttle_read": pytestconfig.getoption("--winery-bench-throttle-read"),
"throttle_write": pytestconfig.getoption("--winery-bench-throttle-write"),
}
- assert await Bench(kwargs).run() == kwargs["rw_workers"] + kwargs["ro_workers"]
+ count = await Bench(kwargs).run()
+ assert count > 0
@pytest.mark.asyncio
@@ -372,3 +376,21 @@
assert t.throttle_add(writer, key, content) is True
assert t.throttle_get(reader, key) == content
+
+
+def test_winery_stats(tmpdir):
+    """Stats is inactive without a directory, otherwise it appends to a CSV."""
+    assert Stats(None).stats_active is False
+    stats = Stats(tmpdir / "stats")
+    assert stats.stats_active is True
+    path = stats.stats_filename
+    assert os.path.exists(path)
+    header_size = os.path.getsize(path)
+    stats._stats_flush_interval = 0  # print as soon as any time elapsed
+    key = "KEY"
+    content = "CONTENT"
+    for _ in range(2):
+        stats.stats_read(key, content)
+        stats.stats_write(key, content)
+    stats.__del__()  # prints the final totals and closes the file
+    assert os.path.getsize(path) > header_size
diff --git a/swh/objstorage/tests/winery_benchmark.py b/swh/objstorage/tests/winery_benchmark.py
--- a/swh/objstorage/tests/winery_benchmark.py
+++ b/swh/objstorage/tests/winery_benchmark.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -10,6 +10,9 @@
import random
import time
+import psycopg2
+
+from swh.objstorage.backends.winery.stats import Stats
from swh.objstorage.factory import get_objstorage
logger = logging.getLogger(__name__)
@@ -19,8 +22,9 @@
return Worker(args).run(kind)
-class Worker(object):
+class Worker:
def __init__(self, args):
+ self.stats = Stats(args.get("output_dir"))
self.args = args
def run(self, kind):
@@ -28,6 +32,14 @@
return kind
def ro(self):
+ try:
+ self._ro()
+ except psycopg2.OperationalError:
+ # It may happen when the database is dropped, just
+ # conclude the read loop gracefully and move on
+ pass
+
+ def _ro(self):
self.storage = get_objstorage(
cls="winery",
readonly=True,
@@ -36,6 +48,7 @@
shard_max_size=self.args["shard_max_size"],
throttle_read=self.args["throttle_read"],
throttle_write=self.args["throttle_write"],
+ output_dir=self.args.get("output_dir"),
)
with self.storage.winery.base.db.cursor() as c:
while True:
@@ -52,7 +65,10 @@
start = time.time()
for row in c:
obj_id = row[0].tobytes()
- assert self.storage.get(obj_id) is not None
+ content = self.storage.get(obj_id)
+ assert content is not None
+ if self.stats.stats_active:
+ self.stats.stats_read(obj_id, content)
elapsed = time.time() - start
logger.info(f"Worker(ro, {os.getpid()}): finished ({elapsed:.2f}s)")
@@ -78,6 +94,7 @@
shard_max_size=self.args["shard_max_size"],
throttle_read=self.args["throttle_read"],
throttle_write=self.args["throttle_write"],
+ output_dir=self.args.get("output_dir"),
)
self.payloads_define()
random_content = open("/dev/urandom", "rb")
@@ -86,7 +103,9 @@
count = 0
while len(self.storage.winery.packers) == 0:
content = random_content.read(random.choice(self.payloads))
- self.storage.add(content=content)
+ obj_id = self.storage.add(content=content)
+ if self.stats.stats_active:
+ self.stats.stats_write(obj_id, content)
count += 1
logger.info(f"Worker(rw, {os.getpid()}): packing {count} objects")
packer = self.storage.winery.packers[0]
diff --git a/winery-test-environment/README.md b/winery-test-environment/README.md
--- a/winery-test-environment/README.md
+++ b/winery-test-environment/README.md
@@ -65,6 +65,20 @@
* ssh -i context/cluster_key -F context/ssh-config ceph1
+# Run the benchmarks
+
+The `tox -e winery` command runs the benchmarks with the desired parameters. Upon completion, the raw data can be found in the `winery-test-environment/context/stats` directory; it is also printed on the standard output and, if a display is available, rendered as a graph (see `winery-test-environment/render-stats.py` for details).
+
+### Example
+
+* tox -e winery -- -s --log-cli-level=INFO -vvv -k test_winery_bench_real --winery-bench-duration 30 --winery-shard-max-size $((10 * 1024 * 1024)) --winery-bench-ro-worker-max-request 2000
+
+### Get all benchmark flags
+
+Run the following command and look for flags that start with `--winery-bench-`
+
+* tox -e winery -- --help
+
# Destroy
## libvirt
diff --git a/winery-test-environment/remote-tox.sh b/winery-test-environment/remote-tox.sh
--- a/winery-test-environment/remote-tox.sh
+++ b/winery-test-environment/remote-tox.sh
@@ -16,16 +16,28 @@
fi
}
-function copy() {
+function copy_to() {
RSYNC_RSH="$SSH" rsync -av --exclude=.mypy_cache --exclude=.coverage --exclude=.eggs --exclude=swh.objstorage.egg-info --exclude=winery-test-environment/context --exclude=.tox --exclude='*~' --exclude=__pycache__ --exclude='*.py[co]' $(git rev-parse --show-toplevel)/ debian@ceph1:/home/debian/swh-objstorage/
}
+function copy_from() {
+    # mirror the remote /tmp/winery directory into the local context/stats
+    # directory; the destination is quoted so a DIR containing spaces is safe
+    RSYNC_RSH="$SSH" rsync -av --delete debian@ceph1:/tmp/winery/ "${DIR}/context/stats/"
+}
+
+function render() {
+    # run render-stats.py on the local context/stats directory
+    # (paths are quoted so a DIR containing spaces is safe)
+    python "${DIR}/render-stats.py" "${DIR}/context/stats/"
+}
+
function run() {
sanity_check || return 1
- copy || return 1
+ copy_to || return 1
$SSH -t debian@ceph1 bash -c "'cd swh-objstorage ; ../venv/bin/tox -e py3 -- -k test_winery $*'" || return 1
+
+ copy_from || return 1
+
+ render || return 1
}
run "$@"
diff --git a/winery-test-environment/render-stats.py b/winery-test-environment/render-stats.py
new file mode 100644
--- /dev/null
+++ b/winery-test-environment/render-stats.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import sys
+
+from matplotlib import pyplot as plt
+from matplotlib.ticker import FormatStrFormatter
+import pandas as pd
+
+
+def human(size, unit):
+    """Format a rate as e.g. "1.5 KB/s", scaling by 1024 up to the G prefix."""
+    if size < 1024:
+        return f"{int(size)} {unit}/s"
+    for prefix in "KM":
+        size /= 1024
+        if size < 1024:
+            return f"{round(size, 1)} {prefix}{unit}/s"
+    # G is the largest prefix: larger values used to fall through to None
+    return f"{round(size / 1024, 1)} G{unit}/s"
+
+def read_stats(stats):
+    """Load all CSV files of directory *stats* into one frame sorted by time,
+    with a "source" column naming the file (one per process) of each row."""
+    dfs = []
+    for file in sorted(os.listdir(stats)):
+        f = f"{stats}/{file}"
+        if os.path.isfile(f):
+            dfs.append(pd.read_csv(f).assign(source=file))
+    return pd.concat(dfs).sort_values(by=["time"], kind="stable")
+
+
+def main(stats):
+    """Print the average throughput and plot the recorded byte counters."""
+    df = read_stats(stats)
+    print(df)
+    # timestamps are whole seconds: a very short run must not divide by zero
+    sec = max(df["time"].max() - df["time"].min(), 1)
+    # counters are running totals, adding every row would count them many
+    # times: only add up the highest value reached in each file
+    a = df.groupby("source").max().sum() / sec
+    print(f'Bytes write {human(a["bytes_write"], "B")}')
+    print(f'Objects write {human(a["object_write_count"], "object")}')
+    print(f'Bytes read {human(a["bytes_read"], "B")}')
+    print(f'Objects read {human(a["object_read_count"], "object")}')
+
+    # NOTE(review): these columns are running totals although labelled B/s
+    p = df.plot(x="time", y=["bytes_write", "bytes_read"])
+    p.set_xlabel("Time")
+    p.yaxis.set_major_formatter(FormatStrFormatter("%.0f"))
+    p.set_ylabel("B/s")
+    plt.show()
+
+
+if __name__ == "__main__":
+ main(sys.argv[1])
diff --git a/winery-test-environment/requirements.txt b/winery-test-environment/requirements.txt
--- a/winery-test-environment/requirements.txt
+++ b/winery-test-environment/requirements.txt
@@ -1,2 +1,5 @@
ansible
mitogen
+pandas
+matplotlib
+PyQt5

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:18 PM (5 d, 15 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219891

Event Timeline