diff --git a/sql/blob-size-stats/.gitignore b/sql/blob-size-stats/.gitignore
new file mode 100644
index 0000000..3bedf80
--- /dev/null
+++ b/sql/blob-size-stats/.gitignore
@@ -0,0 +1 @@
+blob-size-stats.csv
diff --git a/sql/blob-size-stats/analyze.py b/sql/blob-size-stats/analyze.py
new file mode 100755
index 0000000..a58194d
--- /dev/null
+++ b/sql/blob-size-stats/analyze.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+
+DATA_FILE = "blob-size-stats.csv"
+
+df = pd.read_csv(DATA_FILE, low_memory=False, skiprows=1, names=["blob size"])
+print(df.describe())
diff --git a/sql/blob-size-stats/blob-size-stats.log b/sql/blob-size-stats/blob-size-stats.log
new file mode 100644
index 0000000..56a4b26
--- /dev/null
+++ b/sql/blob-size-stats/blob-size-stats.log
@@ -0,0 +1,21 @@
+$ date -R
+Sat, 30 Jan 2021 19:40:48 +0100
+
+$ ./blob-size-stats.sh
+real 59m17,410s
+user 0m5,709s
+sys 0m3,323s
+
+$ wc -l blob-size-stats.csv
+9737167 blob-size-stats.csv
+
+$ ./analyze.py
+ blob size
+count 9.737166e+06
+mean 8.233840e+04
+std 1.207056e+06
+min 2.000000e+00
+25% 1.054000e+03
+50% 3.313000e+03
+75% 1.308800e+04
+max 1.109894e+08
diff --git a/sql/blob-size-stats/blob-size-stats.sh b/sql/blob-size-stats/blob-size-stats.sh
new file mode 100755
index 0000000..10b13d6
--- /dev/null
+++ b/sql/blob-size-stats/blob-size-stats.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+time psql --csv service=swh-replica < blob-size-stats.sql > blob-size-stats.csv
diff --git a/sql/blob-size-stats/blob-size-stats.sql b/sql/blob-size-stats/blob-size-stats.sql
new file mode 100644
index 0000000..5d5759b
--- /dev/null
+++ b/sql/blob-size-stats/blob-size-stats.sql
@@ -0,0 +1 @@
+select length from content tablesample bernoulli (0.1);