diff --git a/sysadmin/grid5000/cassandra/loader-stats/compare_results.py b/sysadmin/grid5000/cassandra/loader-stats/compare_results.py
new file mode 100644
index 0000000..70d5808
--- /dev/null
+++ b/sysadmin/grid5000/cassandra/loader-stats/compare_results.py
@@ -0,0 +1,138 @@
+"""Compare the loader statistics of several runs as decile bar charts.
+
+Usage:
+    compare_results.py NAME1 CSV1 [NAME2 CSV2 ...] OUTPUT
+
+Each CSV line is expected to look like:
+
+    <type>;<unfiltered_count>;<missing_duration>;<filtered_count>;<add_duration>
+
+For the content, directory and revision types and for each of the
+missing_duration and add_duration columns, one bar chart comparing the
+deciles of every run is saved as OUTPUT-<type>-<column>.png.
+"""
+
+import sys
+import copy
+import statistics
+import pandas as pd
+import matplotlib.pyplot as pl
+
+from typing import List, Dict
+
+
+def load_csv(filename: str) -> Dict[str, Dict[str, List]]:
+    """Load one statistics CSV file.
+
+    Returns a dict keyed by object type; each value maps a column name to
+    the list of the values read for that type, in file order.
+
+    Raises ValueError (with the file name and line number) when a line has
+    an unknown object type, too few fields or a non-numeric value.
+    """
+    template = {
+        "unfiltered_count": [],
+        "missing_duration": [],
+        "filtered_count": [],
+        "add_duration": [],
+    }
+
+    data = {
+        "content": copy.deepcopy(template),
+        "directory": copy.deepcopy(template),
+        "skipped_content": copy.deepcopy(template),
+        "revision": copy.deepcopy(template),
+        "release": copy.deepcopy(template),
+    }
+
+    print("Loading:", filename)
+
+    with open(filename) as f:
+        content = f.read().splitlines()
+
+    for line_number, line in enumerate(content, start=1):
+        line = line.strip()
+        if not line:
+            # Tolerate blank lines instead of failing on an empty type.
+            continue
+
+        fields = line.split(";")
+        try:
+            values = data[fields[0]]
+            # Convert the whole row before appending so that a bad line
+            # cannot leave the columns with different lengths.
+            row = (
+                int(fields[1]),
+                float(fields[2]),
+                int(fields[3]),
+                float(fields[4]),
+            )
+        except (KeyError, IndexError, ValueError) as err:
+            raise ValueError(
+                f"{filename}:{line_number}: malformed line {line!r}"
+            ) from err
+
+        values["unfiltered_count"].append(row[0])
+        values["missing_duration"].append(row[1])
+        values["filtered_count"].append(row[2])
+        values["add_duration"].append(row[3])
+
+    return data
+
+
+def parse_args(argv: List[str]):
+    """Split the command line into [name, filename] pairs and the output prefix.
+
+    Exits with a usage message unless argv holds at least one NAME CSV pair
+    followed by OUTPUT.
+    """
+    args = argv[1:]
+    if len(args) < 3 or len(args) % 2 == 0:
+        sys.exit(f"Usage: {argv[0]} NAME1 CSV1 [NAME2 CSV2 ...] OUTPUT")
+
+    output = args[-1]
+    files = [[args[i], args[i + 1]] for i in range(0, len(args) - 1, 2)]
+    return files, output
+
+
+def main() -> None:
+    """Load every run and save one decile bar chart per type and column."""
+    files, output = parse_args(sys.argv)
+
+    print(files)
+    print(output)
+
+    files_data = [load_csv(filename) for _, filename in files]
+    # The run names given on the command line label the bars of each run.
+    names = [name for name, _ in files]
+
+    print("generating graphs...")
+
+    for object_type in ["content", "directory", "revision"]:
+
+        print(object_type)
+
+        for op in ["missing_duration", "add_duration"]:
+            pl.close("all")
+
+            samples = [data[object_type][op] for data in files_data]
+            if any(len(sample) < 2 for sample in samples):
+                # statistics.quantiles() needs at least two data points.
+                print(f"\t not enough data for {object_type} {op}, skipped")
+                continue
+
+            quantiles = [statistics.quantiles(sample, n=10) for sample in samples]
+
+            print(quantiles)
+
+            # One row per decile, one column per run.
+            graph_data = [list(row) for row in zip(*quantiles)]
+
+            p = pd.DataFrame(graph_data, index=range(10, 100, 10), columns=names)
+            p.plot.bar(title=f"{output} - {object_type} - {op}")
+            print(f"\t saving {output}-{object_type}-{op}.png")
+            pl.savefig(f"{output}-{object_type}-{op}.png")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sysadmin/grid5000/cassandra/loader-stats/generate_stats.sh b/sysadmin/grid5000/cassandra/loader-stats/generate_stats.sh
new file mode 100755
index 0000000..4b95d6a
--- /dev/null
+++ b/sysadmin/grid5000/cassandra/loader-stats/generate_stats.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#
+# Extract the CSV statistics lines from a loader output file and summarize
+# them with stats.py.
+#
+# Usage: generate_stats.sh <name>.output
+#
+# Writes <name>.csv and <name>.stats in the current directory, then opens
+# the statistics in less.
+
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 <name>.output" >&2
+    exit 1
+fi
+
+source_file=$1
+
+# stats.py lives next to this script; do not depend on the caller's cwd.
+script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+name=$(basename "$source_file" .output)
+
+csv=${name}.csv
+stats=${name}.stats
+
+# Keep the lines containing " CSV", drop their first four ':'-separated
+# fields and everything from the first '[' onwards.
+strings "$source_file" | grep ' CSV' | cut -f5- -d: | cut -f1 -d '[' > "$csv"
+python "$script_dir/stats.py" "$csv" > "$stats"
+
+echo "Statistics generated in $stats"
+
+less "$stats"
diff --git a/sysadmin/grid5000/cassandra/loader-stats/stats.py b/sysadmin/grid5000/cassandra/loader-stats/stats.py
new file mode 100644
index 0000000..c191a5b
--- /dev/null
+++ b/sysadmin/grid5000/cassandra/loader-stats/stats.py
@@ -0,0 +1,149 @@
+"""Print summary statistics of a loader statistics CSV file.
+
+Usage:
+    stats.py CSV [GRAPH]
+
+Each CSV line is expected to look like:
+
+    <type>;<unfiltered_count>;<missing_duration>;<filtered_count>;<add_duration>
+
+The totals, means, medians and duration deciles of each object type are
+printed on stdout.  A bar chart of the add_duration deciles of the content
+and directory types is then saved as GRAPH (default: test).
+"""
+
+import sys
+import copy
+import statistics
+import pandas as pd
+import matplotlib.pyplot as pl
+
+OBJECT_TYPES = ["content", "directory", "skipped_content", "revision", "release"]
+
+
+def load(filename):
+    """Return {type: {column: [values]}} parsed from filename.
+
+    Raises ValueError (with the file name and line number) when a line has
+    an unknown object type, too few fields or a non-numeric value.
+    """
+    with open(filename) as f:
+        content = f.read().splitlines()
+
+    template = {
+        "unfiltered_count": [],
+        "missing_duration": [],
+        "filtered_count": [],
+        "add_duration": [],
+    }
+    data = {object_type: copy.deepcopy(template) for object_type in OBJECT_TYPES}
+
+    for line_number, line in enumerate(content, start=1):
+        line = line.strip()
+        if not line:
+            # Tolerate blank lines instead of failing on an empty type.
+            continue
+
+        fields = line.split(";")
+        try:
+            values = data[fields[0]]
+            # Convert the whole row before appending so that a bad line
+            # cannot leave the columns with different lengths.
+            row = (
+                int(fields[1]),
+                float(fields[2]),
+                int(fields[3]),
+                float(fields[4]),
+            )
+        except (KeyError, IndexError, ValueError) as err:
+            raise ValueError(
+                f"{filename}:{line_number}: malformed line {line!r}"
+            ) from err
+
+        values["unfiltered_count"].append(row[0])
+        values["missing_duration"].append(row[1])
+        values["filtered_count"].append(row[2])
+        values["add_duration"].append(row[3])
+
+    return data
+
+
+def print_stats(data):
+    """Print the statistics of every object type on stdout."""
+    for object_type in OBJECT_TYPES:
+        print(f"############### {object_type}")
+        unfiltered = data[object_type]["unfiltered_count"]
+        filtered = data[object_type]["filtered_count"]
+        missing = data[object_type]["missing_duration"]
+        add = data[object_type]["add_duration"]
+
+        # statistics.quantiles() needs at least two data points.
+        if len(unfiltered) > 1:
+            print(f"Number of unfiltered {object_type}: {sum(unfiltered)}")
+            print(f"Number of filtered {object_type}: {sum(filtered)}")
+            print(f"{object_type}_missing duration: {sum(missing)}")
+            print(f"{object_type}_add duration: {sum(add)}")
+            print()
+            print(f"Average unfiltered count {object_type}: {statistics.mean(unfiltered)}")
+            print(f"Average filtered count {object_type}: {statistics.mean(filtered)}")
+            print(f"Average {object_type}_missing duration: {statistics.mean(missing)}")
+            print(f"Average {object_type}_add duration: {statistics.mean(add)}")
+            print()
+            print(f"Median unfiltered count {object_type}: {statistics.median(unfiltered)}")
+            print(f"Median filtered count {object_type}: {statistics.median(filtered)}")
+            print(f"Median {object_type}_missing duration: {statistics.median(missing)}")
+            print(f"Median {object_type}_add duration: {statistics.median(add)}")
+            print()
+            print(f"Percentiles {object_type}_missing duration: {statistics.quantiles(missing, n=10)}")
+            print(f"Percentiles {object_type}_add duration: {statistics.quantiles(add, n=10)}")
+        else:
+            print("No data")
+
+        print()
+
+
+def save_graph(data, output):
+    """Save a bar chart of the add_duration deciles of content and directory."""
+    content = data["content"]["add_duration"]
+    directory = data["directory"]["add_duration"]
+    if len(content) < 2 or len(directory) < 2:
+        # statistics.quantiles() needs at least two data points.
+        print("not enough data, no graph generated")
+        return
+
+    q1 = statistics.quantiles(content, n=10)
+    q2 = statistics.quantiles(directory, n=10)
+
+    print(q1)
+    print(q2)
+
+    # One row per decile, one column per object type.
+    graph_data = [list(row) for row in zip(q1, q2)]
+
+    p = pd.DataFrame(
+        graph_data, index=range(10, 100, 10), columns=["content", "directory"]
+    )
+    p.plot.bar()
+    pl.savefig(output)
+
+
+def main():
+    """Print the statistics of the CSV file then generate the graph."""
+    if len(sys.argv) < 2:
+        sys.exit(f"Usage: {sys.argv[0]} CSV [GRAPH]")
+
+    filename = sys.argv[1]
+    # The graph used to be always saved as "test"; keep it as the default.
+    output = sys.argv[2] if len(sys.argv) > 2 else "test"
+
+    print("Reading:", filename)
+    data = load(filename)
+
+    print_stats(data)
+
+    print("generating graphs...")
+    save_graph(data, output)
+
+
+if __name__ == "__main__":
+    main()