Statistical analysis – comparison of file-system benchmarks: Btrfs, Ext4, XFS, ZFS

I will process and compare data collected from fio benchmark JSON output for the following file systems – Btrfs, Ext4, XFS, and ZFS – using performance-based metrics.

Data origin

Each benchmark was run inside a virtual machine managed with libvirt, where a different operating system was used depending on the filesystem:

  • Btrfs, Ext4 – Arch Linux
  • XFS – Gentoo
  • ZFS – FreeBSD

All benchmarks were executed on my ThinkPad X230 using the fio command via a simple bash script that created all required fio job files, launched fio for every workload type, repeated each workload 10 times, and collected the results into the ./fio_results folder (the per-filesystem results were later gathered under ./benchmark_results/<filesystem>/fio_results/).

The random and mixed jobs used the following parameters (the sequential jobs differ only in block size and thread count, as the script below shows):

  1. ioengine=sync – synchronous I/O engine: each READ or WRITE blocks until it completes.
  2. bs=4k – block size set to 4 KiB (each READ/WRITE operates on a 4 KiB block).
  3. size=1G – total amount to be processed per job: 1 gigabyte.
  4. numjobs=4 – use 4 threads for the test.
  5. runtime=60s – run duration for each test.
  6. time_based – time-based run: the test runs for the specified duration regardless of whether the file system processed the entire data amount.

The parameter that selects the workload is rw. The workload types used were:

  1. randread – random read
  2. randwrite – random write
  3. seqread – sequential read (rw=read)
  4. seqwrite – sequential write (rw=write)
  5. mixed – mixed random read/write (rw=randrw with rwmixread=70)

References:

  • fio – https://github.com/axboe/fio
  • libvirt – https://libvirt.org/
  • bash – https://www.gnu.org/software/bash/manual/bash.html
  • ThinkPad X230 – https://thinkwiki.de/X230
  • Arch Linux – https://archlinux.org/
  • Gentoo – https://www.gentoo.org/
  • FreeBSD – https://www.freebsd.org/
InΒ [Β ]:
#!/bin/bash

RESULTS_DIR="./fio_results"
mkdir -p "$RESULTS_DIR"

declare -A JOBS
JOBS["randread"]="randread.fio"
JOBS["randwrite"]="randwrite.fio"
JOBS["seqread"]="seqread.fio"
JOBS["seqwrite"]="seqwrite.fio"
JOBS["mixed"]="mixed.fio"

cat <<EOF > "${JOBS["randread"]}"
[randread]
ioengine=sync
rw=randread
bs=4k
size=1G
numjobs=4
runtime=60s
time_based
EOF

cat <<EOF > "${JOBS["randwrite"]}"
[randwrite]
ioengine=sync
rw=randwrite
bs=4k
size=1G
numjobs=4
runtime=60s
time_based
EOF

cat <<EOF > "${JOBS["seqread"]}"
[seqread]
ioengine=sync
rw=read
bs=1M
size=1G
numjobs=1
runtime=60s
time_based
EOF

cat <<EOF > "${JOBS["seqwrite"]}"
[seqwrite]
ioengine=sync
rw=write
bs=1M
size=1G
numjobs=1
runtime=60s
time_based
EOF

cat <<EOF > "${JOBS["mixed"]}"
[mixed]
ioengine=sync
rw=randrw
bs=4k
size=1G
numjobs=4
runtime=60s
time_based
rwmixread=70
EOF

for i in {0..9}; do
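    # Each of the 10 iterations stores its JSON results in its own numbered sub-directory.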
    mkdir -p "$RESULTS_DIR/${i}"
    
    for JOB in "${!JOBS[@]}"; do
        echo "Running $JOB benchmark..."
        fio "${JOBS[$JOB]}" --output-format=json --output="$RESULTS_DIR/${i}/$JOB-result.json"
    done
done

echo "All benchmarks completed. Results are stored in $RESULTS_DIR."

First Phase of Processing

In the first phase of processing, fio is invoked with the argument --output-format=json, so all resulting data is saved in JSON format.
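As a quick preview, the job-level metrics can be pulled straight out of such a file with the json module (a minimal sketch; the path below is only an example and depends on how the results were collected):

import json

with open("./benchmark_results/btrfs/fio_results/0/mixed-result.json") as f:   # example path
    data = json.load(f)

for job in data["jobs"]:
    read = job["read"]
    print(job["jobname"],
          f"read IOPS={read['iops']:.1f}",
          f"read BW={read['bw_bytes']} B/s",
          f"mean clat={read['clat_ns']['mean'] / 1000:.1f} us")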

Below is an example of the raw output for one job section, taken from ./benchmark_results/btrfs/mixed_results.json:

InΒ [Β ]:
   {
      "jobname" : "mixed",
      "groupid" : 0,
      "job_start" : 1753822420164,
      "error" : 0,
      "eta" : 0,
      "elapsed" : 61,
      "job options" : {
        "ioengine" : "libaio",
        "rw" : "randrw",
        "bs" : "4k",
        "size" : "1G",
        "numjobs" : "4",
        "runtime" : "60s",
        "time_based" : "",
        "rwmixread" : "70"
      },
      "read" : {
        "io_bytes" : 677052416,
        "io_kbytes" : 661184,
        "bw_bytes" : 11284018,
        "bw" : 11019,
        "iops" : 2754.887419,
        "runtime" : 60001,
        "total_ios" : 165296,
        "short_ios" : 0,
        "drop_ios" : 0,
        "slat_ns" : {
          "min" : 60135,
          "max" : 169270733,
          "mean" : 327892.475353,
          "stddev" : 704004.012420,
          "N" : 165296
        },
        "clat_ns" : {
          "min" : 1581,
          "max" : 2301665,
          "mean" : 6175.911782,
          "stddev" : 34244.334279,
          "N" : 165296,
          "percentile" : {
            "1.000000" : 1880,
            "5.000000" : 1960,
            "10.000000" : 1992,
            "20.000000" : 2064,
            "30.000000" : 2096,
            "40.000000" : 2160,
            "50.000000" : 2224,
            "60.000000" : 2320,
            "70.000000" : 2576,
            "80.000000" : 2864,
            "90.000000" : 3152,
            "95.000000" : 15808,
            "99.000000" : 96768,
            "99.500000" : 136192,
            "99.900000" : 370688,
            "99.950000" : 708608,
            "99.990000" : 1449984
          }
        },
        "lat_ns" : {
          "min" : 62116,
          "max" : 169288876,
          "mean" : 334068.387136,
          "stddev" : 704772.019699,
          "N" : 165296
        },
        "bw_min" : 1780,
        "bw_max" : 16104,
        "bw_agg" : 22.005418,
        "bw_mean" : 11049.655462,
        "bw_dev" : 2798.273603,
        "bw_samples" : 119,
        "iops_min" : 445,
        "iops_max" : 4026,
        "iops_mean" : 2762.285714,
        "iops_stddev" : 699.522745,
        "iops_samples" : 119
      },
      "write" : {
        "io_bytes" : 289918976,
        "io_kbytes" : 283124,
        "bw_bytes" : 4831902,
        "bw" : 4718,
        "iops" : 1179.663672,
        "runtime" : 60001,
        "total_ios" : 70781,
        "short_ios" : 0,
        "drop_ios" : 0,
        "slat_ns" : {
          "min" : 5914,
          "max" : 3033011,
          "mean" : 42302.398016,
          "stddev" : 126243.650789,
          "N" : 70781
        },
        "clat_ns" : {
          "min" : 1042,
          "max" : 2893097,
          "mean" : 6552.313234,
          "stddev" : 48773.628443,
          "N" : 70781,
          "percentile" : {
            "1.000000" : 1112,
            "5.000000" : 1128,
            "10.000000" : 1160,
            "20.000000" : 1208,
            "30.000000" : 1240,
            "40.000000" : 1288,
            "50.000000" : 1560,
            "60.000000" : 1912,
            "70.000000" : 2160,
            "80.000000" : 2288,
            "90.000000" : 2544,
            "95.000000" : 3120,
            "99.000000" : 136192,
            "99.500000" : 183296,
            "99.900000" : 667648,
            "99.950000" : 1138688,
            "99.990000" : 1712128
          }
        },
        "lat_ns" : {
          "min" : 6996,
          "max" : 3078384,
          "mean" : 48854.711250,
          "stddev" : 135555.993167,
          "N" : 70781
        },
        "bw_min" : 790,
        "bw_max" : 7102,
        "bw_agg" : 21.932277,
        "bw_mean" : 4732.731092,
        "bw_dev" : 1164.468350,
        "bw_samples" : 119,
        "iops_min" : 197,
        "iops_max" : 1775,
        "iops_mean" : 1183.058824,
        "iops_stddev" : 291.144244,
        "iops_samples" : 119
      },
      "trim" : {
        "io_bytes" : 0,
        "io_kbytes" : 0,
        "bw_bytes" : 0,
        "bw" : 0,
        "iops" : 0.000000,
        "runtime" : 0,
        "total_ios" : 0,
        "short_ios" : 0,
        "drop_ios" : 0,
        "slat_ns" : {
          "min" : 0,
          "max" : 0,
          "mean" : 0.000000,
          "stddev" : 0.000000,
          "N" : 0
        },
        "clat_ns" : {
          "min" : 0,
          "max" : 0,
          "mean" : 0.000000,
          "stddev" : 0.000000,
          "N" : 0
        },
        "lat_ns" : {
          "min" : 0,
          "max" : 0,
          "mean" : 0.000000,
          "stddev" : 0.000000,
          "N" : 0
        },
        "bw_min" : 0,
        "bw_max" : 0,
        "bw_agg" : 0.000000,
        "bw_mean" : 0.000000,
        "bw_dev" : 0.000000,
        "bw_samples" : 0,
        "iops_min" : 0,
        "iops_max" : 0,
        "iops_mean" : 0.000000,
        "iops_stddev" : 0.000000,
        "iops_samples" : 0
      },
      "sync" : {
        "total_ios" : 0,
        "lat_ns" : {
          "min" : 0,
          "max" : 0,
          "mean" : 0.000000,
          "stddev" : 0.000000,
          "N" : 0
        }
      },
      "job_runtime" : 60000,
      "usr_cpu" : 1.750000,
      "sys_cpu" : 10.748333,
      "ctx" : 261501,
      "majf" : 0,
      "minf" : 13,
      "iodepth_level" : {
        "1" : 100.000000,
        "2" : 0.000000,
        "4" : 0.000000,
        "8" : 0.000000,
        "16" : 0.000000,
        "32" : 0.000000,
        ">=64" : 0.000000
      },
      "iodepth_submit" : {
        "0" : 0.000000,
        "4" : 100.000000,
        "8" : 0.000000,
        "16" : 0.000000,
        "32" : 0.000000,
        "64" : 0.000000,
        ">=64" : 0.000000
      },
      "iodepth_complete" : {
        "0" : 0.000000,
        "4" : 100.000000,
        "8" : 0.000000,
        "16" : 0.000000,
        "32" : 0.000000,
        "64" : 0.000000,
        ">=64" : 0.000000
      },
      "latency_ns" : {
        "2" : 0.000000,
        "4" : 0.000000,
        "10" : 0.000000,
        "20" : 0.000000,
        "50" : 0.000000,
        "100" : 0.000000,
        "250" : 0.000000,
        "500" : 0.000000,
        "750" : 0.000000,
        "1000" : 0.000000
      },
      "latency_us" : {
        "2" : 26.430360,
        "4" : 67.502552,
        "10" : 0.593874,
        "20" : 1.747735,
        "50" : 1.188595,
        "100" : 1.361844,
        "250" : 0.976376,
        "500" : 0.105474,
        "750" : 0.033040,
        "1000" : 0.016944
      },
      "latency_ms" : {
        "2" : 0.040241,
        "4" : 0.010000,
        "10" : 0.000000,
        "20" : 0.000000,
        "50" : 0.000000,
        "100" : 0.000000,
        "250" : 0.000000,
        "500" : 0.000000,
        "750" : 0.000000,
        "1000" : 0.000000,
        "2000" : 0.000000,
        ">=2000" : 0.000000
      },
      "latency_depth" : 1,
      "latency_target" : 0,
      "latency_percentile" : 100.000000,
      "latency_window" : 0
    },

Processing of the data

The collected raw data needs to be filtered to retain only relevant information, then organized and converted into CSV format.

For this, we will use the following pieces of Python code, which will load all results, accumulate them, and then write all relevant information to the corresponding CSV file.

InΒ [51]:
#!/usr/bin/env python

import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import csv
import glob
import sklearn

from statsmodels.formula.api import ols
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

The processing function takes two paths: the source file with the raw JSON data and the destination file that will store the CSV result. The processing consists solely of selecting the parameters relevant for further analysis.

InΒ [52]:
def process_fio_json_to_csv(json_file_path, csv_file_path):
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    with open(csv_file_path, 'w', newline='') as csvfile:
        fieldnames = [
            'Job_Name', 'Read_IOPS', 'Read_Bandwidth', 'Read_Latency_Mean', 
            'Read_Latency_Min', 'Read_Latency_Max', 'Read_Errors',
            'Write_IOPS', 'Write_Bandwidth', 'Write_Latency_Mean', 
            'Write_Latency_Min', 'Write_Latency_Max', 'Write_Errors'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for job in data.get('jobs', []):
            job_name = job.get('jobname', 'Unknown_Job')
            
            read_metrics = job.get('read', {})
            write_metrics = job.get('write', {})

            writer.writerow({
                'Job_Name': job_name,
                'Read_IOPS': read_metrics.get('iops', 0),
                'Read_Bandwidth': read_metrics.get('bw_bytes', 0),
                'Read_Latency_Mean': read_metrics.get('clat_ns', {}).get('mean', 0) / 1000, # clat_ns is in ns; /1000 gives microseconds
                'Read_Latency_Min': read_metrics.get('clat_ns', {}).get('min', 0) / 1000,  # clat_ns is in ns; /1000 gives microseconds
                'Read_Latency_Max': read_metrics.get('clat_ns', {}).get('max', 0) / 1000,  # clat_ns is in ns; /1000 gives microseconds
                'Read_Errors': job.get('error', 0),
                'Write_IOPS': write_metrics.get('iops', 0),
                'Write_Bandwidth': write_metrics.get('bw_bytes', 0),
                'Write_Latency_Mean': write_metrics.get('clat_ns', {}).get('mean', 0) / 1000, # clat_ns is in ns; /1000 gives microseconds
                'Write_Latency_Min': write_metrics.get('clat_ns', {}).get('min', 0) / 1000,  # clat_ns is in ns; /1000 gives microseconds
                'Write_Latency_Max': write_metrics.get('clat_ns', {}).get('max', 0) / 1000,  # clat_ns is in ns; /1000 gives microseconds
                'Write_Errors': job.get('error', 0)
            })

Next, we simply apply it while iterating over the folder in which all fio benchmark results are stored.

InΒ [53]:
source_dir = './benchmark_results'
destination_dir = './semi_processed_results'

os.makedirs(destination_dir, exist_ok=True)

for filesystem in os.listdir(source_dir):
    filesystem_path = os.path.join(os.path.join(source_dir, filesystem), 'fio_results')
    
    if os.path.isdir(filesystem_path):
        for iteration in os.listdir(filesystem_path):
            iteration_file_path = os.path.join(filesystem_path, iteration)
            #print(iteration_file_path)
            for result_file in os.listdir(iteration_file_path):
                if result_file.endswith('.json'):
                    result_file_path = os.path.join(iteration_file_path, result_file)
                    processed_file_dir = os.path.join(os.path.join(destination_dir, filesystem), iteration)
                    os.makedirs(processed_file_dir, exist_ok=True)
                    processed_file_path = os.path.join(processed_file_dir, os.path.splitext(result_file)[0] + '.csv')
                
                    # print(f"result_file_path: {result_file_path}")
                    # print(f"processed_file_dir: {processed_file_dir}")
                    # print(f"processed_file_path: {processed_file_path}")
                    # print()
                
                
                    #print(f"Processing fio_result '{result_file}' from {filesystem}.")
                    process_fio_json_to_csv(result_file_path, processed_file_path)
                    #print(f"Saved processed fio_result to '{processed_file_path}'")
                    #print()

print("Processing complete.")
Processing complete.

Next, we combine all processed benchmark iterations into a single table for each file system.

InΒ [54]:
source_dir = './semi_processed_results'
destination_dir = './processed_results'

os.makedirs(destination_dir, exist_ok=True)

for filesystem in os.listdir(source_dir):
    filesystem_path = os.path.join(source_dir, filesystem)
    if not os.path.isdir(filesystem_path):
        continue
    #print("filesystem_path: ", filesystem_path)
    out_path = os.path.join(destination_dir, f'{filesystem}.csv')
    header_written = False

    for iteration in os.listdir(filesystem_path):
        iter_path = os.path.join(filesystem_path, iteration)

        #print("iter_path: ", iter_path)

        for filename in os.listdir(iter_path):
            if not filename.endswith('.csv'):
                continue

            src = os.path.join(iter_path, filename)
            with open(src, newline='') as src_csv:
                reader = csv.reader(src_csv)
                rows = list(reader)

            with open(out_path, 'a', newline='') as dst_csv:
                writer = csv.writer(dst_csv)
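                # Write the CSV header only once per filesystem; every data row gets an extra Iteration column.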
                if not header_written:
                    header = rows[0] + ['Iteration']
                    writer.writerow(header)
                    header_written = True
                for row in rows[1:]:
                    writer.writerow(row + [iteration])

print("Processing complete.")
Processing complete.

Structure of the data

The data consist of rows that contain information in several categories for every fio job that ran for the given workload.

The categories that are processed (separately for READ and WRITE):

  1. IOPS – input/output operations per second.
  2. Bandwidth (bytes/sec) – throughput in bytes per second.
  3. Latency Mean (µs) – mean latency over the whole duration of the given job (see the quick unit check after this list).
  4. Latency Min (µs) – minimum of the latency values measured over the whole duration of the given job.
  5. Latency Max (µs) – maximum of the latency values measured over the whole duration of the given job.
  6. Errors – the number of errors that occurred.
  7. Iteration – the iteration number of the benchmark for the given workload.
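
A note on the latency unit: fio reports clat_ns in nanoseconds, and the processing function divides these values by 1000, which yields microseconds. A quick check using the mean read completion latency from the JSON excerpt above:

mean_clat_ns = 6175.911782        # "clat_ns" mean of the btrfs "mixed" job shown earlier
print(mean_clat_ns / 1000, "us")  # ~6.18 microseconds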

An example of the folder ./benchmark_results/btrfs/fio_results/ processed into CSV by the script:

InΒ [55]:
df = pd.read_csv('processed_results/brtfs.csv')
df
Out[55]:
Job_Name Read_IOPS Read_Bandwidth Read_Latency_Mean Read_Latency_Min Read_Latency_Max Read_Errors Write_IOPS Write_Bandwidth Write_Latency_Mean Write_Latency_Min Write_Latency_Max Write_Errors Iteration
0 mixed 567.697743 2325289 537.2872390570001 60.455 53545.272 0 243.041899 995499 2831.60668333 8.28 465766.619 0 6
1 mixed 577.280757 2364541 547.535714591 59.938 57243.541 0 248.325056 1017139 2722.235577718 7.99 462862.361 0 6
2 mixed 562.773954 2305122 549.882368822 61.756 48186.059 0 239.846003 982409 2844.742298311 8.401 465344.578 0 6
3 mixed 560.590657 2296179 546.290920799 62.211 50474.684 0 248.21253 1016678 2762.9590984359997 8.82 450801.746 0 6
4 seqwrite 0.0 0 0.0 0.0 0.0 0 113.507986 119021749 8590.604406548999 382.103 16447037.637 0 6
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
417 randwrite 0.0 0 0.0 0.0 0.0 0 230.015472 942143 4335.342988066 6.995 480680.658 0 8
418 randwrite 0.0 0 0.0 0.0 0.0 0 230.747475 945141 4325.153932877 7.408 477860.984 0 8
419 randwrite 0.0 0 0.0 0.0 0.0 0 245.191988 1004306 4067.403516284 6.043 479348.142 0 8
420 randwrite 0.0 0 0.0 0.0 0.0 0 219.578744 899394 4541.969428095 7.506 479457.954 0 8
421 seqread 1310.717797 1374387225 646.892084237 82.43 15017.528 0 0.0 0 0.0 0.0 0.0 0 8

422 rows × 14 columns

Analysis of the Individual File Systems

The first analysis looks at each file system individually and examines how it responds to the different workloads.

First, we define a function that computes point estimates (mean and median) of the selected metrics for a given workload, together with a 95% confidence interval for Read_IOPS, plus helpers that plot the summaries and check whether the confidence intervals of different workloads overlap.
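
The interval itself is the usual t-based one: mean ± t(0.975, n-1) · s/√n. A minimal standalone sketch of the same computation (with made-up sample values standing in for the Read_IOPS of one workload):

import numpy as np
import scipy.stats as stats

sample = np.array([567.6, 574.2, 561.3, 570.9, 565.0, 572.8, 568.4, 563.7, 569.1, 566.2])  # toy data

confidence = 0.95
n = len(sample)
mean = sample.mean()
sem = sample.std(ddof=1) / np.sqrt(n)                 # standard error of the mean
h = sem * stats.t.ppf((1 + confidence) / 2., n - 1)   # half-width of the confidence interval

print(f"mean = {mean:.2f}, 95% CI = [{mean - h:.2f}, {mean + h:.2f}]")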

InΒ [56]:
def workload_summary(csv_path, workload):
    df = pd.read_csv(csv_path)
    df = df[df['Job_Name'] == workload]

    io_cols = [
        'Read_IOPS', 'Read_Bandwidth', 'Read_Latency_Mean',
        'Write_IOPS', 'Write_Bandwidth', 'Write_Latency_Mean'
    ]
    for col in io_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    stats_summary = df.agg(
        {
            'Read_IOPS': ['mean', 'median'],
            'Read_Bandwidth': ['mean'],
            'Read_Latency_Mean': ['mean'],
            'Write_IOPS': ['mean'],
            'Write_Bandwidth': ['mean'],
            'Write_Latency_Mean': ['mean']
        }
    ).reset_index()

    confidence = 0.95
    n = len(df)

    # Confidence interval for Read_IOPS
    mean = stats_summary.loc[0, 'Read_IOPS']
    std_dev = df['Read_IOPS'].std()
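    # Half-width of the 95% CI: the t-quantile (df = n-1) multiplied by the standard error of the mean.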
    h = std_dev * stats.t.ppf((1 + confidence) / 2., n - 1) / np.sqrt(n)

    stats_summary['Read_IOPS_CI_Lower'] = mean - h
    stats_summary['Read_IOPS_CI_Upper'] = mean + h

    return stats_summary

def filesystem_summary(processed_data_path, workloads, iterations):
    summaries = []
    
    for workload in workloads:
        for iteration in range(iterations):
            summary = workload_summary(processed_data_path, workload)
            summary['Workload'] = workload
            summary['Iteration'] = iteration
            summaries.append(summary)

    combined_summary = pd.concat(summaries, ignore_index=True)
    
    return combined_summary

def plot_fs_summary(combined_summary, filter_seq, metric, units, filesystem, description):

    mean_data = combined_summary[combined_summary['index'] == 'mean']
    if filter_seq:
        mean_data = mean_data[mean_data['Workload'].isin(['seqwrite', 'seqread'])]
    else:
        mean_data = mean_data[~mean_data['Workload'].isin(['seqwrite', 'seqread'])]
        
    mean_data = mean_data.drop(columns=['index'])
    final_data = mean_data.melt(id_vars=['Workload'], var_name='Metric', value_name='mean')
    final_data = final_data[final_data["Metric"].isin(metric)]
    
    plt.figure(figsize=(12, 8))
    sns.barplot(data=final_data, x='Workload', y='mean', hue='Metric', palette='viridis')

    plt.title(f'{filesystem} - {description}')
    plt.xlabel('Workload')
    plt.ylabel(f'Mean Value {units} across all jobs')
    plt.xticks(rotation=45)
    plt.legend(title=f'Metrics {units}')
    plt.tight_layout()

    plt.show()

def analyze_ci_overlaps(combined_summary):
    def check_overlap(row1, row2):
        return not (
            row1['Read_IOPS_CI_Upper'] < row2['Read_IOPS_CI_Lower'] or 
            row2['Read_IOPS_CI_Upper'] < row1['Read_IOPS_CI_Lower']
        )
        
    mean_data = combined_summary[combined_summary['index'] == 'mean']
    overlap_results = []
    for i in range(len(mean_data)):
        for j in range(i + 1, len(mean_data)):
            workload1 = mean_data.iloc[i]['Workload']
            workload2 = mean_data.iloc[j]['Workload']
            if mean_data.iloc[i]['Read_IOPS_CI_Lower'] is not None and mean_data.iloc[i]['Read_IOPS_CI_Upper'] is not None and \
               mean_data.iloc[j]['Read_IOPS_CI_Lower'] is not None and mean_data.iloc[j]['Read_IOPS_CI_Upper'] is not None:
                overlap = check_overlap(mean_data.iloc[i], mean_data.iloc[j])
                overlap_results.append((workload1, workload2, overlap))
                
    return overlap_results

fs_summaries = []
workloads = ['mixed', 'randread', 'randwrite', 'seqread', 'seqwrite']

We will calculate point estimates and a 95% confidence interval for the individual workloads. For visualization, because of the expected large difference in measured values, the workloads are split into two groups: sequential, and random plus mixed.

Btrfs

Btrfs (B-tree file system) is a modern file system for Linux; I use it personally, and it is suitable for both desktops and servers.

The main features that can affect benchmark results are:

  • Data Integrity Check - it uses checksums to ensure data integrity, meaning it can detect and repair corrupted files.

  • Copy-On-Write - with this mechanism, modified data is written to a new location instead of overwriting the original blocks. This influences write performance and allows for efficient snapshot creation without the need to duplicate data.

InΒ [57]:
csv_path = 'processed_results/brtfs.csv'
summaries = filesystem_summary(csv_path, workloads, 10)
fs_summaries.append([summaries, 'brtfs'])
summaries
Out[57]:
index Read_IOPS Read_Bandwidth Read_Latency_Mean Write_IOPS Write_Bandwidth Write_Latency_Mean Read_IOPS_CI_Lower Read_IOPS_CI_Upper Workload Iteration
0 mean 567.569589 2324764.575 666.100357 245.039915 1.003683e+06 2516.909446 562.811206 572.327972 mixed 0
1 median 574.166709 NaN NaN NaN NaN NaN 562.811206 572.327972 mixed 0
2 mean 567.569589 2324764.575 666.100357 245.039915 1.003683e+06 2516.909446 562.811206 572.327972 mixed 1
3 median 574.166709 NaN NaN NaN NaN NaN 562.811206 572.327972 mixed 1
4 mean 567.569589 2324764.575 666.100357 245.039915 1.003683e+06 2516.909446 562.811206 572.327972 mixed 2
... ... ... ... ... ... ... ... ... ... ... ...
95 median 0.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 seqwrite 7
96 mean 0.000000 0.000 0.000000 124.603583 1.306563e+08 7760.988795 0.000000 0.000000 seqwrite 8
97 median 0.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 seqwrite 8
98 mean 0.000000 0.000 0.000000 124.603583 1.306563e+08 7760.988795 0.000000 0.000000 seqwrite 9
99 median 0.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 seqwrite 9

100 rows × 11 columns

InΒ [58]:
#results = analyze_ci_overlaps(summaries)
#results
InΒ [59]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_IOPS","Write_IOPS", "Read_IOPS_CI_Lower", "Read_IOPS_CI_Upper"], units="", filesystem="brtfs", description="Input/Output Operations per Second")
[figure: brtfs – Read/Write IOPS (with Read IOPS CI bounds) per workload]
InΒ [60]:
plot_fs_summary(summaries, filter_seq=True, metric=["Read_Bandwidth", "Write_Bandwidth"], units="(bytes/sec)", filesystem="brtfs", description="Bandwidth in Bytes per Second for Sequential Operations")
[figure: brtfs – Read/Write bandwidth (bytes/sec), sequential workloads]
InΒ [61]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_Bandwidth", "Write_Bandwidth"], units="(bytes/sec)", filesystem="brtfs", description="Bandwidth in Bytes per Second for Random and Mixed Operations")
[figure: brtfs – Read/Write bandwidth (bytes/sec), random and mixed workloads]
InΒ [62]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_Latency_Mean", "Write_Latency_Mean"], units="(ms)", filesystem="brtfs", description="Average Response in Milliseconds")
[figure: brtfs – mean Read/Write latency per workload]

The result is surprising to me, especially the values during sequential operations. What I find more interesting is that the ratio between READ/WRITE decreases when it comes to a random workload.

Ext4

Ext4 (Fourth Extended File System) is one of the most widely used file systems for Linux. It is an improved version of the previous file systems ext2 and ext3, suitable for desktop use.

The main features that can affect benchmark results are:

  • Journaling - all write operations are first recorded in a journal. This increases fault tolerance and allows for faster recovery after a system crash.

  • Extents - introduces the concept of extents, which are contiguous blocks on the disk that reduce fragmentation and improve performance during read and write operations.

  • Data Integrity Check - supports data integrity checking using checksums.

InΒ [63]:
csv_path = 'processed_results/ext4.csv'
summaries = filesystem_summary(csv_path, workloads, 10)
fs_summaries.append([summaries, 'ext4'])
summaries
Out[63]:
index Read_IOPS Read_Bandwidth Read_Latency_Mean Write_IOPS Write_Bandwidth Write_Latency_Mean Read_IOPS_CI_Lower Read_IOPS_CI_Upper Workload Iteration
0 mean 3705.347271 15177101.95 328.756066 1593.725267 6527898.2 11.719552 3411.27211 3999.422433 mixed 0
1 median 4064.973917 NaN NaN NaN NaN NaN 3411.27211 3999.422433 mixed 0
2 mean 3705.347271 15177101.95 328.756066 1593.725267 6527898.2 11.719552 3411.27211 3999.422433 mixed 1
3 median 4064.973917 NaN NaN NaN NaN NaN 3411.27211 3999.422433 mixed 1
4 mean 3705.347271 15177101.95 328.756066 1593.725267 6527898.2 11.719552 3411.27211 3999.422433 mixed 2
... ... ... ... ... ... ... ... ... ... ... ...
95 median 0.000000 NaN NaN NaN NaN NaN 0.00000 0.000000 seqwrite 7
96 mean 0.000000 0.00 0.000000 765.921937 803127360.0 965.512927 0.00000 0.000000 seqwrite 8
97 median 0.000000 NaN NaN NaN NaN NaN 0.00000 0.000000 seqwrite 8
98 mean 0.000000 0.00 0.000000 765.921937 803127360.0 965.512927 0.00000 0.000000 seqwrite 9
99 median 0.000000 NaN NaN NaN NaN NaN 0.00000 0.000000 seqwrite 9

100 rows × 11 columns

InΒ [64]:
#results = analyze_ci_overlaps(summaries)
#results
InΒ [65]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_IOPS","Write_IOPS", "Read_IOPS_CI_Lower", "Read_IOPS_CI_Upper"], units="", filesystem="ext4", description="Input/Output Operations per Second")
[figure: ext4 – Read/Write IOPS (with Read IOPS CI bounds) per workload]
InΒ [66]:
plot_fs_summary(summaries, filter_seq=True, metric=["Read_Bandwidth", "Write_Bandwidth"], units="(bytes/sec)", filesystem="ext4", description="Bandwidth in Bytes per Second for Sequential Operations")
[figure: ext4 – Read/Write bandwidth (bytes/sec), sequential workloads]
InΒ [67]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_Bandwidth", "Write_Bandwidth"], units="(bytes/sec)", filesystem="ext4", description="Bandwidth in Bytes per Second for Random and Mixed Operations")
[figure: ext4 – Read/Write bandwidth (bytes/sec), random and mixed workloads]
InΒ [68]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_Latency_Mean", "Write_Latency_Mean"], units="(ms)", filesystem="ext4", description="Average Response in Milliseconds")
[figure: ext4 – mean Read/Write latency per workload]

In the case of Ext4, the READ and WRITE results are faster than I expected; the bandwidth during random writes is especially impressive.

XFS

XFS is a high-performance file system that was originally developed by SGI (Silicon Graphics, Inc.) for the IRIX operating system. Today, XFS is widely used in Linux environments, especially for servers.

The main features that can affect benchmark results are:

  • Journaling - uses journaling to ensure data integrity, meaning that all write operations are recorded in a journal before they are executed.

  • Dynamic Allocation - supports dynamic block allocation, allowing files to grow without the need for prior planning. This improves performance when writing and reading large files.

  • Support for Large Files and Partitions - XFS supports files up to 8 exabytes in size and partitions up to 8 exabytes.

  • Fast File Processing - optimized for fast processing of large files and contiguous blocks.

  • Data Integrity Check - ensures data integrity using checksums.

InΒ [69]:
csv_path = 'processed_results/xfs.csv'
summaries = filesystem_summary(csv_path, workloads, 10)
fs_summaries.append([summaries, 'xfs'])
summaries
Out[69]:
index Read_IOPS Read_Bandwidth Read_Latency_Mean Write_IOPS Write_Bandwidth Write_Latency_Mean Read_IOPS_CI_Lower Read_IOPS_CI_Upper Workload Iteration
0 mean 2736.634934 1.120926e+07 431.161198 1177.463757 4.822891e+06 12.085936 2497.6283 2975.641568 mixed 0
1 median 2426.456113 NaN NaN NaN NaN NaN 2497.6283 2975.641568 mixed 0
2 mean 2736.634934 1.120926e+07 431.161198 1177.463757 4.822891e+06 12.085936 2497.6283 2975.641568 mixed 1
3 median 2426.456113 NaN NaN NaN NaN NaN 2497.6283 2975.641568 mixed 1
4 mean 2736.634934 1.120926e+07 431.161198 1177.463757 4.822891e+06 12.085936 2497.6283 2975.641568 mixed 2
... ... ... ... ... ... ... ... ... ... ... ...
95 median 0.000000 NaN NaN NaN NaN NaN 0.0000 0.000000 seqwrite 7
96 mean 0.000000 0.000000e+00 0.000000 1516.069565 1.589714e+09 519.398134 0.0000 0.000000 seqwrite 8
97 median 0.000000 NaN NaN NaN NaN NaN 0.0000 0.000000 seqwrite 8
98 mean 0.000000 0.000000e+00 0.000000 1516.069565 1.589714e+09 519.398134 0.0000 0.000000 seqwrite 9
99 median 0.000000 NaN NaN NaN NaN NaN 0.0000 0.000000 seqwrite 9

100 rows × 11 columns

InΒ [70]:
#results = analyze_ci_overlaps(summaries)
#results
InΒ [71]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_IOPS","Write_IOPS", "Read_IOPS_CI_Lower", "Read_IOPS_CI_Upper"], units="", filesystem="xfs", description="Input/Output Operations per Second")
[figure: xfs – Read/Write IOPS (with Read IOPS CI bounds) per workload]
InΒ [72]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_Bandwidth", "Write_Bandwidth"], units="(bytes/sec)", filesystem="xfs", description="Bandwidth in Bytes per Second for Random and Mixed Operations")
[figure: xfs – Read/Write bandwidth (bytes/sec), random and mixed workloads]
InΒ [73]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_Latency_Mean", "Write_Latency_Mean"], units="(ms)", filesystem="xfs", description="Average Response in Milliseconds")
[figure: xfs – mean Read/Write latency per workload]

In the case of XFS, its focus on high performance is noticeable. However, my hardware is quite limited and the virtual machine had only 2 processor cores available, so the parallelism for which XFS is known did not get the opportunity to shine. In the future, I would ideally like to collect data on a personal cluster.

ZFS

ZFS (Zettabyte File System) is an advanced file system and volume manager developed by Sun Microsystems. ZFS is suitable for servers and large data storage.

The main features that can affect benchmark results are:

  • Data Integrity Check - uses checksums to ensure integrity.

  • Copy-On-Write - with this mechanism, modified data is written to a new location instead of overwriting the original blocks. This influences write performance.

  • High Capacity - supports very large volumes and files, with theoretical limits of up to 256 zettabytes, making it ideal for large data storage.

InΒ [74]:
csv_path = 'processed_results/zfs.csv'
summaries = filesystem_summary(csv_path, workloads, 10)
fs_summaries.append([summaries, 'zfs'])
summaries
Out[74]:
index Read_IOPS Read_Bandwidth Read_Latency_Mean Write_IOPS Write_Bandwidth Write_Latency_Mean Read_IOPS_CI_Lower Read_IOPS_CI_Upper Workload Iteration
0 mean 338.054943 1384672.6 2039.391012 146.182581 5.987634e+05 13220.572681 300.31346 375.796425 mixed 0
1 median 318.069698 NaN NaN NaN NaN NaN 300.31346 375.796425 mixed 0
2 mean 338.054943 1384672.6 2039.391012 146.182581 5.987634e+05 13220.572681 300.31346 375.796425 mixed 1
3 median 318.069698 NaN NaN NaN NaN NaN 300.31346 375.796425 mixed 1
4 mean 338.054943 1384672.6 2039.391012 146.182581 5.987634e+05 13220.572681 300.31346 375.796425 mixed 2
... ... ... ... ... ... ... ... ... ... ... ...
95 median 0.000000 NaN NaN NaN NaN NaN 0.00000 0.000000 seqwrite 7
96 mean 0.000000 0.0 0.000000 104.092003 1.091484e+08 9749.291413 0.00000 0.000000 seqwrite 8
97 median 0.000000 NaN NaN NaN NaN NaN 0.00000 0.000000 seqwrite 8
98 mean 0.000000 0.0 0.000000 104.092003 1.091484e+08 9749.291413 0.00000 0.000000 seqwrite 9
99 median 0.000000 NaN NaN NaN NaN NaN 0.00000 0.000000 seqwrite 9

100 rows × 11 columns

InΒ [75]:
#results = analyze_ci_overlaps(summaries)
#results
InΒ [76]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_IOPS","Write_IOPS", "Read_IOPS_CI_Lower", "Read_IOPS_CI_Upper"], units="", filesystem="zfs", description="Input/Output Operations per Second")
[figure: zfs – Read/Write IOPS (with Read IOPS CI bounds) per workload]
InΒ [77]:
plot_fs_summary(summaries, filter_seq=True, metric=["Read_Bandwidth", "Write_Bandwidth"], units="(bytes/sec)", filesystem="zfs", description="Bandwidth in Bytes per Second for Sequential Operations")
[figure: zfs – Read/Write bandwidth (bytes/sec), sequential workloads]
InΒ [78]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_Bandwidth", "Write_Bandwidth"], units="(bytes/sec)", filesystem="zfs", description="Bandwidth in Bytes per Second for Random and Mixed Operations")
[figure: zfs – Read/Write bandwidth (bytes/sec), random and mixed workloads]
InΒ [79]:
plot_fs_summary(summaries, filter_seq=False, metric=["Read_Latency_Mean", "Write_Latency_Mean"], units="(ms)", filesystem="zfs", description="Average Response in Milliseconds")
[figure: zfs – mean Read/Write latency per workload]

Although I expected the ratio between READ/WRITE to be large, I did not anticipate that sequential reading would be so low. In the context of how ZFS operates and its features, the data makes sense.

Analysis of the File Systems in Relation to Each Other

My personal opinion, which I will be testing, is that regarding speed, there is no noticeable difference on the desktop for my use. When choosing a file system, I should focus on features such as snapshots, recovery, etc.

Thus, the null hypothesis I am testing is that there is no pair of file systems that would have a significant difference in the metrics of Inputs/Outputs per second (IOPS), Bandwidth, and Mean Latency for both READ and WRITE.

InΒ [80]:
final_summary = []
for fs_summary in fs_summaries:
    summary = fs_summary[0]
    summary['Filesystem'] = fs_summary[1]
    final_summary.append(summary)

final_data = pd.concat(final_summary, ignore_index=True)
final_data.ffill(inplace=True)
#print(final_data)

To start, we will try to create a simple regression model.

Independent Variables:

  1. Workload
  2. Filesystem

Dependent Variables:

  1. Read/Write IOPS
  2. Read/Write Latency Mean (µs)

The first version will use linear regression.

InΒ [81]:
# independent variables
x = final_data[['Filesystem', 'Workload']]
# dependent variables
y = final_data[['Read_IOPS', 'Write_IOPS', 'Read_Latency_Mean', 'Write_Latency_Mean']]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['Filesystem', 'Workload'])
    ])

model = Pipeline(steps=[('preprocessor', preprocessor),
                         ('regressor', LinearRegression())])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('R^2 Score:', r2_score(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
R^2 Score: 0.4826403430749351
MSE: 1836464.825941098

The resulting values are not good enough for me to conclude that the relationships are linear.

To be sure, I will try decision tree regression for the second version.

InΒ [82]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(
        max_depth=5,
        min_samples_leaf=5,
        random_state=42
    ))
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('R^2 Score:', r2_score(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
R^2 Score: 0.91236505622809
MSE: 184343.8175389695

Decision tree regression is successful for the given data. However, it is possible that the model is overfitted and could be improved.

Visualization of the tree:

InΒ [83]:
regressor = model.named_steps['regressor']  # retrieve the fitted regressor from the pipeline
feature_names = model.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out(['Filesystem', 'Workload'])

plt.figure(figsize=(20, 10))
plot_tree(regressor, feature_names=feature_names, filled=True)
plt.title('Decision Tree')
plt.show()
[figure: decision tree visualization]

To address the problem of overfitting, I can also try a Random Forest Regressor.

InΒ [84]:
model_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1
    ))
])

model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
print("RF R^2:", r2_score(y_test, y_pred_rf))
print("RF RMSE:", mean_squared_error(y_test, y_pred_rf))
RF R^2: 0.9982959626477509
RF RMSE: 9769.911615594548

The results look even better, so I can conclude that the relationships between the data are likely non-linear.
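
Because a single train/test split can be misleading, one way to probe for overfitting (a sketch, reusing the model_rf pipeline and the x/y frames defined above) is k-fold cross-validation:

from sklearn.model_selection import KFold, cross_val_score

# 5 shuffled folds; with the multi-output target, R^2 is averaged uniformly over the output columns.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model_rf, x, y, cv=cv, scoring='r2')
print("CV R^2 per fold:", np.round(cv_scores, 3))
print("Mean CV R^2:", cv_scores.mean())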

Analysis of Variance (ANOVA)

Next, I will conduct an analysis of variance for the individual metrics across all tested file systems and use it to test the null hypothesis.
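
Formally, for each metric the one-way ANOVA with the file system as the factor tests

$$H_0:\ \mu_{\mathrm{btrfs}} = \mu_{\mathrm{ext4}} = \mu_{\mathrm{xfs}} = \mu_{\mathrm{zfs}} \qquad \text{vs.} \qquad H_1:\ \text{at least one } \mu_i \text{ differs},$$

and the test is only reliable when the residuals are approximately normal and the group variances are homogeneous. These two assumptions are checked below with the Shapiro–Wilk and Levene tests before each ANOVA is run.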

InΒ [85]:
def check_assumptions(data, metric, factor='Filesystem'):
    formula = f"{metric} ~ C({factor})"
    model = ols(formula, data=data).fit()
    resid = model.resid
    groups = [group[metric].values for name, group in data.groupby(factor)]

    print(f"\n=== Assumptions for metric '{metric}' ===")

    shapiro_stat, shapiro_p = stats.shapiro(resid)
    print(f"Shapiro-Wilk test: stat={shapiro_stat:.4f}, p={shapiro_p:.4f}")
    if shapiro_p < 0.05:
        print("!!! Residuals are likely not normally distributed (p < 0.05) !!!")

    sm.qqplot(resid, line='45', fit=True)
    plt.title(f"QQ-plot of residuals for {metric}")
    plt.show()

    # Homogeneity of variances
    levene_stat, levene_p = stats.levene(*groups, center='median')
    print(f"Levene's test: stat={levene_stat:.4f}, p={levene_p:.4f}")
    if levene_p < 0.05:
        print("!!! Variances between groups are not homogeneous (p < 0.05) !!!")

    plt.figure()
    sns.boxplot(x=factor, y=metric, data=data)
    plt.title(f"{metric} by {factor}")
    plt.show()

    return model

def perform_anova_with_checks(data, metrics, factor='Filesystem'):
    anova_results = {}
    for metric in metrics:
        model = check_assumptions(data, metric, factor)
        # ANOVA
        anova_table = sm.stats.anova_lm(model, typ=2)
        print("\nANOVA result:")
        print(anova_table)
        anova_results[metric] = anova_table
    return anova_results

Inputs/Outputs per second

InΒ [86]:
metrics_to_analyze = [
    'Read_IOPS',
    'Write_IOPS',
]
anova_results = perform_anova_with_checks(final_data, metrics_to_analyze, factor="Filesystem")
=== Assumptions for metric 'Read_IOPS' ===
Shapiro-Wilk test: stat=0.8510, p=0.0000
!!! Residuals are likely not normally distributed (p < 0.05) !!!
[figure: QQ-plot of residuals for Read_IOPS]
Levene's test: stat=6.3287, p=0.0003
!!! Variances between groups are not homogeneous (p < 0.05) !!!
[figure: boxplot of Read_IOPS by Filesystem]
ANOVA result:
                     sum_sq     df         F    PR(>F)
C(Filesystem)  3.100036e+07    3.0  2.706863  0.045019
Residual       1.511731e+09  396.0       NaN       NaN

=== Assumptions for metric 'Write_IOPS' ===
Shapiro-Wilk test: stat=0.6978, p=0.0000
!!! Residuals are likely not normally distributed (p < 0.05) !!!
[figure: QQ-plot of residuals for Write_IOPS]
Levene's test: stat=31.4532, p=0.0000
!!! Variances between groups are not homogeneous (p < 0.05) !!!
[figure: boxplot of Write_IOPS by Filesystem]
ANOVA result:
                     sum_sq     df          F        PR(>F)
C(Filesystem)  3.250426e+08    3.0  27.673086  2.883330e-16
Residual       1.550446e+09  396.0        NaN           NaN

For both READ and WRITE, the result of the assumption check indicates that the data are likely not normally distributed, so the ANOVA test cannot be considered reliable.

Bandwidth (bytes/sec)

InΒ [87]:
metrics_to_analyze = [
    'Read_Bandwidth',
    'Write_Bandwidth',
]
anova_results = perform_anova_with_checks(final_data, metrics_to_analyze, factor="Filesystem")
=== Assumptions for metric 'Read_Bandwidth' ===
Shapiro-Wilk test: stat=0.6402, p=0.0000
!!! Residuals are likely not normally distributed (p < 0.05) !!!
[figure: QQ-plot of residuals for Read_Bandwidth]
Levene's test: stat=2.9002, p=0.0348
!!! Variances between groups are not homogeneous (p < 0.05) !!!
[figure: boxplot of Read_Bandwidth by Filesystem]
ANOVA result:
                     sum_sq     df         F    PR(>F)
C(Filesystem)  7.406809e+18    3.0  2.883642  0.035613
Residual       3.390500e+20  396.0       NaN       NaN

=== Assumptions for metric 'Write_Bandwidth' ===
Shapiro-Wilk test: stat=0.6685, p=0.0000
!!! Residuals are likely not normally distributed (p < 0.05) !!!
[figure: QQ-plot of residuals for Write_Bandwidth]
Levene's test: stat=16.1055, p=0.0000
!!! Variances between groups are not homogeneous (p < 0.05) !!!
[figure: boxplot of Write_Bandwidth by Filesystem]
ANOVA result:
                     sum_sq     df          F        PR(>F)
C(Filesystem)  6.154461e+18    3.0  16.068696  7.079198e-10
Residual       5.055723e+19  396.0        NaN           NaN

For both READ and WRITE, the result of the assumption check indicates that the data are likely not normally distributed, so the ANOVA test cannot be considered reliable.

Latency (µs)

InΒ [88]:
metrics_to_analyze = [
    'Read_Latency_Mean',
    'Write_Latency_Mean',
]
anova_results = perform_anova_with_checks(final_data, metrics_to_analyze, factor="Filesystem")
=== Assumptions for metric 'Read_Latency_Mean' ===
Shapiro-Wilk test: stat=0.7896, p=0.0000
!!! Residuals are likely not normally distributed (p < 0.05) !!!
[figure: QQ-plot of residuals for Read_Latency_Mean]
Levene's test: stat=9.9822, p=0.0000
!!! Variances between groups are not homogeneous (p < 0.05) !!!
[figure: boxplot of Read_Latency_Mean by Filesystem]
ANOVA result:
                     sum_sq     df        F    PR(>F)
C(Filesystem)  3.266334e+06    3.0  5.21157  0.001538
Residual       8.273056e+07  396.0      NaN       NaN

=== Assumptions for metric 'Write_Latency_Mean' ===
Shapiro-Wilk test: stat=0.8692, p=0.0000
!!! Residuals are likely not normally distributed (p < 0.05) !!!
[figure: QQ-plot of residuals for Write_Latency_Mean]
Levene's test: stat=94.2678, p=0.0000
!!! Variances between groups are not homogeneous (p < 0.05) !!!
[figure: boxplot of Write_Latency_Mean by Filesystem]
ANOVA result:
                     sum_sq     df         F        PR(>F)
C(Filesystem)  1.677955e+09    3.0  58.20406  3.428079e-31
Residual       3.805405e+09  396.0       NaN           NaN

In this category, every assumption check failed as well.

Since at least one ANOVA assumption failed for every category, I will use a non-parametric alternative, the Kruskal–Wallis test.

InΒ [89]:
def perform_kruskal_wallis_tests(data, metrics, factor='Filesystem'):
    for metric in metrics:
        grouped = [group[metric].dropna().values
                   for _, group in data.groupby(factor)]
        
        stat, p = stats.kruskal(*grouped)
        print(f"Kruskal-Wallis for '{metric}': H = {stat:.4f}, p = {p:.4f}")
InΒ [90]:
metrics_to_test = ['Read_IOPS', 'Write_IOPS', 'Read_Bandwidth', 'Write_Bandwidth', 'Read_Latency_Mean', 'Write_Latency_Mean']
perform_kruskal_wallis_tests(final_data, metrics_to_test, factor='Filesystem')
Kruskal-Wallis for 'Read_IOPS': H = 2.4468, p = 0.4850
Kruskal-Wallis for 'Write_IOPS': H = 37.2742, p = 0.0000
Kruskal-Wallis for 'Read_Bandwidth': H = 0.6404, p = 0.8871
Kruskal-Wallis for 'Write_Bandwidth': H = 12.9371, p = 0.0048
Kruskal-Wallis for 'Read_Latency_Mean': H = 2.1775, p = 0.5364
Kruskal-Wallis for 'Write_Latency_Mean': H = 42.1416, p = 0.0000

P-values in the categories Write_IOPS, Write_Bandwidth, and Write_Latency_Mean are less than 0.05, indicating that there is a statistically significant difference for WRITE, so I reject the null hypothesis and accept the alternative.

In the case of READ, all p-values are above 0.05, so I cannot reject the null hypothesis.
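
The Kruskal–Wallis test only says that at least one pair of file systems differs; to locate the pairs, one could follow up with pairwise Mann–Whitney U tests and a multiple-comparison correction. This is a sketch that was not part of the original analysis; it reuses final_data and the already imported scipy.stats:

from itertools import combinations

def pairwise_mannwhitney(data, metric, factor='Filesystem', alpha=0.05):
    groups = {name: grp[metric].dropna().values for name, grp in data.groupby(factor)}
    pairs = list(combinations(groups, 2))
    corrected_alpha = alpha / len(pairs)   # Bonferroni correction for the number of comparisons
    for a, b in pairs:
        stat, p = stats.mannwhitneyu(groups[a], groups[b], alternative='two-sided')
        verdict = "significant" if p < corrected_alpha else "not significant"
        print(f"{metric}: {a} vs {b}: U={stat:.1f}, p={p:.4g} ({verdict} at alpha={corrected_alpha:.4f})")

pairwise_mannwhitney(final_data, 'Write_IOPS')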

Pairwise Analysis Between Btrfs and Ext4

The file systems ZFS and XFS are primarily intended for use on servers, while Ext4 and Btrfs are typically used on desktops. I would additionally like to test whether there is a difference within this desktop pair, as it is the most relevant for my setup.

Thus, the null hypothesis I am testing is that there is no significant difference in the metrics of Inputs/Outputs per second (IOPS), Bandwidth, and Mean Latency for READ and WRITE between ext4 and btrfs.

Since the data are likely not normally distributed, I will use the paired Wilcoxon signed-rank test.

InΒ [91]:
def wilcoxon_paired_test(
    data,
    metric,
    group_col = "Filesystem",
    group1 = "ext4",
    group2= "btrfs",
    index_cols = ["Workload", "Iteration"],
    plot_qq = True
):
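    # Pivot to wide format so each (Workload, Iteration) combination pairs the two file systems' values.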
    wide = (
        data
        .pivot_table(
            index=index_cols,
            columns=group_col,
            values=metric
        )
        .loc[:, [group1, group2]]
        .dropna()
    )

    diffs = wide[group1] - wide[group2]

    # normality of the differences
    shapiro_stat, shapiro_p = stats.shapiro(diffs)
    print(f"Shapiro-Wilk for {metric} diffs ({group1}-{group2}): "
          f"stat={shapiro_stat:.4f}, p={shapiro_p:.4f}")

    # QQ-plot
    if plot_qq:
        sm.qqplot(diffs, line="45", fit=True)
        plt.title(f"QQ-plot differences for {metric}")
        plt.show()

    # Wilcoxon
    try:
        w_stat, w_p = stats.wilcoxon(
            wide[group1],
            wide[group2],
            zero_method="wilcox",
            alternative="two-sided"
        )
    except ValueError as e:
        print(f"Wilcoxon test failed for {metric}: {e}")
        w_stat, w_p = None, None

    print(f"Wilcoxon for {metric}: stat={w_stat}, p={w_p}\n")

    return {
        "metric": metric,
        "shapiro_stat": shapiro_stat,
        "shapiro_p": shapiro_p,
        "wilcoxon_stat": w_stat,
        "wilcoxon_p": w_p,
        "n_pairs": len(diffs)
    }

def perform_wilcoxon_paired_tests(
    data,
    metrics,
    group_col = "Filesystem",
    group1 = "ext4",
    group2= "btrfs",
    index_cols = ["Workload", "Iteration"]
):
    results = []
    for metric in metrics:
        res = wilcoxon_paired_test(
            data=data,
            metric=metric,
            group_col=group_col,
            group1=group1,
            group2=group2,
            index_cols=index_cols
        )
        results.append(res)
    return pd.DataFrame(results)

First, we filter the final data down to Btrfs and Ext4 (keeping only the mean rows), and then we run the paired Wilcoxon test.

InΒ [92]:
final_data = final_data[final_data['Filesystem'].isin(['brtfs', 'ext4'])]
final_data = final_data[final_data['index'].isin(['mean'])]

#print(final_data)

Inputs/Outputs per second

InΒ [93]:
metrics_to_analyze = [
    'Read_IOPS',
    'Write_IOPS',
]
perform_wilcoxon_paired_tests(final_data, metrics_to_analyze, group_col = "Filesystem", group1= "ext4", group2 = "brtfs", index_cols = ["Workload", "Iteration"])
Shapiro-Wilk for Read_IOPS diffs (ext4-brtfs): stat=0.8203, p=0.0000
[figure: QQ-plot of Read_IOPS differences (ext4 - brtfs)]
Wilcoxon for Read_IOPS: stat=210.0, p=0.6390937131929089

Shapiro-Wilk for Write_IOPS diffs (ext4-brtfs): stat=0.6288, p=0.0000
[figure: QQ-plot of Write_IOPS differences (ext4 - brtfs)]
Wilcoxon for Write_IOPS: stat=0.0, p=1.2598486800387818e-06

Out[93]:
metric shapiro_stat shapiro_p wilcoxon_stat wilcoxon_p n_pairs
0 Read_IOPS 0.820291 2.659540e-06 210.0 0.639094 50
1 Write_IOPS 0.628792 5.341631e-10 0.0 0.000001 50

The p-value is less than 0.05 in the case of Write_IOPS, indicating that there is a statistically significant difference, so I reject the null hypothesis for Write_IOPS.

Bandwidth (bytes/sec)

InΒ [94]:
metrics_to_analyze = [
    'Read_Bandwidth',
    'Write_Bandwidth',
]
perform_wilcoxon_paired_tests(final_data, metrics_to_analyze, group_col = "Filesystem", group1= "ext4", group2 = "brtfs", index_cols = ["Workload", "Iteration"])
Shapiro-Wilk for Read_Bandwidth diffs (ext4-brtfs): stat=0.5277, p=0.0000
[figure: QQ-plot of Read_Bandwidth differences (ext4 - brtfs)]
Wilcoxon for Read_Bandwidth: stat=155.0, p=0.10623959199707807

Shapiro-Wilk for Write_Bandwidth diffs (ext4-brtfs): stat=0.5192, p=0.0000
[figure: QQ-plot of Write_Bandwidth differences (ext4 - brtfs)]
Wilcoxon for Write_Bandwidth: stat=0.0, p=1.2598486800387818e-06

Out[94]:
metric shapiro_stat shapiro_p wilcoxon_stat wilcoxon_p n_pairs
0 Read_Bandwidth 0.527711 1.890011e-11 155.0 0.106240 50
1 Write_Bandwidth 0.519150 1.457871e-11 0.0 0.000001 50

The p-value is less than 0.05 in the case of Write_Bandwidth, indicating that there is a statistically significant difference, so I reject the null hypothesis for Write_Bandwidth.

Latency (µs)

InΒ [95]:
metrics_to_analyze = [
    'Read_Latency_Mean',
    'Write_Latency_Mean',
]
perform_wilcoxon_paired_tests(final_data, metrics_to_analyze, group_col = "Filesystem", group1= "ext4", group2 = "brtfs", index_cols = ["Workload", "Iteration"])
Shapiro-Wilk for Read_Latency_Mean diffs (ext4-brtfs): stat=0.8330, p=0.0000
[figure: QQ-plot of Read_Latency_Mean differences (ext4 - brtfs)]
Wilcoxon for Read_Latency_Mean: stat=210.0, p=0.6390937131929089

Shapiro-Wilk for Write_Latency_Mean diffs (ext4-brtfs): stat=0.8199, p=0.0000
[figure: QQ-plot of Write_Latency_Mean differences (ext4 - brtfs)]
Wilcoxon for Write_Latency_Mean: stat=0.0, p=1.2598486800387818e-06

Out[95]:
metric shapiro_stat shapiro_p wilcoxon_stat wilcoxon_p n_pairs
0 Read_Latency_Mean 0.832988 0.000006 210.0 0.639094 50
1 Write_Latency_Mean 0.819868 0.000003 0.0 0.000001 50

The p-value is less than 0.05 in the case of Write_Latency_Mean, indicating that there is a statistically significant difference, so I reject the null hypothesis for Write_Latency_Mean.
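
Statistical significance alone says nothing about the size of the difference. A quick way to gauge the magnitude (a sketch using final_data, which at this point contains only the mean rows for brtfs and ext4) is to compare the per-filesystem medians of the WRITE metrics:

write_metrics = ['Write_IOPS', 'Write_Bandwidth', 'Write_Latency_Mean']
medians = final_data.groupby('Filesystem')[write_metrics].median()
print(medians)
print(medians.loc['ext4'] / medians.loc['brtfs'])   # relative difference of ext4 versus brtfs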

Conclusion of the Analysis of Btrfs vs. Ext4

For WRITE, a statistically significant difference was found in every category. This means there is a difference in write performance; however, for me and my setup, writing is the less critical part, so my personal conclusion is that I will focus on non-performance-based criteria when choosing a file system.