Update sweep bench (deprecating .jsonl support) (#289)

* Update sweep bench (deprecating .jsonl support)

* Fix README.md
This commit is contained in:
saood06
2025-03-25 10:14:44 -05:00
committed by GitHub
parent 6ef4954612
commit c12a6f8558
3 changed files with 57 additions and 38 deletions

View File

@@ -7,6 +7,7 @@ in each ubatch-sized window. Only a single token sequence is used.
The benchmark steps are: The benchmark steps are:
for each ubatch-sized window in context: for each ubatch-sized window in context:
1. generate ubatch/4 tokens (not the whole window to save some time) 1. generate ubatch/4 tokens (not the whole window to save some time)
2. measure generation performance 2. measure generation performance
3. remove generated tokens from KV cache 3. remove generated tokens from KV cache

View File

@@ -9,27 +9,54 @@ args = parser.parse_args()
df = None df = None
for jsonl_file in args.file: #for jsonl_file in args.file:
# Read JSONL file into DataFrame # # Read JSONL file into DataFrame
df_part = pd.read_json(jsonl_file, lines=True) # df_part = pd.read_json(jsonl_file, lines=True)
df_part['label'] = jsonl_file # df_part['label'] = jsonl_file
if df is None: # if df is None:
df = df_part # df = df_part
else: # else:
df = pd.concat([df, df_part]) # df = pd.concat([df, df_part])
#
# Group by model and n_kv, calculate mean and std for both speed metrics
for md_file in args.file:
# Read markdown table file into DataFrame
df_part = pd.read_csv(md_file, sep=r'\s*\|\s*', engine='python',
header=0, skiprows=[1])
# Clean up columns (remove empty columns from markdown formatting)
df_part = df_part.iloc[:, 1:-1]
df_part.columns = [col.strip() for col in df_part.columns]
# Rename columns to match expected names
df_part = df_part.rename(columns={
'N_KV': 'n_kv',
'S_PP t/s': 'speed_pp',
'S_TG t/s': 'speed_tg'
})
# Convert to numeric types
df_part['n_kv'] = pd.to_numeric(df_part['n_kv'])
df_part['speed_pp'] = pd.to_numeric(df_part['speed_pp'])
df_part['speed_tg'] = pd.to_numeric(df_part['speed_tg'])
# Add label and append to main DataFrame
df_part['label'] = md_file
df = pd.concat([df, df_part]) if df is not None else df_part
# Group by label and n_kv, calculate mean and std for both speed metrics
df_grouped = df.groupby(['label', 'n_kv']).agg({ df_grouped = df.groupby(['label', 'n_kv']).agg({
'speed_pp': ['mean', 'std'], 'speed_pp': ['mean', 'std'],
'speed_tg': ['mean', 'std'] 'speed_tg': ['mean', 'std']
}).reset_index() }).reset_index()
# Flatten multi-index columns # Flatten multi-index columns
df_grouped.columns = ['label', 'n_kv', 'speed_pp_mean', 'speed_pp_std', df_grouped.columns = ['label', 'n_kv', 'speed_pp_mean', 'speed_pp_std',
'speed_tg_mean', 'speed_tg_std'] 'speed_tg_mean', 'speed_tg_std']
# Replace NaN with 0 (std for a single sample is NaN) # Replace NaN with 0 (std for a single sample is NaN)
df_grouped['speed_pp_std'] = df_grouped['speed_pp_std'].fillna(0) df_grouped['speed_pp_std'] = df_grouped['speed_pp_std'].fillna(0)
df_grouped['speed_tg_std'] = df_grouped['speed_tg_std'].fillna(0) df_grouped['speed_tg_std'] = df_grouped['speed_tg_std'].fillna(0)
@@ -45,25 +72,20 @@ colors = plt.cm.rainbow(np.linspace(0, 1, len(labels)))
# Create prompt processing plot # Create prompt processing plot
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
ax1 = plt.gca() ax1 = plt.gca()
plt.grid() plt.grid()
ax1.set_xticks(x_ticks) ax1.set_xticks(x_ticks)
# Plot each label's data # Plot each label's data
for label, color in zip(labels, colors): for label, color in zip(labels, colors):
label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv') label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
pp = ax1.errorbar(label_data['n_kv'], label_data['speed_pp_mean'],
# Plot prompt processing yerr=label_data['speed_pp_std'], color=color,
pp = ax1.errorbar(label_data['n_kv'], label_data['speed_pp_mean'],
yerr=label_data['speed_pp_std'], color=color,
marker='o', linestyle='-', label=label) marker='o', linestyle='-', label=label)
# Add labels and title # Add labels and title
ax1.set_xlabel('Context Length (tokens)') ax1.set_xlabel('Context Length (tokens)')
ax1.set_ylabel('Prompt Processing Rate (t/s)') ax1.set_ylabel('Prompt Processing Rate (t/s)')
plt.title('Prompt Processing Performance Comparison') plt.title('Prompt Processing Performance Comparison')
ax1.legend(loc='upper right') ax1.legend(loc='upper right')
# Adjust layout and save # Adjust layout and save
@@ -74,24 +96,20 @@ plt.close()
# Create token generation plot # Create token generation plot
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
ax1 = plt.gca() ax1 = plt.gca()
plt.grid() plt.grid()
ax1.set_xticks(x_ticks) ax1.set_xticks(x_ticks)
# Plot each model's data # Plot each model's data
for label, color in zip(labels, colors): for label, color in zip(labels, colors):
label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv') label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
# Plot token generation
tg = ax1.errorbar(label_data['n_kv'], label_data['speed_tg_mean'], tg = ax1.errorbar(label_data['n_kv'], label_data['speed_tg_mean'],
yerr=label_data['speed_tg_std'], color=color, yerr=label_data['speed_tg_std'], color=color,
marker='s', linestyle='-', label=label) marker='s', linestyle='-', label=label)
# Add labels and title # Add labels and title
ax1.set_xlabel('Context Length (n_kv)') ax1.set_xlabel('Context Length (n_kv)')
ax1.set_ylabel('Token Generation Rate (t/s)') ax1.set_ylabel('Token Generation Rate (t/s)')
plt.title('Token Generation Performance Comparison') plt.title('Token Generation Performance Comparison')
ax1.legend(loc='upper right') ax1.legend(loc='upper right')
# Adjust layout and save # Adjust layout and save

View File

@@ -18,9 +18,9 @@
#include <vector> #include <vector>
static void print_usage(int, char ** argv) { static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n"); LOG_TEE("\nexample usage:\n");
LOG("\n %s -m model.gguf -c 8192 -b 2048 -ub 512\n", argv[0]); LOG_TEE("\n %s -m model.gguf -c 8192 -b 2048 -ub 512\n", argv[0]);
LOG("\n"); LOG_TEE("\n");
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@@ -83,7 +83,7 @@ int main(int argc, char ** argv) {
const int ret = llama_decode(ctx, batch_view); const int ret = llama_decode(ctx, batch_view);
if (ret != 0) { if (ret != 0) {
LOG("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false; return false;
} }
@@ -97,11 +97,11 @@ int main(int argc, char ** argv) {
const unsigned int tg = params.n_ubatch / 4; const unsigned int tg = params.n_ubatch / 4;
if (!params.sweep_bench_output_jsonl) { if (!params.sweep_bench_output_jsonl) {
LOG("\n"); LOG_TEE("\n");
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG("\n"); LOG_TEE("\n");
LOG("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s"); LOG_TEE("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s");
LOG("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------"); LOG_TEE("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------");
} }
llama_batch batch = llama_batch_init(n_kv_max, 0, 1); llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
@@ -111,7 +111,7 @@ int main(int argc, char ** argv) {
llama_batch_add(batch, bos, 0, { 0 }, false); llama_batch_add(batch, bos, 0, { 0 }, false);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG("%s: llama_decode() failed\n", __func__); LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
} }
@@ -131,7 +131,7 @@ int main(int argc, char ** argv) {
llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true); llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG("%s: llama_decode() failed\n", __func__); LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
} }
@@ -153,7 +153,7 @@ int main(int argc, char ** argv) {
const auto t_pp_start = ggml_time_us(); const auto t_pp_start = ggml_time_us();
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG("%s: llama_decode() failed\n", __func__); LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
@@ -167,14 +167,14 @@ int main(int argc, char ** argv) {
const float speed_tg = tg / t_tg; const float speed_tg = tg / t_tg;
if(params.sweep_bench_output_jsonl) { if(params.sweep_bench_output_jsonl) {
LOG( LOG_TEE(
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
"\"pp\": %d, \"tg\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f }\n", "\"pp\": %d, \"tg\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f }\n",
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg
); );
} else { } else {
LOG("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg); LOG_TEE("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
} }
} }