Add new sweep-bench benchmark (#225)
* examples : add new sweep-bench benchmark
* Change documentation to reference ik_llama.cpp
* Made it compile with ik_llama
* Fix JSONL output

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
@@ -1360,6 +1360,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.warmup = false;
         return true;
     }
+    if (arg == "--output-format") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "jsonl") { params.sweep_bench_output_jsonl = true; }
+        else if (value == "md") { params.sweep_bench_output_jsonl = false; }
+        else { invalid_param = true; }
+        return true;
+    }
+
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
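(Usage note, not part of the diff: `md` is the default, so JSONL output has to be requested explicitly; the model path here is a placeholder.)

    ./llama-sweep-bench -m model.gguf --output-format jsonl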
@@ -269,6 +269,8 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill

     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    bool sweep_bench_output_jsonl = false;
 };

 void gpt_params_handle_hf_token(gpt_params & params);
@@ -51,5 +51,6 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
     add_subdirectory(speculative)
+    add_subdirectory(sweep-bench)
     add_subdirectory(tokenize)
 endif()
examples/sweep-bench/CMakeLists.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
+set(TARGET llama-sweep-bench)
+add_executable(${TARGET} sweep-bench.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
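(Build note, a sketch assuming the standard CMake flow used for the other ik_llama.cpp examples, not taken from this commit:)

    cmake -B build
    cmake --build build --config Release --target llama-sweep-bench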
examples/sweep-bench/README.md (new file, 64 lines)
@@ -0,0 +1,64 @@
+# ik_llama.cpp/example/sweep-bench
+
+Benchmark the prompt processing and token generation performance of `ik_llama.cpp`
+by doing a sweep over a whole context size and gathering performance metrics
+in each ubatch-sized window. Only a single token sequence is used.
+
+The benchmark steps are:
+
+for each ubatch-sized window in context:
+
+1. generate ubatch/4 tokens (not the whole window, to save some time)
+2. measure generation performance
+3. remove generated tokens from KV cache
+4. prepare a ubatch-sized batch of random tokens
+5. process the prepared batch
+6. measure prompt processing performance
+
+The purpose of the benchmark is to visualize how the performance changes with
+the context size without averaging the metrics values over the whole context.
+
+## Usage
+
+    ./llama-sweep-bench -c 8704 -ub 512 -m models/Meta-Llama-3.2-3B-Instruct-Q8_0.gguf
+
+## Sample results
+
+- `PP` - prompt tokens per ubatch
+- `TG` - generated tokens per ubatch
+- `N_KV` - current KV cache size
+- `T_PP` - prompt processing time (i.e. time to first token)
+- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
+- `T_TG` - time to generate all batches
+- `S_TG` - text generation speed (`(B*TG)/T_TG`)
+
+|    PP |     TG |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |
+|-------|--------|--------|----------|----------|----------|----------|
+|   512 |    128 |      0 |    1.100 |   465.51 |    2.311 |    55.38 |
+|   512 |    128 |    512 |    1.183 |   432.97 |    1.895 |    67.55 |
+|   512 |    128 |   1024 |    1.305 |   392.38 |    2.071 |    61.81 |
+|   512 |    128 |   1536 |    1.279 |   400.42 |    2.164 |    59.14 |
+|   512 |    128 |   2048 |    1.571 |   325.96 |    2.280 |    56.14 |
+|   512 |    128 |   2560 |    1.431 |   357.87 |    2.418 |    52.94 |
+|   512 |    128 |   3072 |    1.515 |   337.93 |    2.566 |    49.88 |
+|   512 |    128 |   3584 |    1.588 |   322.34 |    2.722 |    47.03 |
+|   512 |    128 |   4096 |    1.675 |   305.70 |    2.864 |    44.69 |
+|   512 |    128 |   4608 |    1.769 |   289.50 |    2.999 |    42.68 |
+|   512 |    128 |   5120 |    1.845 |   277.48 |    3.102 |    41.26 |
+|   512 |    128 |   5632 |    1.893 |   270.46 |    3.219 |    39.76 |
+|   512 |    128 |   6144 |    1.953 |   262.20 |    3.348 |    38.23 |
+|   512 |    128 |   6656 |    2.018 |   253.71 |    3.474 |    36.84 |
+|   512 |    128 |   7168 |    2.078 |   246.34 |    3.589 |    35.66 |
+|   512 |    128 |   7680 |    2.140 |   239.22 |    3.717 |    34.43 |
+|   512 |    128 |   8192 |    2.196 |   233.15 |    3.854 |    33.21 |
+
+### JSONL output
+
+Pass `--output-format jsonl` to output JSONL instead of Markdown, à la
+
+```json lines
+{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 0, "t_pp": 1.093814, "speed_pp": 468.086884, "t_tg": 1.780312, "speed_tg": 71.897514 }
+{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 512, "t_pp": 1.169302, "speed_pp": 437.868073, "t_tg": 1.897474, "speed_tg": 67.458099 }
+{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1024, "t_pp": 1.183700, "speed_pp": 432.542053, "t_tg": 2.059179, "speed_tg": 62.160694 }
+{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1536, "t_pp": 1.428625, "speed_pp": 358.386566, "t_tg": 2.160639, "speed_tg": 59.241734 }
+{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 2048, "t_pp": 1.360647, "speed_pp": 376.291595, "t_tg": 2.274003, "speed_tg": 56.288403 }
+```
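(For quick inspection without the plotting script below, a minimal sketch that summarizes one JSONL run; the file name `results.jsonl` and the slowdown metric are illustrative, only the record keys come from the output format above:)

```python
import json

# Load one sweep: one JSON object per line, as produced by --output-format jsonl.
with open("results.jsonl") as f:
    rows = [json.loads(line) for line in f if line.strip()]

rows.sort(key=lambda r: r["n_kv"])
for r in rows:
    print(f"n_kv={r['n_kv']:6d}  pp={r['speed_pp']:8.2f} t/s  tg={r['speed_tg']:7.2f} t/s")

# How much token generation slows down from an empty to a full KV cache.
print(f"TG slowdown over sweep: {rows[0]['speed_tg'] / rows[-1]['speed_tg']:.2f}x")
```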
examples/sweep-bench/sweep-bench-plot.py (new executable file, 100 lines)
@@ -0,0 +1,100 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('file', nargs='+')
+args = parser.parse_args()
+
+df = None
+
+for jsonl_file in args.file:
+    # Read JSONL file into DataFrame
+    df_part = pd.read_json(jsonl_file, lines=True)
+    df_part['label'] = jsonl_file
+    if df is None:
+        df = df_part
+    else:
+        df = pd.concat([df, df_part])
+
+# Group by model and n_kv, calculate mean and std for both speed metrics
+df_grouped = df.groupby(['label', 'n_kv']).agg({
+    'speed_pp': ['mean', 'std'],
+    'speed_tg': ['mean', 'std']
+}).reset_index()
+
+# Flatten multi-index columns
+df_grouped.columns = ['label', 'n_kv', 'speed_pp_mean', 'speed_pp_std',
+                      'speed_tg_mean', 'speed_tg_std']
+
+# Replace NaN with 0 (std for a single sample is NaN)
+df_grouped['speed_pp_std'] = df_grouped['speed_pp_std'].fillna(0)
+df_grouped['speed_tg_std'] = df_grouped['speed_tg_std'].fillna(0)
+
+# Prepare ticks values for X axis (prune for readability)
+x_ticks = df['n_kv'].unique()
+while len(x_ticks) > 16:
+    x_ticks = x_ticks[::2]
+
+# Get unique labels and color map
+labels = df_grouped['label'].unique()
+colors = plt.cm.rainbow(np.linspace(0, 1, len(labels)))
+
+# Create prompt processing plot
+plt.figure(figsize=(10, 6))
+ax1 = plt.gca()
+
+plt.grid()
+
+ax1.set_xticks(x_ticks)
+
+# Plot each label's data
+for label, color in zip(labels, colors):
+    label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
+
+    # Plot prompt processing
+    pp = ax1.errorbar(label_data['n_kv'], label_data['speed_pp_mean'],
+                      yerr=label_data['speed_pp_std'], color=color,
+                      marker='o', linestyle='-', label=label)
+
+# Add labels and title
+ax1.set_xlabel('Context Length (tokens)')
+ax1.set_ylabel('Prompt Processing Rate (t/s)')
+plt.title('Prompt Processing Performance Comparison')
+
+ax1.legend(loc='upper right')
+
+# Adjust layout and save
+plt.tight_layout()
+plt.savefig('performance_comparison_pp.png', bbox_inches='tight')
+plt.close()
+
+# Create token generation plot
+plt.figure(figsize=(10, 6))
+ax1 = plt.gca()
+
+plt.grid()
+ax1.set_xticks(x_ticks)
+
+# Plot each model's data
+for label, color in zip(labels, colors):
+    label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
+
+    # Plot token generation
+    tg = ax1.errorbar(label_data['n_kv'], label_data['speed_tg_mean'],
+                      yerr=label_data['speed_tg_std'], color=color,
+                      marker='s', linestyle='-', label=label)
+
+# Add labels and title
+ax1.set_xlabel('Context Length (n_kv)')
+ax1.set_ylabel('Token Generation Rate (t/s)')
+plt.title('Token Generation Performance Comparison')
+
+ax1.legend(loc='upper right')
+
+# Adjust layout and save
+plt.tight_layout()
+plt.savefig('performance_comparison_tg.png', bbox_inches='tight')
+plt.close()
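(Usage sketch for the plotting script, with placeholder file names: each JSONL file becomes one labelled series, and repeated measurements at the same `n_kv` within a file are averaged, with the standard deviation drawn as error bars:)

    python3 sweep-bench-plot.py baseline.jsonl candidate.jsonl

The plots are written to `performance_comparison_pp.png` and `performance_comparison_tg.png` in the current directory.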
examples/sweep-bench/sweep-bench.cpp (new file, 189 lines)
@@ -0,0 +1,189 @@
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+#include "llama-vocab.h"
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -c 8192 -b 2048 -ub 512\n", argv[0]);
+    LOG("\n");
+}
+
+int main(int argc, char ** argv) {
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    const unsigned int n_kv_max = llama_n_ctx(ctx);
+
+    const llama_vocab * vocab = llama_get_vocab(ctx);
+    llama_token bos = llama_token_bos_impl(*vocab);
+    //llama_token eos = llama_token_eos_impl(*vocab);
+
+    const unsigned int n_vocab = llama_n_vocab(model);
+
+    // decode in batches of ctx_params.n_batch tokens
+    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
+        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+
+            llama_batch batch_view = {
+                n_tokens,
+                batch.token    + i,
+                nullptr,
+                batch.pos      + i,
+                batch.n_seq_id + i,
+                batch.seq_id   + i,
+                batch.logits   + i,
+            };
+
+            const int ret = llama_decode(ctx, batch_view);
+            if (ret != 0) {
+                LOG("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                return false;
+            }
+
+            llama_synchronize(ctx);
+        }
+
+        return true;
+    };
+
+    const unsigned int pp = params.n_ubatch;
+    const unsigned int tg = params.n_ubatch / 4;
+
+    if (!params.sweep_bench_output_jsonl) {
+        LOG("\n");
+        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG("\n");
+        LOG("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s");
+        LOG("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------");
+    }
+
+    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
+
+    // warm up
+    {
+        llama_batch_add(batch, bos, 0, { 0 }, false);
+
+        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+            LOG("%s: llama_decode() failed\n", __func__);
+            return 1;
+        }
+    }
+
+    llama_batch_clear(batch);
+    llama_kv_cache_clear(ctx);
+
+    for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
+        // clean up KV cache before generation
+        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
+
+        // first measure token generation performance at this context size
+        const auto t_tg_start = ggml_time_us();
+
+        for (unsigned int i = 0; i < tg; ++i) {
+            llama_batch_clear(batch);
+            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);
+
+            if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+                LOG("%s: llama_decode() failed\n", __func__);
+                return 1;
+            }
+        }
+
+        const auto t_tg_end = ggml_time_us();
+
+        // clean up KV cache after generation
+        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
+
+        // prepare batch of pp size for prompt processing performance measurement
+        llama_batch_clear(batch);
+
+        for (unsigned int i = 0; i < pp; ++i) {
+            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
+        }
+        batch.logits[batch.n_tokens - 1] = true;
+
+        // measure prompt processing performance
+        const auto t_pp_start = ggml_time_us();
+
+        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+            LOG("%s: llama_decode() failed\n", __func__);
+            return 1;
+        }
+
+        const auto t_pp_end = ggml_time_us();
+
+        // calculate and print metrics
+        const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
+        const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
+
+        const float speed_pp = pp / t_pp;
+        const float speed_tg = tg / t_tg;
+
+        if (params.sweep_bench_output_jsonl) {
+            LOG(
+                "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+                "\"pp\": %d, \"tg\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f }\n",
+                n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+                pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg
+            );
+        } else {
+            LOG("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
+        }
+    }
+
+    llama_batch_free(batch);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
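(The measurement schedule implemented by the loop above can be modelled in a few lines; a sketch of the window layout only, not of the decoding itself:)

```python
# Model of the sweep in sweep-bench.cpp: each window measures tg = n_ubatch/4
# generated tokens, then pp = n_ubatch prompt tokens, at KV depth n_kv.
def sweep_windows(n_kv_max: int, n_ubatch: int):
    pp, tg = n_ubatch, n_ubatch // 4
    return [(pp, tg, n_kv) for n_kv in range(0, n_kv_max, n_ubatch)]

# Reproduces the PP/TG/N_KV columns of the README table for -c 8704 -ub 512:
# (512, 128, 0), (512, 128, 512), ..., (512, 128, 8192)
print(sweep_windows(8704, 512))
```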