Merge branch 'main' into andrewkchan/try_trellis

This commit is contained in:
Andrew Keen Chan
2025-05-20 06:48:14 +00:00
109 changed files with 20602 additions and 6765 deletions

View File

@@ -51,5 +51,6 @@ else()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(sweep-bench)
add_subdirectory(tokenize)
endif()

View File

@@ -19,6 +19,8 @@
#include <algorithm>
#include <cmath>
#include <fstream>
#include <optional>
#include <sstream>
#include <unordered_map>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -39,6 +41,7 @@ struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
int n_as = 1;
};
class IMatrixCollector {
@@ -48,13 +51,59 @@ public:
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name);
void set_collect_lsim(bool yes_or_no) { m_collect_lsim = yes_or_no; }
void print_layer_importance();
private:
std::unordered_map<std::string, Stats> m_stats;
gpt_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
int m_last_layer = 9999;
int m_last_ffn = -1;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
std::vector<float> m_last_input;
std::vector<float> m_ffn_input;
std::vector<std::pair<double,int>> m_layer_sim;
std::vector<std::pair<double,int>> m_attn_sim;
std::vector<std::pair<double,int>> m_ffn_sim;
bool m_collect_lsim = false;
// Extract the layer index encoded in a tensor name of the form "blk.<i>....".
// The model's output tensor is treated as belonging to the layer following the
// last one seen (the m_last_layer < 199 check is a sanity cap on layer count).
// Returns std::nullopt when no layer index can be derived from the name.
std::optional<int> layer_index(const std::string& name) const {
    if (name == m_params.output_tensor_name && m_last_layer < 199) {
        return m_last_layer + 1;
    }
    if (name.compare(0, 4, "blk.") != 0) return std::nullopt;
    auto dot = name.find('.', 4);
    if (dot == std::string::npos) return std::nullopt;
    std::istringstream parser(name.substr(4, dot - 4));
    int index;
    parser >> index;
    if (parser.fail()) return std::nullopt;
    return index;
}
// Cosine similarity between two n-dimensional float vectors x and y,
// accumulated in double precision. Returns 0 when either vector has zero
// norm, which avoids a division by zero for all-zero activations.
static inline double cosine_similarity(int n, const float * x, const float * y) {
    double sumxy = 0, sumx2 = 0, sumy2 = 0;
    for (int j = 0; j < n; ++j) {
        sumxy += x[j]*y[j]; sumx2 += x[j]*x[j]; sumy2 += y[j]*y[j];
    }
    // std::sqrt instead of plain sqrt: <cmath> is only guaranteed to declare
    // the std-qualified name.
    double cos_sim = sumx2 > 0 && sumy2 > 0 ? sumxy/std::sqrt(sumx2*sumy2) : 0;
    return cos_sim;
}
// Accumulate the cosine similarity of nrow row pairs (each row of length n)
// into p: p.first is the running sum of similarities, p.second the sample
// count. x and y point at the first rows of two row-major matrices.
static inline void collect_cos_similarity(int nrow, int n, const float * x, const float * y, std::pair<double, int>& p) {
    for (int row = 0; row < nrow; ++row) {
        const float * xr = x + (size_t)row * n;
        const float * yr = y + (size_t)row * n;
        p.first += cosine_similarity(n, xr, yr);
        ++p.second;
    }
}
static void print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim);
};
// remove any prefix and suffixes from the name
@@ -76,6 +125,45 @@ static std::string filter_tensor_name(const char * name) {
return wname;
}
// Print one importance table. Entries with at least one sample are ranked
// ascending by the absolute average cosine similarity, so the layers whose
// output differs most from their input (lowest similarity, i.e. the most
// "important" layers) are listed first. Does nothing if no data was gathered.
void IMatrixCollector::print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim) {
    if (sim.empty()) return;
    std::vector<std::pair<float, int>> ranked;
    ranked.reserve(sim.size());
    for (int layer = 0; layer < int(sim.size()); ++layer) {
        const auto& [sum, count] = sim[layer];
        if (count > 0) ranked.emplace_back(float(std::abs(sum/count)), layer);
    }
    if (ranked.empty()) return;
    std::sort(ranked.begin(), ranked.end());
    printf("%s\n", msg);
    int rank = 0;
    for (const auto& entry : ranked) {
        const int layer = entry.second;
        printf("%3d: Layer %3d, <cos_sim> = %g\n", rank++, layer, sim[layer].first/sim[layer].second);
    }
}
// Print the per-layer importance tables gathered while collecting the imatrix
// (enabled via -lsim / --layer-similarity): whole-layer, attention-only, and
// ffn-only cosine-similarity rankings. Tables with no data are skipped by the
// helper. The previously commented-out duplicate of the helper's body has
// been removed as dead code.
void IMatrixCollector::print_layer_importance() {
    print_layer_importance("\n======================== sorted layer importances", m_layer_sim);
    print_layer_importance("\n======================== sorted attention importances", m_attn_sim);
    print_layer_importance("\n======================== sorted ffn importances", m_ffn_sim);
}
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
GGML_UNUSED(user_data);
@@ -91,7 +179,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
//printf("wname = %s\n", wname.c_str());
if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == m_params.output_tensor_name))) return false;
if (!(wname.substr(0, 4) == "blk." || ((m_params.process_output || m_collect_lsim) && wname == m_params.output_tensor_name))) return false;
return true;
}
@@ -107,6 +195,33 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
if (m_collect_lsim) {
if (wname.find(".ffn_") != std::string::npos) {
if (auto index = layer_index(wname); index.has_value() && *index == m_last_layer && *index != m_last_ffn) {
int n = src1->ne[0];
int nrow = t->op == GGML_OP_MUL_MAT_ID ? src1->ne[2] : src1->ne[1];
if (t->op == GGML_OP_MUL_MAT_ID) {
GGML_ASSERT(src1->ne[1] == 1);
}
if (m_ffn_input.empty()) {
m_ffn_input.resize(nrow*n);
} else {
if ((int)m_ffn_input.size() != nrow*n) {
printf("Oops, inconsistent ffn size\n"); exit(1);
}
}
std::memcpy(m_ffn_input.data(), data, nrow*n*sizeof(float));
if (m_ffn_input.size() != m_last_input.size()) {
printf("Oops, inconsistent ffn vs last_input size\n"); exit(1);
}
if (m_attn_sim.size() < *index + 1) m_attn_sim.resize(*index + 1);
auto& p = m_attn_sim[*index];
collect_cos_similarity(nrow, n, m_ffn_input.data(), m_last_input.data(), p);
m_last_ffn = *index;
}
}
}
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -132,11 +247,15 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
e.n_as = n_as;
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ABORT("fatal error");
}
else if (e.n_as != n_as) {
fprintf(stderr, "Oops: inconsistent n_as for %s (%d vs %d)\n", wname.c_str(), e.n_as, n_as);
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
}
@@ -177,6 +296,39 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
}
} else {
if (m_collect_lsim) {
// We only need to do it here and not in the MoE branch above because the first tensor in a layer
// never is a MoE tensor
if (auto index = layer_index(wname); index.has_value()) {
if (*index != m_last_layer) {
if (*index > 0) {
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
printf("Oops: different size (%d vs %d). Tensor name was %s, m_last_layer = %d\n",
(int)(src1->ne[0]*src1->ne[1]), (int)m_last_input.size(), src0->name, m_last_layer);
exit(1);
}
if (*index > m_layer_sim.size()) m_layer_sim.resize(*index);
auto& p = m_layer_sim[*index - 1];
collect_cos_similarity(src1->ne[1], src1->ne[0], m_last_input.data(), (const float *)data, p);
if (*index == m_last_ffn + 1) {
if (*index > m_ffn_sim.size()) m_ffn_sim.resize(*index);
auto& p1 = m_ffn_sim[*index-1];
collect_cos_similarity(src1->ne[1], src1->ne[0], m_ffn_input.data(), (const float *)data, p1);
}
}
m_last_layer = *index;
if (m_last_input.empty()) {
m_last_input.resize(src1->ne[0]*src1->ne[1]);
} else {
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
printf("Oops\n"); exit(1);
}
}
//printf("Copying src1 to m_last_input\n");
std::memcpy(m_last_input.data(), data, src1->ne[0]*src1->ne[1]*sizeof(float));
}
}
}
auto & e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
@@ -190,7 +342,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) {
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
@@ -258,8 +410,38 @@ void IMatrixCollector::save_imatrix(int ncall) const {
}
if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%)", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
bool store_it = false;
if (kv.second.n_as > 1) {
int n_per_expert = n_all / kv.second.n_as;
std::vector<int> bad_experts;
bad_experts.reserve(kv.second.n_as);
for (int i = 0; i < kv.second.n_as; ++i) {
auto counts = kv.second.counts.data() + i*n_per_expert;
int nz_i = 0;
for (int j = 0; j < n_per_expert; ++j) {
if (counts[j] == 0) ++nz_i;
}
if (nz_i > 0) bad_experts.push_back(i);
}
fprintf(stderr, " %d out of %d experts are missing data", int(bad_experts.size()), kv.second.n_as);
if (bad_experts.size() < round(kv.second.n_as * 0.05)) {
fprintf(stderr, " Storing **but be aware**\n");
store_it = true;
for (auto i : bad_experts) {
auto counts = (int *)kv.second.counts.data() + i*n_per_expert;
auto values = (float *)kv.second.values.data() + i*n_per_expert;
for (int j = 0; j < n_per_expert; ++j) {
counts[j] = 1;
values[j] = 1;
}
}
}
}
if (!store_it) {
fprintf(stderr, " - skipping\n");
continue;
}
}
n_entries++;
@@ -587,7 +769,25 @@ int main(int argc, char ** argv) {
params.logits_all = true;
params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params)) {
bool lsim = false;
//
// Do not pollute common with totally imatrix specific arguments as it was done in mainline.
// Instead, parse imatrix specific args here, push unknown args into a new array of args,
// and pass that to gpt_params_parse().
//
std::vector<char*> args;
args.reserve(argc);
args.push_back(argv[0]);
for (int i = 1; i < argc; ++i) {
std::string arg{argv[i]};
if (arg == "-lsim" || arg == "--layer-similarity") {
lsim = true;
} else {
args.push_back(argv[i]);
}
}
if (!gpt_params_parse(args.size(), args.data(), params)) {
print_usage(argc, argv, params);
return 1;
}
@@ -595,6 +795,7 @@ int main(int argc, char ** argv) {
params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params);
g_collector.set_collect_lsim(lsim);
for (const auto & in_file : params.in_files) {
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
@@ -645,6 +846,7 @@ int main(int argc, char ** argv) {
}
g_collector.save_imatrix();
g_collector.print_layer_importance();
llama_print_timings(ctx);

View File

@@ -1,3 +1,10 @@
//
// Copyright (C) 2023-2025 The llama.cpp authors
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include <algorithm>
#include <array>
#include <cassert>
@@ -41,6 +48,12 @@ static uint64_t get_time_ns() {
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}
// Stream a std::pair as "{first, second}" (used by join() for printing
// thread-count pairs and similar parameter values).
template <typename T1, typename T2>
std::ostream& operator<<(std::ostream& out, const std::pair<T1, T2>& item) {
    return out << '{' << item.first << ", " << item.second << '}';
}
template<class T>
static std::string join(const std::vector<T> & values, const std::string & delim) {
std::ostringstream str;
@@ -215,6 +228,9 @@ static std::string pair_str(const std::pair<int, int> & p) {
return buf;
}
// Ser = Smart Expert Reduction
using Ser = std::pair<int,float>;
struct cmd_params {
std::vector<std::string> model;
std::vector<int> n_prompt;
@@ -225,21 +241,27 @@ struct cmd_params {
std::vector<int> n_ubatch;
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<std::pair<int,int>> n_threads;
std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> flash_attn;
std::vector<int> mla_attn;
std::vector<int> attn_max_batch;
std::vector<Ser> ser;
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
std::vector<llama_model_tensor_buft_override> buft_overrides;
ggml_numa_strategy numa;
int reps;
bool verbose;
bool warmup;
bool repack = false;
bool fmoe = false;
bool use_thp = false;
output_formats output_format;
output_formats output_format_stderr;
};
@@ -254,21 +276,27 @@ static const cmd_params cmd_params_defaults = {
/* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
/* n_threads */ {{cpu_get_num_math(), cpu_get_num_math()}},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* flash_attn */ {false},
/* mla_attn */ {0},
/* attn_max_batch */ {0},
/* ser */ {{-1,0.0f}},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* embeddings */ {false},
/* buft_overrides */ {},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* verbose */ false,
/* warmup */ true,
/* repack */ false,
/* use_thp */ false,
/* fmoe */ false,
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
};
@@ -288,12 +316,16 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -tgb, --threads-gen-batch <n1,n2> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mla, --mla-attn <0|1|2> (default: %s)\n", join(cmd_params_defaults.mla_attn, ",").c_str());
printf(" -amb, --attn-max-batch <i> (default: %s)\n", join(cmd_params_defaults.attn_max_batch, ",").c_str());
printf(" -ser, --smart-expert-reduction <i,f>(default: %s)\n", join(cmd_params_defaults.attn_max_batch, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
@@ -304,6 +336,9 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf(" -w, --warmup <0|1> (default: %s)\n", cmd_params_defaults.warmup ? "1" : "0");
printf(" -rtr, --run-time-repack <0|1> (default: %s)\n", cmd_params_defaults.repack ? "1" : "0");
printf(" -thp, --transparent-huge-pages <0|1> (default: %s)\n", cmd_params_defaults.use_thp? "1" : "0");
printf(" -ot, --override-tensor pattern (default: none)\n");
printf(" -fmoe, --fused-moe <0|1> (default: %s)\n", cmd_params_defaults.fmoe? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
@@ -336,10 +371,68 @@ static ggml_type ggml_type_from_name(const std::string & s) {
if (s == "q6_0") {
return GGML_TYPE_Q6_0;
}
if (s == "q8_KV") {
return GGML_TYPE_Q8_KV;
}
return GGML_TYPE_COUNT;
}
namespace {
// Parse a comma separated list of "tensor_name=buffer_type" entries and append
// them to `overrides`. The available buffer types are enumerated once from the
// registered backends. Returns false (after printing diagnostics to stderr) on
// a malformed entry or an unknown buffer type name.
bool parse_buft_overrides(const std::string& value, std::vector<llama_model_tensor_buft_override>& overrides) {
    /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
    if (buft_list.empty()) {
        // enumerate all the devices and add their buffer types to the list
        for (size_t i = 0; i < ggml_backend_reg_get_count(); ++i) {
            if (auto * buft = ggml_backend_reg_get_default_buffer_type(i)) {
                buft_list[ggml_backend_buft_name(buft)] = buft;
            }
        }
    }
    for (const auto & entry : string_split<std::string>(value, ',')) {
        auto eq = entry.find('=');
        if (eq == std::string::npos) {
            fprintf(stderr, "Invalid buft override argument %s\n", value.c_str());
            return false;
        }
        auto tensor_name = entry.substr(0, eq);
        auto buffer_type = entry.substr(eq + 1);
        auto it = buft_list.find(buffer_type);
        if (it == buft_list.end()) {
            fprintf(stderr, "Available buffer types:\n");
            for (const auto & kv : buft_list) {
                fprintf(stderr, " %s\n", ggml_backend_buft_name(kv.second));
            }
            return false;
        }
        // NOTE(review): the strdup'ed name appears to be intentionally leaked;
        // the override list presumably lives for the lifetime of the program.
        overrides.push_back({strdup(tensor_name.c_str()), it->second});
    }
    return true;
}
template<class T1, class T2>
std::vector<std::pair<T1,T2>> string_split_pairs(const std::string & str, char delim) {
std::vector<std::pair<T1,T2>> values;
std::istringstream str_stream(str);
std::string token;
T1 first_value;
int i = 0;
while (std::getline(str_stream, token, delim)) {
std::istringstream token_stream(token);
if (i%2 == 0) {
token_stream >> first_value;
if (token_stream.fail()) return {};
} else {
T2 value;
token_stream >> value;
if (token_stream.fail()) return {};
values.emplace_back(first_value, value);
}
i++;
}
return values;
}
}
static cmd_params parse_cmd_params(int argc, char ** argv) {
cmd_params params;
@@ -459,7 +552,23 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
params.n_threads.reserve(params.n_threads.size() + p.size());
for (auto t : p) params.n_threads.push_back({t, t});
//params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-tgb" || arg == "--threads-gen-batch") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto ps = string_split<std::string>(argv[i], ';');
for (auto& s : ps) {
auto p = string_split<int>(s.c_str(), ',');
if (p.size() != 2) {
invalid_param = true;
break;
}
params.n_threads.push_back({p[0], p[1]});
}
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
@@ -526,6 +635,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mla" || arg == "--mla-attn") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.mla_attn.insert(params.mla_attn.end(), p.begin(), p.end());
} else if (arg == "-amb" || arg == "--attn-max-batch") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.attn_max_batch.insert(params.attn_max_batch.end(), p.begin(), p.end());
} else if (arg == "-ser" || arg == "--smart-expert-reduction") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split_pairs<int,float>(argv[i], split_delim);
params.ser.insert(params.ser.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
@@ -594,6 +724,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.repack = std::stoi(argv[i]);
} else if (arg == "-thp" || arg == "--transparent-huge-pages") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.use_thp = std::stoi(argv[i]);
} else if (arg == "-fmoe" || arg == "--fused-moe") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.fmoe = std::stoi(argv[i]);
} else if (arg == "-ot" || arg == "--override-tensor") {
if (++i >= argc) {
invalid_param = true;
break;
}
if (!parse_buft_overrides(std::string{argv[i]}, params.buft_overrides)) {
fprintf(stderr, "error: Invalid tensor buffer type override: %s\n", argv[i]);
invalid_param = true;
break;
}
} else {
invalid_param = true;
break;
@@ -621,10 +773,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
if (params.mla_attn.empty()) { params.mla_attn = cmd_params_defaults.mla_attn; }
if (params.attn_max_batch.empty()){ params.attn_max_batch = cmd_params_defaults.attn_max_batch; }
if (params.ser.empty()) { params.ser = cmd_params_defaults.ser; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
if (!params.buft_overrides.empty()) params.buft_overrides.emplace_back(llama_model_tensor_buft_override{nullptr, nullptr});
return params;
}
@@ -649,17 +805,23 @@ struct cmd_params_instance {
int n_ubatch;
ggml_type type_k;
ggml_type type_v;
int n_threads;
std::pair<int,int> n_threads;
int n_gpu_layers;
std::string rpc_servers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool flash_attn;
int mla_attn;
int attn_max_batch;
Ser ser;
std::vector<float> tensor_split;
bool use_mmap;
bool embeddings;
bool repack = false;
bool fmoe = false;
bool use_thp = false;
const llama_model_tensor_buft_override* buft_overrides;
llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
@@ -673,6 +835,8 @@ struct cmd_params_instance {
mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
mparams.repack_tensors = repack;
mparams.use_thp = use_thp;
mparams.tensor_buft_overrides = buft_overrides;
return mparams;
}
@@ -685,6 +849,7 @@ struct cmd_params_instance {
main_gpu == other.main_gpu &&
use_mmap == other.use_mmap &&
repack == other.repack &&
use_thp == other.use_thp &&
tensor_split == other.tensor_split;
}
@@ -698,6 +863,11 @@ struct cmd_params_instance {
cparams.type_v = type_v;
cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn = flash_attn;
cparams.mla_attn = mla_attn;
cparams.attn_max_batch = attn_max_batch;
cparams.fused_moe_up_gate = fmoe;
cparams.min_experts = ser.first;
cparams.thresh_experts = ser.second;
cparams.embeddings = embeddings;
return cparams;
@@ -722,6 +892,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
for (const auto & mla : params.mla_attn)
for (const auto & amb : params.attn_max_batch)
for (const auto & ser : params.ser)
for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
@@ -743,10 +916,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .mla_attn = */ mla,
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
/* .fmoe = */ params.fmoe,
/* .use_thp = */ params.use_thp,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
}
@@ -771,10 +950,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .mla_attn = */ mla,
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
/* .fmoe = */ params.fmoe,
/* .use_thp = */ params.use_thp,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
}
@@ -799,10 +984,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .mla_attn = */ mla,
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
/* .fmoe = */ params.fmoe,
/* .use_thp = */ params.use_thp,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
}
@@ -827,10 +1018,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .mla_attn = */ mla,
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
/* .fmoe = */ params.fmoe,
/* .use_thp = */ params.use_thp,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
}
@@ -857,7 +1054,7 @@ struct test {
uint64_t model_n_params;
int n_batch;
int n_ubatch;
int n_threads;
std::pair<int,int> n_threads;
bool has_rpc;
ggml_type type_k;
ggml_type type_v;
@@ -866,10 +1063,15 @@ struct test {
int main_gpu;
bool no_kv_offload;
bool flash_attn;
int mla_attn;
int attn_max_batch;
Ser ser;
std::vector<float> tensor_split;
bool use_mmap;
bool embeddings;
bool repack = false;
bool fmoe = false;
bool use_thp = false;
int n_prompt;
int n_gen;
std::string test_time;
@@ -895,10 +1097,15 @@ struct test {
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
flash_attn = inst.flash_attn;
mla_attn = inst.mla_attn;
attn_max_batch = inst.attn_max_batch;
ser = inst.ser;
tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
repack = inst.repack;
fmoe = inst.fmoe;
use_thp = inst.use_thp;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
test_kind = inst.test_kind;
@@ -988,8 +1195,8 @@ struct test {
"n_batch", "n_ubatch",
"n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings", "repack",
"main_gpu", "no_kv_offload", "flash_attn", "mla_attn", "attn_max_batch", "ser",
"tensor_split", "use_mmap", "embeddings", "repack", "fused_moe", "use_thp",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts", "test",
@@ -1004,13 +1211,14 @@ struct test {
field == "n_threads" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
field == "n_prompt" || field == "n_gen" || field == "mla_attn" || field == "attn_max_batch" ||
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack") {
field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" ||
field == "fused_moe") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -1035,6 +1243,12 @@ struct test {
tensor_split_str += "/";
}
}
auto ser_to_string = [] (const Ser& ser) {
std::ostringstream str;
str << ser.first << ',' << ser.second;
return str.str();
};
bool is_gen = n_gen > 0;
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
@@ -1042,10 +1256,12 @@ struct test {
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(is_gen ? n_threads.first : n_threads.second), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), std::to_string(repack),
std::to_string(mla_attn), std::to_string(attn_max_batch), ser_to_string(ser),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(repack), std::to_string(fmoe), std::to_string(use_thp),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts()),
@@ -1208,12 +1424,27 @@ struct markdown_printer : public printer {
if (field == "flash_attn") {
return 2;
}
if (field == "mla_attn") {
return 3;
}
if (field == "attn_max_batch") {
return 5;
}
if (field == "ser") {
return 10;
}
if (field == "use_mmap") {
return 4;
}
if (field == "repack") {
return 3;
}
if (field == "use_thp") {
return 3;
}
if (field == "fused_moe") {
return 4;
}
if (field == "test") {
return 13;
}
@@ -1242,12 +1473,27 @@ struct markdown_printer : public printer {
if (field == "flash_attn") {
return "fa";
}
if (field == "mla_attn") {
return "mla";
}
if (field == "attn_max_batch") {
return "amb";
}
if (field == "attn_max_batch") {
return "ser";
}
if (field == "use_mmap") {
return "mmap";
}
if (field == "repack") {
return "rtr";
}
if (field == "use_thp") {
return "thp";
}
if (field == "fused_moe") {
return "fmoe";
}
if (field == "embeddings") {
return "embd";
}
@@ -1294,6 +1540,15 @@ struct markdown_printer : public printer {
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
fields.emplace_back("flash_attn");
}
if (params.mla_attn.size() > 1 || params.mla_attn != cmd_params_defaults.mla_attn) {
fields.emplace_back("mla_attn");
}
if (params.attn_max_batch.size() > 1 || params.attn_max_batch != cmd_params_defaults.mla_attn) {
fields.emplace_back("attn_max_batch");
}
if (params.ser.size() > 1 || params.ser != cmd_params_defaults.ser) {
fields.emplace_back("ser");
}
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.emplace_back("tensor_split");
}
@@ -1306,6 +1561,12 @@ struct markdown_printer : public printer {
if (params.repack != cmd_params_defaults.repack) {
fields.emplace_back("repack");
}
if (params.use_thp != cmd_params_defaults.use_thp) {
fields.emplace_back("use_thp");
}
if (params.fmoe != cmd_params_defaults.fmoe) {
fields.emplace_back("fused_moe");
}
fields.emplace_back("test");
fields.emplace_back("t/s");
@@ -1557,10 +1818,10 @@ int main(int argc, char ** argv) {
if (params.warmup) {
if (t.n_prompt > 0) {
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
test_prompt(ctx, 1, 0, t.n_batch, t.n_threads.second);
}
if (t.n_gen > 0) {
test_gen(ctx, 1, 0, t.n_threads);
test_gen(ctx, 1, 0, t.n_threads.first);
}
}
@@ -1570,11 +1831,11 @@ int main(int argc, char ** argv) {
uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads.second);
}
if (t.test_kind == TEST_KIND_GP) t_start = get_time_ns();
if (t.n_gen > 0) {
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads.first);
}
uint64_t t_ns = get_time_ns() - t_start;

View File

@@ -1,3 +1,10 @@
//
// Copyright (C) 2023-2025 The llama.cpp authors
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include "common.h"
#include "llama.h"
@@ -126,7 +133,7 @@ static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob
max_logit = std::max(max_logit, logits[i]);
min_logit = std::min(min_logit, logits[i]);
}
min_logit = std::max(min_logit, max_logit - 16);
min_logit = std::max(min_logit, max_logit - 24);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
@@ -166,7 +173,7 @@ static void process_logits(
break;
}
lock.unlock();
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
const results_log_softmax results = log_softmax(n_vocab, logits + int64_t(i)*n_vocab, tokens[i+1]);
const double v = -results.log_softmax;
local_nll += v;
local_nll2 += v*v;
@@ -200,7 +207,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
break;
}
lock.unlock();
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
const double v = log_softmax(n_vocab, logits + int64_t(i)*n_vocab, log_probs.data() + int64_t(i)*nv, tokens[i+1]);
local_nll += v;
local_nll2 += v*v;
}
@@ -618,7 +625,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
if (num_batches > 1 && n_outputs > 0) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
logits.insert(logits.end(), batch_logits, batch_logits + int64_t(n_outputs) * n_vocab);
}
}

View File

@@ -1,3 +1,10 @@
//
// Copyright (C) 2023-2025 The llama.cpp authors
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#define LLAMA_API_INTERNAL
#include "common.h"
#include "ggml.h"

View File

@@ -1,3 +1,10 @@
//
// Copyright (C) 2023-2025 The llama.cpp authors
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include "common.h"
#include "llama.h"
@@ -58,6 +65,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_0_R4", LLAMA_FTYPE_MOSTLY_Q5_0_R4, " 5.50 bpw quantization", },
{ "Q6_0_R4", LLAMA_FTYPE_MOSTLY_Q6_0_R4, " 6.50 bpw quantization", },
{ "Q8_0_R8", LLAMA_FTYPE_MOSTLY_Q8_0_R8, " 8.50 bpw quantization", },
{ "Q8_KV", LLAMA_FTYPE_MOSTLY_Q8_KV, " 8.00 bpw quantization", },
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
{ "IQ4_KS", LLAMA_FTYPE_MOSTLY_IQ4_KS, " 4.25 bpw non-linear quantization", },
{ "IQ4_KS_R4",LLAMA_FTYPE_MOSTLY_IQ4_KS_R4,"IQ4_KS repacked", },
@@ -85,6 +93,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
{ "Q6_K_R4", LLAMA_FTYPE_MOSTLY_Q6_K_R4, "Q6_K repacked", },
{ "Q8_K_R8", LLAMA_FTYPE_MOSTLY_Q8_K_R8, "Q8_K repacked", },
{ "Q8_KV_R8", LLAMA_FTYPE_MOSTLY_Q8_KV_R8, "Q8_KV repacked", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
@@ -136,15 +145,19 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
printf(" --hide-imatrix: do not store imatrix details in the quantized model\n");
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --exclude-weights tensor_name: do not use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token_embd.weight tensor.\n\n");
printf(" --custom-q regex1=type1,regex2=type2...: use this to specify custom quantization type rules.\n\n");
printf(" --repack Repack all tensors to the corresponding _r4/8 variant if available.\n\n");
printf(" --repack-pattern Comma separated list of regexs to use for matching tensor names to be repacked.\n\n");
printf("Additional specific tensor quantization types used in the custom quant scheme 'CQS (default is Q2_K):\n");
printf(" --attn-q-type ggml_type: use this ggml_type for the attn_q.weight tensor.\n");
printf(" --attn-k-type ggml_type: use this ggml_type for the attn_k.weight tensor.\n");
@@ -291,6 +304,28 @@ static ggml_type parse_ggml_type(const char * arg) {
return result;
}
using CustomQ = std::pair<std::string, ggml_type>;
static bool parse_custom_quants(const std::string& arg, std::vector<CustomQ>& custom_quants) {
for (const auto & item : string_split<std::string>(arg, ',')) {
auto pos = item.find('=');
if (pos == std::string::npos) {
fprintf(stderr, "Invalid custom quantization input %s\n", arg.c_str());
return false;
}
auto pattern = item.substr(0, pos);
auto type_as_string = item.substr(pos + 1);
auto type = parse_ggml_type(type_as_string.c_str());
if (type == GGML_TYPE_COUNT) {
fprintf(stderr, "Invalid quantization type '%s' in custom quantization input %s\n", type_as_string.c_str(), item.c_str());
return false;
}
printf("Adding custom rule %s -> %s\n", pattern.c_str(), ggml_type_name(type));
custom_quants.emplace_back(std::move(pattern), type);
}
return true;
}
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -302,12 +337,26 @@ int main(int argc, char ** argv) {
std::string imatrix_file;
std::vector<std::string> included_weights, excluded_weights;
std::vector<llama_model_kv_override> kv_overrides;
std::vector<CustomQ> custom_quants;
std::vector<std::string> repack_patterns;
bool hide_imatrix = false;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
params.ignore_imatrix_rules = true;
} else if (strcmp(argv[arg_idx], "--repack") == 0) {
params.only_repack = true;
} else if (strcmp(argv[arg_idx], "--repack-pattern") == 0) {
if (arg_idx < argc-1) {
auto p = string_split(argv[++arg_idx], ',');
repack_patterns.insert(repack_patterns.end(), p.begin(), p.end());
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
if (arg_idx < argc-1) {
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
@@ -372,6 +421,10 @@ int main(int argc, char ** argv) {
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--custom-q") == 0) {
if (arg_idx == argc-1 || !parse_custom_quants(argv[++arg_idx], custom_quants)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -382,6 +435,8 @@ int main(int argc, char ** argv) {
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--hide-imatrix") == 0) {
hide_imatrix = true;
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
if (arg_idx < argc-1) {
included_weights.emplace_back(argv[++arg_idx]);
@@ -401,6 +456,10 @@ int main(int argc, char ** argv) {
}
}
if (!repack_patterns.empty()) {
params.repack_pattern = &repack_patterns;
}
if (argc - arg_idx < 2) {
printf("%s: bad arguments\n", argv[0]);
usage(argv[0]);
@@ -418,7 +477,11 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
if (hide_imatrix) {
strncpy(kvo.val_str, "top_secret", 127);
} else {
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
}
kvo.val_str[127] = '\0';
kv_overrides.emplace_back(std::move(kvo));
}
@@ -426,7 +489,11 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
if (hide_imatrix) {
strncpy(kvo.val_str, "top_secret", 127);
} else {
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
}
kvo.val_str[127] = '\0';
kv_overrides.emplace_back(std::move(kvo));
}
@@ -435,7 +502,11 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = imatrix_data.size();
if (hide_imatrix) {
kvo.val_i64 = 0;
} else {
kvo.val_i64 = imatrix_data.size();
}
kv_overrides.emplace_back(std::move(kvo));
}
@@ -443,7 +514,11 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = m_last_call;
if (hide_imatrix) {
kvo.val_i64 = 0;
} else {
kvo.val_i64 = m_last_call;
}
kv_overrides.emplace_back(std::move(kvo));
}
}
@@ -452,6 +527,9 @@ int main(int argc, char ** argv) {
kv_overrides.back().key[0] = 0;
params.kv_overrides = &kv_overrides;
}
if (!custom_quants.empty()) {
params.custom_quants = &custom_quants;
}
llama_backend_init();

View File

@@ -0,0 +1,5 @@
# Build and install the sweep-bench example binary.
set(TARGET llama-sweep-bench)
add_executable(${TARGET} sweep-bench.cpp)
# Link against the shared example code, the core library, and threads.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
install(TARGETS ${TARGET} RUNTIME)

View File

@@ -0,0 +1,65 @@
# ik_llama.cpp/example/sweep-bench
Benchmark the prompt processing and token generation performance of `ik_llama.cpp`
by doing a sweep over a whole context size and gathering performance metrics
in each ubatch-sized window. Only a single token sequence is used.
The benchmark steps are:
for each ubatch-sized window in context:
1. generate ubatch/4 tokens (not the whole window to save some time)
2. measure generation performance
3. remove generated tokens from KV cache
4. prepare a ubatch-sized batch of random tokens
5. process prepared batch
6. measure prompt processing performance
The purpose of the benchmark is to visualize how the performance changes with
the context size without averaging the metrics values over the whole context.
## Usage
./llama-sweep-bench -c 8704 -ub 512 -m models/Meta-Llama-3.2-3B-Instruct-Q8_0.gguf
## Sample results
- `PP` - prompt tokens per ubatch
- `TG` - generated tokens per ubatch
- `N_KV` - current KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|-------|--------|--------|----------|----------|----------|----------|
| 512 | 128 | 0 | 1.100 | 465.51 | 2.311 | 55.38 |
| 512 | 128 | 512 | 1.183 | 432.97 | 1.895 | 67.55 |
| 512 | 128 | 1024 | 1.305 | 392.38 | 2.071 | 61.81 |
| 512 | 128 | 1536 | 1.279 | 400.42 | 2.164 | 59.14 |
| 512 | 128 | 2048 | 1.571 | 325.96 | 2.280 | 56.14 |
| 512 | 128 | 2560 | 1.431 | 357.87 | 2.418 | 52.94 |
| 512 | 128 | 3072 | 1.515 | 337.93 | 2.566 | 49.88 |
| 512 | 128 | 3584 | 1.588 | 322.34 | 2.722 | 47.03 |
| 512 | 128 | 4096 | 1.675 | 305.70 | 2.864 | 44.69 |
| 512 | 128 | 4608 | 1.769 | 289.50 | 2.999 | 42.68 |
| 512 | 128 | 5120 | 1.845 | 277.48 | 3.102 | 41.26 |
| 512 | 128 | 5632 | 1.893 | 270.46 | 3.219 | 39.76 |
| 512 | 128 | 6144 | 1.953 | 262.20 | 3.348 | 38.23 |
| 512 | 128 | 6656 | 2.018 | 253.71 | 3.474 | 36.84 |
| 512 | 128 | 7168 | 2.078 | 246.34 | 3.589 | 35.66 |
| 512 | 128 | 7680 | 2.140 | 239.22 | 3.717 | 34.43 |
| 512 | 128 | 8192 | 2.196 | 233.15 | 3.854 | 33.21 |
### JSONL output
Pass `--output-format jsonl` to output JSONL instead of Markdown, à la
```json lines
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 0, "t_pp": 1.093814, "speed_pp": 468.086884, "t_tg": 1.780312, "speed_tg": 71.897514 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 512, "t_pp": 1.169302, "speed_pp": 437.868073, "t_tg": 1.897474, "speed_tg": 67.458099 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1024, "t_pp": 1.183700, "speed_pp": 432.542053, "t_tg": 2.059179, "speed_tg": 62.160694 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1536, "t_pp": 1.428625, "speed_pp": 358.386566, "t_tg": 2.160639, "speed_tg": 59.241734 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 2048, "t_pp": 1.360647, "speed_pp": 376.291595, "t_tg": 2.274003, "speed_tg": 56.288403 }
```

View File

@@ -0,0 +1,118 @@
"""Plot sweep-bench results.

Reads one or more Markdown result tables produced by llama-sweep-bench
(passed as positional arguments), averages repeated runs per context
size, and writes two PNGs: performance_comparison_pp.png (prompt
processing) and performance_comparison_tg.png (token generation).
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import argparse

parser = argparse.ArgumentParser()
# Each positional argument is one Markdown table file; its path is used
# as the series label in the plots.
parser.add_argument('file', nargs='+')

args = parser.parse_args()

df = None

# Alternative JSONL ingestion path, kept for reference (currently the
# script parses the Markdown table output instead).
#for jsonl_file in args.file:
#    # Read JSONL file into DataFrame
#    df_part = pd.read_json(jsonl_file, lines=True)
#    df_part['label'] = jsonl_file
#    if df is None:
#        df = df_part
#    else:
#        df = pd.concat([df, df_part])
#

for md_file in args.file:
    # Read markdown table file into DataFrame; the regex separator
    # swallows the surrounding whitespace of each '|' delimiter, and
    # skiprows=[1] drops the |---|---| alignment row.
    df_part = pd.read_csv(md_file, sep=r'\s*\|\s*', engine='python',
                          header=0, skiprows=[1])

    # Clean up columns (remove empty columns from markdown formatting,
    # i.e. the leading/trailing '|' of every row)
    df_part = df_part.iloc[:, 1:-1]
    df_part.columns = [col.strip() for col in df_part.columns]

    # Rename columns to match expected names
    df_part = df_part.rename(columns={
        'N_KV': 'n_kv',
        'S_PP t/s': 'speed_pp',
        'S_TG t/s': 'speed_tg'
    })

    # Convert to numeric types
    df_part['n_kv'] = pd.to_numeric(df_part['n_kv'])
    df_part['speed_pp'] = pd.to_numeric(df_part['speed_pp'])
    df_part['speed_tg'] = pd.to_numeric(df_part['speed_tg'])

    # Add label and append to main DataFrame
    df_part['label'] = md_file
    df = pd.concat([df, df_part]) if df is not None else df_part

# Group by label and n_kv, calculate mean and std for both speed metrics
# (multiple input files with the same label, or repeated n_kv rows, are
# averaged and get error bars).
df_grouped = df.groupby(['label', 'n_kv']).agg({
    'speed_pp': ['mean', 'std'],
    'speed_tg': ['mean', 'std']
}).reset_index()

# Flatten multi-index columns
df_grouped.columns = ['label', 'n_kv', 'speed_pp_mean', 'speed_pp_std',
                      'speed_tg_mean', 'speed_tg_std']

# Replace NaN with 0 (std for a single sample is NaN)
df_grouped['speed_pp_std'] = df_grouped['speed_pp_std'].fillna(0)
df_grouped['speed_tg_std'] = df_grouped['speed_tg_std'].fillna(0)

# Prepare ticks values for X axis (prune by halving until readable)
x_ticks = df['n_kv'].unique()
while len(x_ticks) > 16:
    x_ticks = x_ticks[::2]

# Get unique labels and color map (one color per input file)
labels = df_grouped['label'].unique()
colors = plt.cm.rainbow(np.linspace(0, 1, len(labels)))

# Create prompt processing plot
plt.figure(figsize=(10, 6))
ax1 = plt.gca()
plt.grid()
ax1.set_xticks(x_ticks)

# Plot each label's data
for label, color in zip(labels, colors):
    label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
    pp = ax1.errorbar(label_data['n_kv'], label_data['speed_pp_mean'],
                      yerr=label_data['speed_pp_std'], color=color,
                      marker='o', linestyle='-', label=label)

# Add labels and title
ax1.set_xlabel('Context Length (tokens)')
ax1.set_ylabel('Prompt Processing Rate (t/s)')
plt.title('Prompt Processing Performance Comparison')
ax1.legend(loc='upper right')

# Adjust layout and save
plt.tight_layout()
plt.savefig('performance_comparison_pp.png', bbox_inches='tight')
plt.close()

# Create token generation plot
plt.figure(figsize=(10, 6))
ax1 = plt.gca()
plt.grid()
ax1.set_xticks(x_ticks)

# Plot each model's data
for label, color in zip(labels, colors):
    label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
    tg = ax1.errorbar(label_data['n_kv'], label_data['speed_tg_mean'],
                      yerr=label_data['speed_tg_std'], color=color,
                      marker='s', linestyle='-', label=label)

# Add labels and title
ax1.set_xlabel('Context Length (n_kv)')
ax1.set_ylabel('Token Generation Rate (t/s)')
plt.title('Token Generation Performance Comparison')
ax1.legend(loc='upper right')

# Adjust layout and save
plt.tight_layout()
plt.savefig('performance_comparison_tg.png', bbox_inches='tight')
plt.close()

View File

@@ -0,0 +1,189 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "llama-vocab.h"
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <string>
#include <vector>
// Print a short usage example; called when command-line parsing fails.
static void print_usage(int, char ** argv) {
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n %s -m model.gguf -c 8192 -b 2048 -ub 512\n", argv[0]);
    LOG_TEE("\n");
}
// Sweep benchmark: for each ubatch-sized window across the whole context,
// measure token-generation (TG) throughput, then prompt-processing (PP)
// throughput, and print one Markdown-table row (or JSONL record) per window.
int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv);
        return 1;
    }

    // init LLM
    llama_backend_init();
    llama_numa_init(params.numa);

    // initialize the model
    llama_model_params model_params = llama_model_params_from_gpt_params(params);

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    const unsigned int n_kv_max = llama_n_ctx(ctx); // context size = sweep range

    const llama_vocab * vocab = llama_get_vocab(ctx);
    llama_token bos = llama_token_bos_impl(*vocab);
    //llama_token eos = llama_token_eos_impl(*vocab);

    const unsigned int n_vocab = llama_n_vocab(model);

    // decode in batches of ctx_params.n_batch tokens; a view avoids copying
    // the batch data for each sub-range
    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

            llama_batch batch_view = {
                n_tokens,
                batch.token    + i,
                nullptr,
                batch.pos      + i,
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
            };

            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                return false;
            }

            // wait for the device so the timing below reflects finished work
            llama_synchronize(ctx);
        }

        return true;
    };

    const unsigned int pp = params.n_ubatch;
    // only generate ubatch/4 tokens per window to keep the run time down
    const unsigned int tg = params.n_ubatch / 4;

    if (!params.sweep_bench_output_jsonl) {
        // print the Markdown table header once up front
        LOG_TEE("\n");
        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
        LOG_TEE("\n");
        LOG_TEE("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s");
        LOG_TEE("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------");
    }

    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

    // warm up: decode a single BOS token so one-time init cost is not measured
    {
        llama_batch_add(batch, bos, 0, { 0 }, false);

        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }

    llama_batch_clear(batch);
    llama_kv_cache_clear(ctx);

    // sweep: at each step the KV cache already holds n_kv tokens from the
    // previous PP pass
    for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
        // clean up KV cache before generation
        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);

        // first measure token generation performance at this context size
        const auto t_tg_start = ggml_time_us();

        for (unsigned int i = 0; i < tg; ++i) {
            // one random token per decode, logits requested (generation-style)
            llama_batch_clear(batch);
            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);

            if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                LOG_TEE("%s: llama_decode() failed\n", __func__);
                return 1;
            }
        }

        const auto t_tg_end = ggml_time_us();

        // clean up KV cache after generation (discard the generated tokens)
        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);

        // prepare batch of pp size for prompt processing performance measurement
        llama_batch_clear(batch);

        for (unsigned int i = 0; i < pp; ++i) {
            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
        }
        // request logits only for the final token of the window
        batch.logits[batch.n_tokens - 1] = true;

        // measure prompt processing performance
        const auto t_pp_start = ggml_time_us();

        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }

        const auto t_pp_end = ggml_time_us();

        // calculate and print metrics
        const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
        const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;

        const float speed_pp = pp / t_pp;
        const float speed_tg = tg / t_tg;

        if(params.sweep_bench_output_jsonl) {
            LOG_TEE(
                "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
                "\"pp\": %d, \"tg\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f }\n",
                n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
                pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg
            );
        } else {
            LOG_TEE("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
        }
    }

    llama_batch_free(batch);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}