From 25cd985c9b238d7399eaa44a75f9e36ca0cef8cc Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Tue, 11 Nov 2025 08:44:59 +0200 Subject: [PATCH] Add --n-cpu-moe to llama_bench (#937) * Add --n-cpu-moe to llama_bench * Add usage --------- Co-authored-by: Iwan Kawrakow --- examples/llama-bench/llama-bench.cpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 0d6c4a48..52b1dbbd 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -283,7 +283,7 @@ static const cmd_params cmd_params_defaults = { /* type_k */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {{cpu_get_num_math(), cpu_get_num_math()}}, - /* n_gpu_layers */ {99}, + /* n_gpu_layers */ {999}, /* rpc_servers */ {""}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, @@ -330,6 +330,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -tgb, --threads-gen-batch (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" --n-cpu-moe (default: none)\n"); printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); @@ -428,6 +429,19 @@ bool parse_buft_overrides(const std::string& value, std::vector& overrides) { + int n_layers = std::stoi(arg); + if (n_layers < 0) { + fprintf(stderr, "error: Invalid value for --n-cpu-moe: %s\n", arg); + return false; + } + for (int32_t l = 0; l < n_layers; ++l) { + std::string pattern = "blk\\." 
+ std::to_string(l) + "\\.(ffn_(up|down|gate)_exps\\.weight)"; + overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()}); + } + return true; +} + template std::vector> string_split_pairs(const std::string & str, char delim) { std::vector> values; @@ -800,6 +814,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } + } else if (arg == "--n-cpu-moe") { + if (++i >= argc) { + invalid_param = true; + break; + } + if (!add_cpu_buft_overrides(argv[i], params.buft_overrides)) { + invalid_param = true; + break; + } } else { invalid_param = true; break;