From 6bda22a4d693f3458525086d482c7fecb63fd2b2 Mon Sep 17 00:00:00 2001 From: Parsa <61601745+TheLegendOfKitty@users.noreply.github.com> Date: Fri, 8 Aug 2025 04:38:18 -0700 Subject: [PATCH] Port cpu moe options from mainline (#672) * Port cpu moe options from mainline * Use strdup and int32_t to follow coding guidelines --- common/common.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 1801da03..da702368 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1080,6 +1080,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--cpu-moe" || arg == "-cmoe") { + params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()}); + return true; + } + if (arg == "--n-cpu-moe" || arg == "-ncmoe") { + CHECK_ARG + int32_t n_layers = std::stoi(argv[i]); + if (n_layers < 0) { + fprintf(stderr, "error: Invalid value for --n-cpu-moe: %d (must be >= 0)\n", n_layers); + invalid_param = true; + return true; + } + for (int32_t l = 0; l < n_layers; ++l) { + std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)"; + params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()}); + } + return true; + } if (arg == "--no-mmap") { params.use_mmap = false; return true; @@ -1794,6 +1812,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); } options.push_back({ "*", " --run-time-repack", "repack tensors if interleaved variant is available"}); + options.push_back({ "*", " --cpu-moe", "keep all MoE weights in CPU memory"}); + options.push_back({ "*", " --n-cpu-moe N", "keep MoE weights of the first N layers in CPU memory"}); options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" " - distribute: spread execution evenly over all nodes\n" " - isolate: only spawn threads on CPUs on the node that execution started on\n"