Port cpu moe options from mainline (#672)

* Port cpu moe options from mainline

* Use strdup and int32_t to follow coding guidelines
Author: Parsa
Date:   2025-08-08 04:38:18 -07:00
Committed by: GitHub
Parent: dc1746338c
Commit: 6bda22a4d6

@@ -1080,6 +1080,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--cpu-moe" || arg == "-cmoe") {
+        params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
+        return true;
+    }
+    if (arg == "--n-cpu-moe" || arg == "-ncmoe") {
+        CHECK_ARG
+        int32_t n_layers = std::stoi(argv[i]);
+        if (n_layers < 0) {
+            fprintf(stderr, "error: Invalid value for --n-cpu-moe: %d (must be >= 0)\n", n_layers);
+            invalid_param = true;
+            return true;
+        }
+        for (int32_t l = 0; l < n_layers; ++l) {
+            std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)";
+            params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
+        }
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
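
Both new arguments work by pushing {pattern, buffer type} pairs onto params.tensor_buft_overrides, where the pattern is a regular expression matched against tensor names and the buffer type is ggml_backend_cpu_buffer_type(), so matching tensors are allocated in host memory regardless of GPU offload settings. The strdup calls are needed because the override stores a raw C-string pointer; in the loop, pattern is a local std::string whose c_str() buffer would dangle once the handler returns. Below is a minimal standalone sketch of the matching behaviour, an illustration only, assuming the loader applies overrides with std::regex_search on full tensor names as mainline llama.cpp does:

// Standalone sketch: which tensor names do the override patterns select?
// Assumption: overrides are applied with std::regex_search on the full
// tensor name (as in mainline); this is NOT the loader's actual code.
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    // Pattern added by --cpu-moe / -cmoe: expert FFN tensors in every layer.
    const std::regex all_layers("\\.ffn_(up|down|gate)_exps");
    // Pattern added for layer 0 by --n-cpu-moe 1 / -ncmoe 1.
    const std::regex layer_0("blk\\.0\\.(ffn_(up|down|gate)_exps)");

    const std::vector<std::string> names = {
        "blk.0.ffn_up_exps.weight",    // MoE expert tensor, layer 0
        "blk.7.ffn_gate_exps.weight",  // MoE expert tensor, layer 7
        "blk.0.ffn_up.weight",         // dense FFN tensor -> no match
        "blk.0.attn_q.weight",         // attention tensor -> no match
    };
    for (const std::string & n : names) {
        std::printf("%-26s  --cpu-moe: %d  --n-cpu-moe 1: %d\n", n.c_str(),
                    std::regex_search(n, all_layers) ? 1 : 0,
                    std::regex_search(n, layer_0) ? 1 : 0);
    }
    return 0;
}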
@@ -1794,6 +1812,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         options.push_back({ "*",           "       --no-mmap",              "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
     }
     options.push_back({ "*",           "       --run-time-repack",      "repack tensors if interleaved variant is available"});
+    options.push_back({ "*",           "       --cpu-moe",              "keep all MoE weights in CPU memory"});
+    options.push_back({ "*",           "       --n-cpu-moe N",          "keep MoE weights of the first N layers in CPU memory"});
     options.push_back({ "*",           "       --numa TYPE",            "attempt optimizations that help on some NUMA systems\n"
                                                                         " - distribute: spread execution evenly over all nodes\n"
                                                                         " - isolate: only spawn threads on CPUs on the node that execution started on\n"