mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-29 19:01:47 +00:00
Port cpu moe options from mainline (#672)
* Port cpu moe options from mainline
* Use strdup and int32_t to follow coding guidelines
This commit is contained in:
@@ -1080,6 +1080,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params) {
         }
         return true;
     }
+    if (arg == "--cpu-moe" || arg == "-cmoe") {
+        params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
+        return true;
+    }
+    if (arg == "--n-cpu-moe" || arg == "-ncmoe") {
+        CHECK_ARG
+        int32_t n_layers = std::stoi(argv[i]);
+        if (n_layers < 0) {
+            fprintf(stderr, "error: Invalid value for --n-cpu-moe: %d (must be >= 0)\n", n_layers);
+            invalid_param = true;
+            return true;
+        }
+        for (int32_t l = 0; l < n_layers; ++l) {
+            std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)";
+            params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
+        }
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
@@ -1794,6 +1812,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     options.push_back({ "*", "       --no-mmap",             "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
     }
     options.push_back({ "*", "       --run-time-repack",     "repack tensors if interleaved variant is available"});
+    options.push_back({ "*", "       --cpu-moe",             "keep all MoE weights in CPU memory"});
+    options.push_back({ "*", "       --n-cpu-moe N",         "keep MoE weights of the first N layers in CPU memory"});
     options.push_back({ "*", "       --numa TYPE",           "attempt optimizations that help on some NUMA systems\n"
                                                              "  - distribute: spread execution evenly over all nodes\n"
                                                              "  - isolate: only spawn threads on CPUs on the node that execution started on\n"
|
|||||||
Reference in New Issue
Block a user