Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-01-26 17:20:01 +00:00)
Better PP performance with split mode "graph" and 3+ GPUs (#1069)
* This should do the trick for PP
* Command line option to set max. extra VRAM that the scheduler can use
* Fix bug and cleanup
* Looks like with this change it is working with tensor overrides
* Nah, it is not working
* OK, this seems to be working
* Disable split scheduling with tensor overrides

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
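The new knob can also be set through the C API rather than the command line. What follows is a minimal sketch, not code from this commit: it assumes the standard llama.h loading calls that ik_llama.cpp inherits from upstream llama.cpp, takes the cparams.max_extra_alloc field name from the llama_context_params_from_gpt_params hunk below, and guesses LLAMA_SPLIT_MODE_GRAPH as the enum value behind split mode "graph".

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 999;                    // offload all layers
    mparams.split_mode   = LLAMA_SPLIT_MODE_GRAPH; // hypothetical enum name for split mode "graph"

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) return 1;

    llama_context_params cparams = llama_context_default_params();
    cparams.max_extra_alloc = 512; // extra VRAM (MiB) per GPU the scheduler may allocate

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) { llama_free_model(model); return 1; }

    // ... evaluate prompts / generate as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}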
@@ -757,6 +757,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.defrag_thold = std::stof(argv[i]);
         return true;
     }
+    if (arg == "--max-extra-alloc" || arg == "-mea") {
+        CHECK_ARG
+        params.max_extra_alloc_MiB = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--samplers") {
         CHECK_ARG
         const auto sampler_names = string_split(argv[i], ";");
@@ -2218,6 +2223,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
     options.push_back({ "parallel" });
     options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
+    options.push_back({ "*", "-mea, --max-extra-alloc", "Max extra VRAM allocation per GPU (default: %d)", params.max_extra_alloc_MiB});
     options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
     options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
     options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
@@ -3109,7 +3115,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.min_experts = params.min_experts;
     cparams.thresh_experts = params.thresh_experts;
     cparams.only_active_experts = params.only_active_exps;
     cparams.k_cache_hadamard = params.k_cache_hadamard;
+    cparams.max_extra_alloc = params.max_extra_alloc_MiB;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -4049,6 +4055,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "use_thp: %s # default: false\n", params.use_thp ? "true" : "false");
     fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false");
     fprintf(stream, "merge_qkv: %s # default: false\n", params.merge_qkv ? "true" : "false");
+    fprintf(stream, "max_extra_alloc: %d # default: 256\n", params.max_extra_alloc_MiB);
     fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
@@ -167,6 +167,7 @@ struct gpt_params {
     float   yarn_beta_slow = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;     // YaRN original context length
     float   defrag_thold   = -1.0f; // KV cache defragmentation threshold
+    int32_t max_extra_alloc_MiB = 256; // additional VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
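Taken together, the hunks wire the option end to end: argument parsing, the usage line, the plumbing into llama_context_params, the YAML dump, and the gpt_params field with its 256 MiB default. As a hypothetical invocation (binary name and other flags assumed, and assuming the split-mode flag accepts a "graph" value as the title suggests), something like `llama-server -m model.gguf -sm graph -mea 512` would let the scheduler allocate up to 512 MiB of extra VRAM per GPU; per the commit message, split scheduling is disabled when tensor overrides are in use.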