Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-02-09 16:00:12 +00:00)
GPU offload policy (#405)
* Adding GPU offload policy
* Minor

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
@@ -19980,6 +19980,7 @@ struct llama_context_params llama_context_default_params() {
         /*.thresh_experts        =*/ 0.0f,
         /*.abort_callback        =*/ nullptr,
         /*.abort_callback_data   =*/ nullptr,
+        /*.offload_policy        =*/ nullptr,
     };

     return result;
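The only change to the defaults is the new offload_policy field, initialized to nullptr; callers that never set it keep the existing behavior, because the code below only acts when the pointer is non-null. A caller-side sketch of populating it follows the next hunk.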
@@ -20574,6 +20575,19 @@ struct llama_context * llama_new_context_with_model(
             }
         }

+    if (params.offload_policy) {
+        const std::vector<std::pair<int, int>>& policy = *(const std::vector<std::pair<int, int>>*)params.offload_policy;
+        for (auto [op, on_off] : policy) {
+            if (op < 0 || op >= int(GGML_OP_COUNT)) {
+                LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for all ops to %s\n", on_off ? "ON" : "OFF");
+            } else {
+                LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for op %s to %s\n",
+                        ggml_op_name(ggml_op(op)), on_off ? "ON" : "OFF");
+            }
+            ggml_backend_sched_set_op_offload(ctx->sched, ggml_op(op), on_off);
+        }
+    }
+
     return ctx;
 }

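The policy is read once at context creation: params.offload_policy is treated as an opaque pointer to a std::vector of (op, on_off) pairs, the entries are applied in order, and an op index outside [0, GGML_OP_COUNT) is reported as applying to all ops. Below is a minimal caller-side sketch, assuming the public header exposes offload_policy as an opaque (void *) field, which is what the cast above implies; make_ctx_with_policy is a hypothetical helper, not part of the patch.

    #include <utility>
    #include <vector>
    #include "llama.h"   // assumed to declare the new offload_policy field

    // Sketch: build a policy that first turns op offload OFF for all ops
    // (out-of-range op index), then turns it back ON for matrix multiplications.
    // Entries are applied in the order they appear in the vector, and the vector
    // is only read while llama_new_context_with_model() runs.
    static llama_context * make_ctx_with_policy(llama_model * model) {
        std::vector<std::pair<int, int>> policy = {
            { -1,              0 },   // op outside [0, GGML_OP_COUNT): applies to all ops -> OFF
            { GGML_OP_MUL_MAT, 1 },   // then re-enable offload for GGML_OP_MUL_MAT
        };

        llama_context_params cparams = llama_context_default_params();
        cparams.offload_policy = &policy;   // assumed opaque pointer field, per the cast above
        return llama_new_context_with_model(model, cparams);
    }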
@@ -23222,3 +23236,10 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
     fputs(text, stderr);
     fflush(stderr);
 }
+
+void llama_set_offload_policy(struct llama_context * lctx, int op, bool on_or_off) {
+    if (!lctx || !lctx->sched) return;
+    const char * op_name = op < 0 || op >= int(GGML_OP_COUNT) ? "all ops" : ggml_op_name(ggml_op(op));
+    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXX offload(%s) = %d\n", op_name, on_or_off);
+    ggml_backend_sched_set_op_offload(lctx->sched, ggml_op(op), on_or_off);
+}
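The new entry point flips the same switch on an existing context: an op value outside [0, GGML_OP_COUNT) is reported as "all ops" and forwarded unchanged to ggml_backend_sched_set_op_offload(). A short usage sketch, assuming ctx is a live llama_context * (the ops chosen here are only illustrative):

    // Stop offloading matrix multiplications for this context ...
    llama_set_offload_policy(ctx, GGML_OP_MUL_MAT, false);
    // ... or turn offload back on for every op at once.
    llama_set_offload_policy(ctx, -1, true);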