diff --git a/common/common.cpp b/common/common.cpp
index f0c618e0..dbf4f399 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2319,6 +2319,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    mparams.mla = params.mla_attn;
     mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
diff --git a/include/llama.h b/include/llama.h
index e2901861..dddd61cd 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -325,6 +325,7 @@ extern "C" {

     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t mla;          // MLA implementation to use (only applicable to DeepSeek models at this point)
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs

         // main_gpu interpretation depends on split_mode:
diff --git a/src/llama.cpp b/src/llama.cpp
index d0f76c49..720328fe 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6761,6 +6761,7 @@ static bool llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
         int n_gpu_layers,
+        int mla_attn,
         enum llama_split_mode split_mode,
         int main_gpu,
         const float * tensor_split,
@@ -8997,7 +8998,7 @@ static bool llm_load_tensors(
         }
     }

-    if (model.arch == LLM_ARCH_DEEPSEEK2) {
+    if (model.arch == LLM_ARCH_DEEPSEEK2 && mla_attn > 0) {
         int n_to_compute = 0;
         for (auto& l : model.layers) {
             if (!l.wk_b) ++n_to_compute;
@@ -9252,7 +9253,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 #endif

         if (!llm_load_tensors(
-            ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
+            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
@@ -19921,6 +19922,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_gpu_layers =*/ 0,
+        /*.mla =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
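
For reference, a minimal sketch of how a caller might exercise the new `mla` field through the public API. This assumes a DeepSeek GGUF at a hypothetical path `deepseek.gguf`, and the semantics (0 = off, > 0 selects an MLA implementation) are inferred from the new header comment and the `mla_attn > 0` check in `llm_load_tensors`; it is not taken from the diff itself.

```cpp
// Sketch only: exercises the `mla` model parameter added by this diff.
// "deepseek.gguf" is a hypothetical path; the loader entry points
// (llama_load_model_from_file, llama_free_model) are the ones this
// vintage of llama.h exposes.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99; // offload all layers, as usual
    mparams.mla          = 1;  // opt in to MLA (DeepSeek models only; 0 keeps the default path)

    llama_model * model = llama_load_model_from_file("deepseek.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    // ... create a context and decode as usual ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```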