Enable MLA-3 in crippled GGUFs: WIP

This commit is contained in:
Iwan Kawrakow
2025-05-11 14:18:24 +03:00
parent 504fb890d9
commit 8ee5008f7e
3 changed files with 6 additions and 2 deletions

View File

@@ -2319,6 +2319,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.mla = params.mla_attn;
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;

View File

@@ -325,6 +325,7 @@ extern "C" {
struct llama_model_params {
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t mla; // MLA implementation to use (only applicable to DeepSeek models at this point)
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// main_gpu interpretation depends on split_mode:

View File

@@ -6761,6 +6761,7 @@ static bool llm_load_tensors(
llama_model_loader & ml,
llama_model & model,
int n_gpu_layers,
int mla_attn,
enum llama_split_mode split_mode,
int main_gpu,
const float * tensor_split,
@@ -8997,7 +8998,7 @@ static bool llm_load_tensors(
}
}
if (model.arch == LLM_ARCH_DEEPSEEK2) {
if (model.arch == LLM_ARCH_DEEPSEEK2 && mla_attn > 0) {
int n_to_compute = 0;
for (auto& l : model.layers) {
if (!l.wk_b) ++n_to_compute;
@@ -9252,7 +9253,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
#endif
if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data
)) {
return -2;
@@ -19921,6 +19922,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
struct llama_model_params llama_model_default_params() {
struct llama_model_params result = {
/*.n_gpu_layers =*/ 0,
/*.mla =*/ 0,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,