Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-03-07 20:40:02 +00:00)
Enable MLA-3 in crippled GGUFs: WIP
This commit is contained in:
@@ -2319,6 +2319,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
mparams.mla = params.mla_attn;
|
||||
mparams.rpc_servers = params.rpc_servers.c_str();
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
|
||||
@@ -325,6 +325,7 @@ extern "C" {
|
||||
|
||||
struct llama_model_params {
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
int32_t mla; // MLA implementation to use (only applicable to DeepSeek models at this point)
|
||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||
|
||||
// main_gpu interpretation depends on split_mode:
|
||||
|
||||
@@ -6761,6 +6761,7 @@ static bool llm_load_tensors(
|
||||
llama_model_loader & ml,
|
||||
llama_model & model,
|
||||
int n_gpu_layers,
|
||||
int mla_attn,
|
||||
enum llama_split_mode split_mode,
|
||||
int main_gpu,
|
||||
const float * tensor_split,
|
||||
@@ -8997,7 +8998,7 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK2) {
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK2 && mla_attn > 0) {
|
||||
int n_to_compute = 0;
|
||||
for (auto& l : model.layers) {
|
||||
if (!l.wk_b) ++n_to_compute;
|
||||
@@ -9252,7 +9253,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
#endif
|
||||
|
||||
if (!llm_load_tensors(
|
||||
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
||||
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
||||
params.progress_callback, params.progress_callback_user_data
|
||||
)) {
|
||||
return -2;
|
||||
@@ -19921,6 +19922,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
|
||||
struct llama_model_params llama_model_default_params() {
|
||||
struct llama_model_params result = {
|
||||
/*.n_gpu_layers =*/ 0,
|
||||
/*.mla =*/ 0,
|
||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||
/*.main_gpu =*/ 0,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
|
||||
Reference in New Issue
Block a user