mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-29 19:01:47 +00:00
Make MLA optional
This commit is contained in:
@@ -813,6 +813,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||||||
params.flash_attn = true;
|
params.flash_attn = true;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (arg == "-mla" || arg == "--mla-use") {
|
||||||
|
params.mla_attn = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (arg == "-co" || arg == "--color") {
|
if (arg == "-co" || arg == "--color") {
|
||||||
params.use_color = true;
|
params.use_color = true;
|
||||||
return true;
|
return true;
|
||||||
@@ -1452,6 +1456,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||||||
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
|
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
|
||||||
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
|
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
|
||||||
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
|
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
|
||||||
|
options.push_back({ "*", "-mla, --mla-use", "enable MLA (default: %s)", params.mla_attn ? "enabled" : "disabled" });
|
||||||
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
|
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
|
||||||
"in conversation mode, this will be used as system prompt\n"
|
"in conversation mode, this will be used as system prompt\n"
|
||||||
"(default: '%s')", params.prompt.c_str() });
|
"(default: '%s')", params.prompt.c_str() });
|
||||||
@@ -2283,6 +2288,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|||||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||||
cparams.offload_kqv = !params.no_kv_offload;
|
cparams.offload_kqv = !params.no_kv_offload;
|
||||||
cparams.flash_attn = params.flash_attn;
|
cparams.flash_attn = params.flash_attn;
|
||||||
|
cparams.mla_attn = params.mla_attn;
|
||||||
|
|
||||||
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
||||||
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
||||||
@@ -3280,6 +3286,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||||
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
||||||
|
fprintf(stream, "mla_attn: %s # default: false\n", params.mla_attn ? "true" : "false");
|
||||||
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||||
|
|
||||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
|
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
|
||||||
|
|||||||
@@ -174,6 +174,7 @@ struct gpt_params {
|
|||||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||||
bool flash_attn = false; // flash attention
|
bool flash_attn = false; // flash attention
|
||||||
|
bool mla_attn = false; // MLA
|
||||||
|
|
||||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
bool ignore_eos = false; // ignore generated EOS tokens
|
bool ignore_eos = false; // ignore generated EOS tokens
|
||||||
|
|||||||
@@ -232,6 +232,7 @@ struct cmd_params {
|
|||||||
std::vector<int> main_gpu;
|
std::vector<int> main_gpu;
|
||||||
std::vector<bool> no_kv_offload;
|
std::vector<bool> no_kv_offload;
|
||||||
std::vector<bool> flash_attn;
|
std::vector<bool> flash_attn;
|
||||||
|
std::vector<bool> mla_attn;
|
||||||
std::vector<std::vector<float>> tensor_split;
|
std::vector<std::vector<float>> tensor_split;
|
||||||
std::vector<bool> use_mmap;
|
std::vector<bool> use_mmap;
|
||||||
std::vector<bool> embeddings;
|
std::vector<bool> embeddings;
|
||||||
@@ -261,6 +262,7 @@ static const cmd_params cmd_params_defaults = {
|
|||||||
/* main_gpu */ {0},
|
/* main_gpu */ {0},
|
||||||
/* no_kv_offload */ {false},
|
/* no_kv_offload */ {false},
|
||||||
/* flash_attn */ {false},
|
/* flash_attn */ {false},
|
||||||
|
/* mla_attn */ {false},
|
||||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||||
/* use_mmap */ {true},
|
/* use_mmap */ {true},
|
||||||
/* embeddings */ {false},
|
/* embeddings */ {false},
|
||||||
@@ -294,6 +296,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||||
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||||
|
printf(" -mla, --mla-attn <0|1> (default: %s)\n", join(cmd_params_defaults.mla_attn, ",").c_str());
|
||||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||||
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
||||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||||
@@ -526,6 +529,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
auto p = string_split<bool>(argv[i], split_delim);
|
auto p = string_split<bool>(argv[i], split_delim);
|
||||||
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
|
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
|
||||||
|
} else if (arg == "-mla" || arg == "--mla-attn") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = string_split<bool>(argv[i], split_delim);
|
||||||
|
params.mla_attn.insert(params.mla_attn.end(), p.begin(), p.end());
|
||||||
} else if (arg == "-mmp" || arg == "--mmap") {
|
} else if (arg == "-mmp" || arg == "--mmap") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
@@ -621,6 +631,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
||||||
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
||||||
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
|
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
|
||||||
|
if (params.mla_attn.empty()) { params.mla_attn = cmd_params_defaults.mla_attn; }
|
||||||
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
||||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||||
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
||||||
@@ -656,6 +667,7 @@ struct cmd_params_instance {
|
|||||||
int main_gpu;
|
int main_gpu;
|
||||||
bool no_kv_offload;
|
bool no_kv_offload;
|
||||||
bool flash_attn;
|
bool flash_attn;
|
||||||
|
bool mla_attn;
|
||||||
std::vector<float> tensor_split;
|
std::vector<float> tensor_split;
|
||||||
bool use_mmap;
|
bool use_mmap;
|
||||||
bool embeddings;
|
bool embeddings;
|
||||||
@@ -698,6 +710,7 @@ struct cmd_params_instance {
|
|||||||
cparams.type_v = type_v;
|
cparams.type_v = type_v;
|
||||||
cparams.offload_kqv = !no_kv_offload;
|
cparams.offload_kqv = !no_kv_offload;
|
||||||
cparams.flash_attn = flash_attn;
|
cparams.flash_attn = flash_attn;
|
||||||
|
cparams.mla_attn = mla_attn;
|
||||||
cparams.embeddings = embeddings;
|
cparams.embeddings = embeddings;
|
||||||
|
|
||||||
return cparams;
|
return cparams;
|
||||||
@@ -722,6 +735,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
for (const auto & tv : params.type_v)
|
for (const auto & tv : params.type_v)
|
||||||
for (const auto & nkvo : params.no_kv_offload)
|
for (const auto & nkvo : params.no_kv_offload)
|
||||||
for (const auto & fa : params.flash_attn)
|
for (const auto & fa : params.flash_attn)
|
||||||
|
for (const auto & mla : params.mla_attn)
|
||||||
for (const auto & nt : params.n_threads) {
|
for (const auto & nt : params.n_threads) {
|
||||||
for (const auto & n_prompt : params.n_prompt) {
|
for (const auto & n_prompt : params.n_prompt) {
|
||||||
if (n_prompt == 0) {
|
if (n_prompt == 0) {
|
||||||
@@ -743,6 +757,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
/* .no_kv_offload= */ nkvo,
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .flash_attn = */ fa,
|
/* .flash_attn = */ fa,
|
||||||
|
/* .mla_attn = */ mla,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
/* .use_mmap = */ mmp,
|
/* .use_mmap = */ mmp,
|
||||||
/* .embeddings = */ embd,
|
/* .embeddings = */ embd,
|
||||||
@@ -771,6 +786,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
/* .no_kv_offload= */ nkvo,
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .flash_attn = */ fa,
|
/* .flash_attn = */ fa,
|
||||||
|
/* .mla_attn = */ mla,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
/* .use_mmap = */ mmp,
|
/* .use_mmap = */ mmp,
|
||||||
/* .embeddings = */ embd,
|
/* .embeddings = */ embd,
|
||||||
@@ -799,6 +815,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
/* .no_kv_offload= */ nkvo,
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .flash_attn = */ fa,
|
/* .flash_attn = */ fa,
|
||||||
|
/* .mla_attn = */ mla,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
/* .use_mmap = */ mmp,
|
/* .use_mmap = */ mmp,
|
||||||
/* .embeddings = */ embd,
|
/* .embeddings = */ embd,
|
||||||
@@ -827,6 +844,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
/* .no_kv_offload= */ nkvo,
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .flash_attn = */ fa,
|
/* .flash_attn = */ fa,
|
||||||
|
/* .mla_attn = */ mla,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
/* .use_mmap = */ mmp,
|
/* .use_mmap = */ mmp,
|
||||||
/* .embeddings = */ embd,
|
/* .embeddings = */ embd,
|
||||||
@@ -866,6 +884,7 @@ struct test {
|
|||||||
int main_gpu;
|
int main_gpu;
|
||||||
bool no_kv_offload;
|
bool no_kv_offload;
|
||||||
bool flash_attn;
|
bool flash_attn;
|
||||||
|
bool mla_attn;
|
||||||
std::vector<float> tensor_split;
|
std::vector<float> tensor_split;
|
||||||
bool use_mmap;
|
bool use_mmap;
|
||||||
bool embeddings;
|
bool embeddings;
|
||||||
@@ -895,6 +914,7 @@ struct test {
|
|||||||
main_gpu = inst.main_gpu;
|
main_gpu = inst.main_gpu;
|
||||||
no_kv_offload = inst.no_kv_offload;
|
no_kv_offload = inst.no_kv_offload;
|
||||||
flash_attn = inst.flash_attn;
|
flash_attn = inst.flash_attn;
|
||||||
|
mla_attn = inst.mla_attn;
|
||||||
tensor_split = inst.tensor_split;
|
tensor_split = inst.tensor_split;
|
||||||
use_mmap = inst.use_mmap;
|
use_mmap = inst.use_mmap;
|
||||||
embeddings = inst.embeddings;
|
embeddings = inst.embeddings;
|
||||||
@@ -988,7 +1008,7 @@ struct test {
|
|||||||
"n_batch", "n_ubatch",
|
"n_batch", "n_ubatch",
|
||||||
"n_threads", "type_k", "type_v",
|
"n_threads", "type_k", "type_v",
|
||||||
"n_gpu_layers", "split_mode",
|
"n_gpu_layers", "split_mode",
|
||||||
"main_gpu", "no_kv_offload", "flash_attn",
|
"main_gpu", "no_kv_offload", "flash_attn", "mla_attn",
|
||||||
"tensor_split", "use_mmap", "embeddings", "repack",
|
"tensor_split", "use_mmap", "embeddings", "repack",
|
||||||
"n_prompt", "n_gen", "test_time",
|
"n_prompt", "n_gen", "test_time",
|
||||||
"avg_ns", "stddev_ns",
|
"avg_ns", "stddev_ns",
|
||||||
@@ -1010,7 +1030,7 @@ struct test {
|
|||||||
}
|
}
|
||||||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack") {
|
field == "flash_attn" || field == "mla_attn" || field == "use_mmap" || field == "embeddings" || field == "repack") {
|
||||||
return BOOL;
|
return BOOL;
|
||||||
}
|
}
|
||||||
if (field == "avg_ts" || field == "stddev_ts") {
|
if (field == "avg_ts" || field == "stddev_ts") {
|
||||||
@@ -1044,7 +1064,7 @@ struct test {
|
|||||||
std::to_string(n_batch), std::to_string(n_ubatch),
|
std::to_string(n_batch), std::to_string(n_ubatch),
|
||||||
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||||
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), std::to_string(mla_attn),
|
||||||
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), std::to_string(repack),
|
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), std::to_string(repack),
|
||||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
||||||
@@ -1208,6 +1228,9 @@ struct markdown_printer : public printer {
|
|||||||
if (field == "flash_attn") {
|
if (field == "flash_attn") {
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
if (field == "mla_attn") {
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
if (field == "use_mmap") {
|
if (field == "use_mmap") {
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
@@ -1242,6 +1265,9 @@ struct markdown_printer : public printer {
|
|||||||
if (field == "flash_attn") {
|
if (field == "flash_attn") {
|
||||||
return "fa";
|
return "fa";
|
||||||
}
|
}
|
||||||
|
if (field == "mla_attn") {
|
||||||
|
return "mla";
|
||||||
|
}
|
||||||
if (field == "use_mmap") {
|
if (field == "use_mmap") {
|
||||||
return "mmap";
|
return "mmap";
|
||||||
}
|
}
|
||||||
@@ -1294,6 +1320,9 @@ struct markdown_printer : public printer {
|
|||||||
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
|
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
|
||||||
fields.emplace_back("flash_attn");
|
fields.emplace_back("flash_attn");
|
||||||
}
|
}
|
||||||
|
if (params.mla_attn.size() > 1 || params.mla_attn != cmd_params_defaults.mla_attn) {
|
||||||
|
fields.emplace_back("mla_attn");
|
||||||
|
}
|
||||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||||
fields.emplace_back("tensor_split");
|
fields.emplace_back("tensor_split");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -374,6 +374,7 @@ extern "C" {
|
|||||||
bool embeddings; // if true, extract embeddings (together with logits)
|
bool embeddings; // if true, extract embeddings (together with logits)
|
||||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||||
|
bool mla_attn; // whether to use MLA attention [EXPERIMENTAL]
|
||||||
|
|
||||||
// Abort callback
|
// Abort callback
|
||||||
// if it returns true, execution of llama_decode() will be aborted
|
// if it returns true, execution of llama_decode() will be aborted
|
||||||
|
|||||||
301
src/llama.cpp
301
src/llama.cpp
@@ -2507,6 +2507,7 @@ struct llama_cparams {
|
|||||||
bool causal_attn;
|
bool causal_attn;
|
||||||
bool offload_kqv;
|
bool offload_kqv;
|
||||||
bool flash_attn;
|
bool flash_attn;
|
||||||
|
bool mla_attn;
|
||||||
|
|
||||||
enum llama_pooling_type pooling_type;
|
enum llama_pooling_type pooling_type;
|
||||||
|
|
||||||
@@ -7678,8 +7679,8 @@ static bool llm_load_tensors(
|
|||||||
|
|
||||||
layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
|
layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
|
||||||
layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
|
layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
|
||||||
layer.wk_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0);
|
layer.wk_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 1);
|
||||||
layer.wv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v}, 0);
|
layer.wv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v}, 1);
|
||||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
|
||||||
|
|
||||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||||
@@ -8851,6 +8852,7 @@ struct llm_build_context {
|
|||||||
const int32_t n_ctx_orig;
|
const int32_t n_ctx_orig;
|
||||||
|
|
||||||
const bool flash_attn;
|
const bool flash_attn;
|
||||||
|
const bool mla_attn;
|
||||||
|
|
||||||
const enum llama_pooling_type pooling_type;
|
const enum llama_pooling_type pooling_type;
|
||||||
const enum llama_rope_type rope_type;
|
const enum llama_rope_type rope_type;
|
||||||
@@ -8900,6 +8902,7 @@ struct llm_build_context {
|
|||||||
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
||||||
n_ctx_orig (cparams.n_ctx_orig_yarn),
|
n_ctx_orig (cparams.n_ctx_orig_yarn),
|
||||||
flash_attn (cparams.flash_attn),
|
flash_attn (cparams.flash_attn),
|
||||||
|
mla_attn (cparams.mla_attn),
|
||||||
pooling_type (cparams.pooling_type),
|
pooling_type (cparams.pooling_type),
|
||||||
rope_type (hparams.rope_type),
|
rope_type (hparams.rope_type),
|
||||||
cb (cb),
|
cb (cb),
|
||||||
@@ -13419,130 +13422,203 @@ struct llm_build_context {
|
|||||||
0);
|
0);
|
||||||
cb(kv_compressed, "kv_compressed", il);
|
cb(kv_compressed, "kv_compressed", il);
|
||||||
|
|
||||||
// and {n_embd_head_qk_rope, n_tokens}
|
if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) {
|
||||||
struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
|
||||||
kv_pe_compresseed->nb[1],
|
|
||||||
kv_pe_compresseed->nb[1],
|
|
||||||
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
|
||||||
cb(k_pe, "k_pe", il);
|
|
||||||
|
|
||||||
//kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
|
// and {n_embd_head_qk_rope, n_tokens}
|
||||||
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
|
struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
||||||
model.layers[il].attn_kv_a_norm, NULL,
|
kv_pe_compresseed->nb[1],
|
||||||
LLM_NORM_RMS, cb, il);
|
kv_pe_compresseed->nb[1],
|
||||||
cb(kv_compressed, "kv_compressed", il);
|
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
||||||
|
cb(k_pe, "k_pe", il);
|
||||||
|
|
||||||
struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head);
|
//kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
|
||||||
cb(kv_cache_view, "kv_cache_view", il);
|
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
|
||||||
|
model.layers[il].attn_kv_a_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(kv_compressed, "kv_compressed", il);
|
||||||
|
|
||||||
// note: storing c^KV in the KV cache
|
struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head);
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, kv_compressed, kv_cache_view));
|
cb(kv_cache_view, "kv_cache_view", il);
|
||||||
|
|
||||||
struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head));
|
// note: storing c^KV in the KV cache
|
||||||
cb(kv_cache_trans_view, "kv_cache_trans_view", il);
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, kv_compressed, kv_cache_view));
|
||||||
|
|
||||||
// note: storing transposed c^KV in the transposed KV cache
|
struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head));
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_transpose(ctx0, kv_compressed), kv_cache_trans_view));
|
cb(kv_cache_trans_view, "kv_cache_trans_view", il);
|
||||||
|
|
||||||
struct ggml_tensor * kv_cache =
|
// note: storing transposed c^KV in the transposed KV cache
|
||||||
ggml_view_2d(ctx0, kv_self.kv_l[il],
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_transpose(ctx0, kv_compressed), kv_cache_trans_view));
|
||||||
kv_lora_rank, n_kv,
|
|
||||||
ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank),
|
struct ggml_tensor * kv_cache =
|
||||||
|
ggml_view_2d(ctx0, kv_self.kv_l[il],
|
||||||
|
kv_lora_rank, n_kv,
|
||||||
|
ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank),
|
||||||
|
0);
|
||||||
|
cb(kv_cache, "kv_cache", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * kv_cache_trans =
|
||||||
|
ggml_view_2d(ctx0, kv_self.kvt_l[il],
|
||||||
|
n_kv, kv_lora_rank,
|
||||||
|
ggml_row_size(kv_self.kv_l[il]->type, kv_self.size),
|
||||||
|
0);
|
||||||
|
cb(kv_cache_trans, "kv_cache_trans", il);
|
||||||
|
|
||||||
|
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
||||||
|
q_pe = ggml_rope_ext(
|
||||||
|
ctx0, q_pe, inp_pos, nullptr,
|
||||||
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
cb(q_pe, "q_pe", il);
|
||||||
|
|
||||||
|
// shared RoPE key
|
||||||
|
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
||||||
|
k_pe = ggml_rope_ext(
|
||||||
|
ctx0, k_pe, inp_pos, nullptr,
|
||||||
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
cb(k_pe, "k_pe", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head);
|
||||||
|
cb(kr_cache_view, "kr_cache_view", il);
|
||||||
|
|
||||||
|
// note: storing RoPE-ed version of K^R in the KV cache
|
||||||
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_pe, kr_cache_view));
|
||||||
|
|
||||||
|
struct ggml_tensor * kr_cache =
|
||||||
|
ggml_view_2d(ctx0, kv_self.kr_l[il],
|
||||||
|
n_embd_head_qk_rope, n_kv,
|
||||||
|
ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope),
|
||||||
|
0);
|
||||||
|
cb(kr_cache, "kr_cache", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), 0);
|
||||||
|
cb(wk_b, "wk_b", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * q_nope_perm = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
|
||||||
|
cb(q_nope_perm, "q_nope_perm", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope_perm);
|
||||||
|
cb(q_nope2, "q_nope2", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * q_nope2_perm = ggml_permute(ctx0, q_nope2, 0, 2, 1, 3);
|
||||||
|
cb(q_nope2_perm, "q_nope2_perm", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm);
|
||||||
|
cb(kq_nope, "kq_nope", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
|
||||||
|
cb(q_pe_perm, "q_pe_perm", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe);
|
||||||
|
cb(kq_pe, "kq_pe", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe);
|
||||||
|
cb(kq, "kq", il);
|
||||||
|
|
||||||
|
kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3));
|
||||||
|
cb(kq, "kq_perm", il);
|
||||||
|
|
||||||
|
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias);
|
||||||
|
cb(kq, "kq_soft_max_ext", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * kq_perm = ggml_permute(ctx0, kq, 0, 2, 1, 3);
|
||||||
|
cb(kq_perm, "kq_soft_max_ext_perm", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq_perm);
|
||||||
|
cb(kqv_compressed, "kqv_compressed", il);
|
||||||
|
|
||||||
|
kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 1, 3);
|
||||||
|
cb(kqv_compressed, "kqv_compressed_perm", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0);
|
||||||
|
cb(wv_b, "wv_b", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed);
|
||||||
|
cb(kqv, "kqv", il);
|
||||||
|
|
||||||
|
kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3));
|
||||||
|
cb(kqv, "kqv_perm", il);
|
||||||
|
|
||||||
|
cur = ggml_view_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens, ggml_row_size(kqv->type, n_embd_head_v*n_head), 0);
|
||||||
|
cb(cur, "kqv_2d", il);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
|
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
|
||||||
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
|
||||||
|
// and {n_embd_head_qk_rope, n_tokens}
|
||||||
|
struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
||||||
|
kv_pe_compresseed->nb[1],
|
||||||
|
kv_pe_compresseed->nb[1],
|
||||||
|
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
||||||
|
cb(k_pe, "k_pe", il);
|
||||||
|
|
||||||
|
//kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
|
||||||
|
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
|
||||||
|
model.layers[il].attn_kv_a_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(kv_compressed, "kv_compressed", il);
|
||||||
|
|
||||||
|
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
|
||||||
|
struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
|
||||||
|
cb(kv, "kv", il);
|
||||||
|
|
||||||
|
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
||||||
|
struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
||||||
|
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
||||||
|
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
||||||
0);
|
0);
|
||||||
cb(kv_cache, "kv_cache", il);
|
cb(k_nope, "k_nope", il);
|
||||||
|
|
||||||
struct ggml_tensor * kv_cache_trans =
|
// and {n_head * n_embd_head_v, n_tokens}
|
||||||
ggml_view_2d(ctx0, kv_self.kvt_l[il],
|
struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
||||||
n_kv, kv_lora_rank,
|
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
||||||
ggml_row_size(kv_self.kv_l[il]->type, kv_self.size),
|
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
||||||
|
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
||||||
|
cb(v_states, "v_states", il);
|
||||||
|
|
||||||
|
v_states = ggml_cont(ctx0, v_states);
|
||||||
|
cb(v_states, "v_states", il);
|
||||||
|
|
||||||
|
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
|
||||||
|
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
|
||||||
0);
|
0);
|
||||||
cb(kv_cache_trans, "kv_cache_trans", il);
|
cb(v_states, "v_states", il);
|
||||||
|
|
||||||
//q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
//q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
||||||
q_pe = ggml_rope_ext(
|
q_pe = ggml_rope_ext(
|
||||||
ctx0, q_pe, inp_pos, nullptr,
|
ctx0, q_pe, inp_pos, nullptr,
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(q_pe, "q_pe", il);
|
cb(q_pe, "q_pe", il);
|
||||||
|
|
||||||
// shared RoPE key
|
// shared RoPE key
|
||||||
//k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
//k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
||||||
k_pe = ggml_rope_ext(
|
k_pe = ggml_rope_ext(
|
||||||
ctx0, k_pe, inp_pos, nullptr,
|
ctx0, k_pe, inp_pos, nullptr,
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(k_pe, "k_pe", il);
|
cb(k_pe, "k_pe", il);
|
||||||
|
|
||||||
struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head);
|
struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
|
||||||
cb(kr_cache_view, "kr_cache_view", il);
|
cb(q_states, "q_states", il);
|
||||||
|
|
||||||
// note: storing RoPE-ed version of K^R in the KV cache
|
struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_pe, kr_cache_view));
|
cb(k_states, "k_states", il);
|
||||||
|
|
||||||
struct ggml_tensor * kr_cache =
|
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||||
ggml_view_2d(ctx0, kv_self.kr_l[il],
|
model.layers[il].wo, NULL,
|
||||||
n_embd_head_qk_rope, n_kv,
|
k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
||||||
ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope),
|
|
||||||
0);
|
|
||||||
cb(kr_cache, "kr_cache", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), 0);
|
}
|
||||||
cb(wk_b, "wk_b", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * q_nope_perm = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
|
|
||||||
cb(q_nope_perm, "q_nope_perm", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope_perm);
|
|
||||||
cb(q_nope2, "q_nope2", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * q_nope2_perm = ggml_permute(ctx0, q_nope2, 0, 2, 1, 3);
|
|
||||||
cb(q_nope2_perm, "q_nope2_perm", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm);
|
|
||||||
cb(kq_nope, "kq_nope", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
|
|
||||||
cb(q_pe_perm, "q_pe_perm", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe);
|
|
||||||
cb(kq_pe, "kq_pe", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe);
|
|
||||||
cb(kq, "kq", il);
|
|
||||||
|
|
||||||
kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3));
|
|
||||||
cb(kq, "kq_perm", il);
|
|
||||||
|
|
||||||
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias);
|
|
||||||
cb(kq, "kq_soft_max_ext", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * kq_perm = ggml_permute(ctx0, kq, 0, 2, 1, 3);
|
|
||||||
cb(kq_perm, "kq_soft_max_ext_perm", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq_perm);
|
|
||||||
cb(kqv_compressed, "kqv_compressed", il);
|
|
||||||
|
|
||||||
kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 1, 3);
|
|
||||||
cb(kqv_compressed, "kqv_compressed_perm", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0);
|
|
||||||
cb(wv_b, "wv_b", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed);
|
|
||||||
cb(kqv, "kqv", il);
|
|
||||||
|
|
||||||
kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3));
|
|
||||||
cb(kqv, "kqv_perm", il);
|
|
||||||
|
|
||||||
cur = ggml_view_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens, ggml_row_size(kqv->type, n_embd_head_v*n_head), 0);
|
|
||||||
cb(cur, "kqv_2d", il);
|
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
|
||||||
|
|
||||||
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
|
|
||||||
cb(cur, "kqv_out", il);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1) {
|
if (il == n_layer - 1) {
|
||||||
@@ -17486,6 +17562,7 @@ struct llama_context_params llama_context_default_params() {
|
|||||||
/*.embeddings =*/ false,
|
/*.embeddings =*/ false,
|
||||||
/*.offload_kqv =*/ true,
|
/*.offload_kqv =*/ true,
|
||||||
/*.flash_attn =*/ false,
|
/*.flash_attn =*/ false,
|
||||||
|
/*.mla_attn =*/ false,
|
||||||
/*.abort_callback =*/ nullptr,
|
/*.abort_callback =*/ nullptr,
|
||||||
/*.abort_callback_data =*/ nullptr,
|
/*.abort_callback_data =*/ nullptr,
|
||||||
};
|
};
|
||||||
@@ -17684,6 +17761,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
cparams.embeddings = params.embeddings;
|
cparams.embeddings = params.embeddings;
|
||||||
cparams.offload_kqv = params.offload_kqv;
|
cparams.offload_kqv = params.offload_kqv;
|
||||||
cparams.flash_attn = params.flash_attn;
|
cparams.flash_attn = params.flash_attn;
|
||||||
|
cparams.mla_attn = params.mla_attn;
|
||||||
cparams.pooling_type = params.pooling_type;
|
cparams.pooling_type = params.pooling_type;
|
||||||
|
|
||||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||||
@@ -17750,6 +17828,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
||||||
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
||||||
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
||||||
|
LLAMA_LOG_INFO("%s: mla_attn = %d\n", __func__, cparams.mla_attn);
|
||||||
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
||||||
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user