Mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-03-13 07:20:15 +00:00.
bitnet(scale in a separate tensor): mul -> scale on the CPU
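What the diff does: every multiplication by one of the BitNet per-tensor scale tensors (wq_scale, wk_scale, wv_scale, wo_scale, ffn_up_scale, ffn_gate_scale, ffn_down_scale) is routed through a new BITNET_MUL macro, and the element-wise SiLU gate product (ffn_gate_par) goes through it as well. The macro still expands to ggml_mul, so the computed graph is unchanged for now; it introduces a single switch point for later replacing the broadcast multiply with a scalar scale operation on the CPU, as the commit title indicates.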
llama.cpp | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
@@ -11822,11 +11822,13 @@ struct llm_build_context {
                 LLM_NORM_RMS, cb, il);
         cb(cur, "attn_norm", il);
 
+#define BITNET_MUL ggml_mul
+
         // self-attention
         {
             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+            Qcur = BITNET_MUL(ctx0, Qcur, model.layers[il].wq_scale);
             cb(Qcur, "Qcur", il);
             if (model.layers[il].bq) {
                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
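For reference, a minimal standalone sketch (not from this commit) of the operation BITNET_MUL currently expands to: ggml_mul with a single-element scale tensor as the second operand, which ggml broadcasts over the activations. The helper name and shapes below are illustrative only.

    #include "ggml.h"

    /*
     * Illustrative helper (hypothetical, not part of llama.cpp): multiply
     * activations by a single-element scale tensor. ggml_mul broadcasts its
     * second operand over the first when the shapes allow repetition, so a
     * [1]-shaped F32 scale behaves like a scalar multiply.
     */
    static struct ggml_tensor * apply_tensor_scale(
            struct ggml_context * ctx,
            struct ggml_tensor  * act,      /* e.g. [n_embd, n_tokens] */
            struct ggml_tensor  * scale) {  /* [1], F32 */
        return ggml_mul(ctx, act, scale);
    }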
@@ -11835,7 +11837,7 @@ struct llm_build_context {
 
             // B1.K
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+            Kcur = BITNET_MUL(ctx0, Kcur, model.layers[il].wk_scale);
             cb(Kcur, "Kcur", il);
             if (model.layers[il].bk) {
                 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -11844,7 +11846,7 @@ struct llm_build_context {
 
             // B1.V
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-            Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+            Vcur = BITNET_MUL(ctx0, Vcur, model.layers[il].wv_scale);
             cb(Vcur, "Vcur", il);
             if (model.layers[il].bv) {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -11938,7 +11940,7 @@ struct llm_build_context {
             ggml_build_forward_expand(gf, cur_attn);
 
             cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur_attn);
-            cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+            cur = BITNET_MUL(ctx0, cur, model.layers[il].wo_scale);
 
             cb(cur, "kqv_out", il);
         }
@@ -11961,12 +11963,12 @@ struct llm_build_context {
             cb(cur, "ffn_norm", il);
 
             struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
-            tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
-
+            tmp = BITNET_MUL(ctx0, tmp, model.layers[il].ffn_up_scale);
+
             cb(tmp, "ffn_up", il);
 
             cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
-            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
+            cur = BITNET_MUL(ctx0, cur, model.layers[il].ffn_gate_scale);
 
             cb(cur, "ffn_gate", il);
 
@@ -11974,7 +11976,7 @@ struct llm_build_context {
             cur = ggml_silu(ctx0, cur);
             cb(cur, "ffn_silu", il);
 
-            cur = ggml_mul(ctx0, cur, tmp);
+            cur = BITNET_MUL(ctx0, cur, tmp);
             cb(cur, "ffn_gate_par", il);
 
             cur = llm_build_norm(ctx0, cur, hparams,
@@ -11983,7 +11985,7 @@ struct llm_build_context {
             cb(cur, "ffn_sub_norm", il);
 
             cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
-            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+            cur = BITNET_MUL(ctx0, cur, model.layers[il].ffn_down_scale);
             cb(cur, "ffn_down", il);
         }
         cur = ggml_add(ctx0, cur, ffn_inp);
@@ -12007,6 +12009,7 @@ struct llm_build_context {
         ggml_build_forward_expand(gf, cur);
         return gf;
     }
+#undef BITNET_MUL
 
 };
 
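A note on the direction the title hints at ("mul -> scale on the CPU"): for the single-element scale operands, the scale is an F32 weight that is already host-resident when the graph is built on the CPU, so BITNET_MUL could later be redirected to ggml_scale with the scalar read out of the tensor. That cannot apply to the ffn_gate_par line above, where the second operand (tmp) is a full activation tensor; that call presumably stays an element-wise ggml_mul. A hedged sketch of such a redefinition, assuming the scale tensor is a host-accessible single F32 (this helper is hypothetical, not in the commit):

    /*
     * Hypothetical CPU-side variant (an assumption based on the commit
     * title, not part of this diff): read the single F32 weight from the
     * scale tensor at graph-build time and emit a scalar ggml_scale op
     * instead of a broadcast ggml_mul.
     */
    static struct ggml_tensor * bitnet_mul_as_scale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * scale) {  /* [1], F32, data on host */
        const float s = *(const float *) scale->data;
        return ggml_scale(ctx, a, s);
    }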