mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-28 09:04:10 +00:00
Graph parallel for dense Qwen-3.5 models (#1331)
* Graph parallel for dense Qwen-3.5 models * Cleanup
This commit is contained in:
@@ -4649,7 +4649,19 @@ ggml_cgraph * llm_build_context::build_qwen35() {
|
||||
|
||||
if (hparams.is_recurrent(il)) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
int idx = model.default_layer_device[il];
|
||||
if (inpL->op == GGML_OP_REDUCE) {
|
||||
if (kv_self.s_l[il]) {
|
||||
// This shouldn't be necessary, but just in case.
|
||||
int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer);
|
||||
if (idx_s_l >= 0) idx = idx_s_l;
|
||||
}
|
||||
if (inpL->src[idx]) {
|
||||
inpL->view_src = inpL->src[idx];
|
||||
}
|
||||
}
|
||||
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
||||
@@ -1490,6 +1490,7 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) {
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
@@ -1510,15 +1511,15 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) {
|
||||
} else {
|
||||
// Linear attention (gated delta net) specific tensors
|
||||
// Create tensors with calculated dimensions
|
||||
layer.wqkv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.ssm_conv1d = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
|
||||
layer.ssm_dt = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_a = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_beta = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
|
||||
layer.ssm_alpha = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
|
||||
layer.ssm_norm = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
|
||||
layer.ssm_out = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
|
||||
layer.wqkv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.wqkv_gate = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.ssm_conv1d = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
|
||||
layer.ssm_dt = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_a = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_beta = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
|
||||
layer.ssm_alpha = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
|
||||
layer.ssm_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
|
||||
layer.ssm_out = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
|
||||
}
|
||||
|
||||
layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
|
||||
@@ -3610,7 +3611,7 @@ bool create_tensors_helper::create_tensors() {
|
||||
if (layer.wo && layer.wq && layer.wk && layer.wv) {
|
||||
auto granularity_kq = hparams.n_embd_head_k * gqa_ratio;
|
||||
int wq_ne1 = layer.wq->ne[1];
|
||||
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) {
|
||||
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE || model.arch == LLM_ARCH_QWEN35) {
|
||||
granularity_kq *= 2; wq_ne1 /= 2;
|
||||
}
|
||||
auto granularity_vo = hparams.n_embd_head_v * gqa_ratio;
|
||||
@@ -3666,7 +3667,7 @@ bool create_tensors_helper::create_tensors() {
|
||||
LLAMA_LOG_DEBUG("\n");
|
||||
prepare_split_tensors(1, ctx_split, layer.wqkv_gate, layer.split_wqkv_gate, wqkv_gate_split, mem_used);
|
||||
}
|
||||
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) {
|
||||
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE || model.arch == LLM_ARCH_QWEN35) {
|
||||
for (auto & s : split_kq) s /= 2*gqa_ratio;
|
||||
} else {
|
||||
for (auto & s : split_kq) s /= gqa_ratio;
|
||||
|
||||
@@ -661,7 +661,7 @@ llama_context::~llama_context() {
|
||||
// kv cache helpers
|
||||
//
|
||||
|
||||
static inline bool llama_qwen3next_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) {
|
||||
static inline bool llama_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) {
|
||||
return hparams.is_recurrent(il);
|
||||
}
|
||||
|
||||
@@ -836,7 +836,7 @@ static bool llama_kv_cache_init(
|
||||
|
||||
int n_mla = 0;
|
||||
for (int i = 0; i < (int) n_layer; i++) {
|
||||
const bool qnext_recurrent = llama_qwen3next_is_recurrent_layer(hparams, i);
|
||||
const bool qnext_recurrent = llama_is_recurrent_layer(hparams, i);
|
||||
const uint32_t n_embd_v_row = llama_kv_v_row_embd(model, hparams, i);
|
||||
const uint32_t n_head_kv = hparams.n_head_kv(i);
|
||||
const uint32_t n_embd_head_k= hparams.n_embd_head_k;
|
||||
@@ -1937,6 +1937,7 @@ static bool is_model_split_supported(const llama_model & model) {
|
||||
LLM_ARCH_SEED_OSS,
|
||||
LLM_ARCH_STEP35,
|
||||
LLM_ARCH_QWEN3NEXT,
|
||||
LLM_ARCH_QWEN35,
|
||||
};
|
||||
auto it = k_supported.find(model.arch);
|
||||
return it != k_supported.end();
|
||||
|
||||
Reference in New Issue
Block a user