Mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-02-26 08:04:09 +00:00
Slightly better graph parallel for Qwen3-Next (#1307)
* Make sure we pick the reduced tensor from the right GPU
* Minor
@@ -186,6 +186,7 @@ extern "C" {
 
     GGML_API int            ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
     GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+    GGML_API int            ggml_backend_sched_get_backend_idx(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer);
 
     // Get the number of splits of the last graph
     GGML_API int            ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
@@ -2664,6 +2664,16 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
     return sched->backends[i];
 }
 
+int ggml_backend_sched_get_backend_idx(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+    if (!buffer || !buffer->buft) return -1;
+    if (buffer && buffer->buft) {
+        for (int i = 0; i < sched->n_backends; ++i) {
+            if (ggml_backend_get_default_buffer_type(sched->backends[i]) == buffer->buft) return i;
+        }
+    }
+    return -1;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
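For context (not part of the commit), here is a minimal sketch of how the new helper can be called: it maps an allocated tensor's buffer back to the index of the scheduler backend whose default buffer type owns it, returning -1 when there is no match. The wrapper name backend_index_for_tensor is purely illustrative.

    // Illustrative only: resolve the scheduler backend index that owns a tensor's buffer.
    static int backend_index_for_tensor(ggml_backend_sched_t sched, const ggml_tensor * tensor) {
        if (!tensor) return -1;
        // -1 means the tensor is not allocated, or its buffer type does not match
        // the default buffer type of any backend managed by this scheduler.
        return ggml_backend_sched_get_backend_idx(sched, tensor->buffer);
    }

This is how the llama.cpp hunks below resolve a device index: from lctx.model.output->buffer for the output tensor, and from kv_self.s_l[il]->buffer for the recurrent state of layer il.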
@@ -1986,9 +1986,12 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
             }
         }
     } else {
-        if (cur->op == GGML_OP_REDUCE && cur->src[lctx.model.main_gpu]) {
+        int idx = lctx.model.default_layer_device[lctx.model.hparams.n_layer];
+        int idx_out = ggml_backend_sched_get_backend_idx(lctx.sched, lctx.model.output->buffer);
+        if (idx_out >= 0) idx = idx_out;
+        if (cur->op == GGML_OP_REDUCE && cur->src[idx]) {
             // avoid copy to main GPU
-            cur->view_src = cur->src[lctx.model.main_gpu];
+            cur->view_src = cur->src[idx];
         }
         if (output_norm) {
             cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1);
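The device selection above can be read as a small helper; this sketch only restates the inline logic from the hunk (the commit keeps it inline), with member names taken from the diff and the helper name being illustrative:

    // Sketch: pick the device index used for the output tensor.
    static int output_device_index(llama_context & lctx) {
        // Default: the device assigned to the slot one past the last repeating layer.
        int idx = lctx.model.default_layer_device[lctx.model.hparams.n_layer];
        // Prefer the backend that actually holds the output weights, if it can be resolved.
        int idx_out = ggml_backend_sched_get_backend_idx(lctx.sched, lctx.model.output->buffer);
        if (idx_out >= 0) idx = idx_out;
        return idx;
    }

With idx in hand, the graph takes cur->src[idx] as view_src of the GGML_OP_REDUCE node, so the reduced result is consumed on that device instead of being copied to the main GPU.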
@@ -4458,12 +4461,18 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
 
 
         if (hparams.is_recurrent(il)) {
-            if (inpL->op == GGML_OP_REDUCE && inpL->src[model.default_layer_device[il]]) {
-                inpL->view_src = inpL->src[model.default_layer_device[il]];
-                //printf("Using reduce result on device %d\n", model.default_layer_device[il]);
-                //inpL = inpL->src[model.default_layer_device[il]];
+            int idx = model.default_layer_device[il];
+            if (inpL->op == GGML_OP_REDUCE) {
+                if (kv_self.s_l[il]) {
+                    // This shouldn't be necessary, but just in case.
+                    int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer);
+                    if (idx_s_l >= 0) idx = idx_s_l;
+                }
+                if (inpL->src[idx]) {
+                    inpL->view_src = inpL->src[idx];
+                }
             }
-            auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[model.default_layer_device[il]] : model.layers[il].attn_norm;
+            auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
             cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
             cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
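The per-layer logic for recurrent Qwen3-Next layers follows the same pattern. A restating sketch, with the recurrent state tensor kv_self.s_l[il] passed in as layer_state so the example stays self-contained; the helper name and signature are illustrative, not part of the commit:

    // Sketch: consume the reduce result that already lives on the layer's device.
    static void take_reduce_source(llama_context & lctx, ggml_tensor * t, ggml_tensor * layer_state, int il) {
        int idx = lctx.model.default_layer_device[il];   // default device for layer il
        if (layer_state) {
            // Prefer the backend that owns this layer's recurrent state, when resolvable.
            int idx_state = ggml_backend_sched_get_backend_idx(lctx.sched, layer_state->buffer);
            if (idx_state >= 0) idx = idx_state;
        }
        if (t->op == GGML_OP_REDUCE && t->src[idx]) {
            t->view_src = t->src[idx];                   // avoid the cross-device copy
        }
    }

The same idx is then used to pick the matching split of the attention norm weights, so the whole recurrent block runs on the device that already holds the data.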
@@ -4474,7 +4483,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
             cur = ggml_add(ctx0, cur, inpSA);
             cb(cur, "attn_residual", il);
         } else {
-            //cur = build_layer_attn(cur, inp_pos, KQ_mask, il);
             cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr,
                     KQ_mask, nullptr, nullptr, KQ_scale, 0.0f, 0, il, true, false, true, false, false);
         }
@@ -2049,9 +2049,10 @@ static bool llm_load_tensors(
         // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
         split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
     }
-    auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
+    //auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
     // assign the repeating layers
     for (int i = i_gpu_start; i < n_layer; ++i) {
+        auto buft_layer = llama_default_buffer_type_offload(model, model.default_layer_device[i]);
         if (split_mode == LLAMA_SPLIT_MODE_ATTN) {
             int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count,
                     float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
@@ -2065,7 +2066,7 @@ static bool llm_load_tensors(
     if (n_gpu_layers > n_layer) {
         model.buft_output = {
             split_buft,
-            llama_default_buffer_type_offload(model, model.devices[main_gpu])
+            llama_default_buffer_type_offload(model, model.default_layer_device[n_layer])
         };
     } else {
         model.buft_output = llama_default_buffer_type_cpu(true);