Slightly better graph parallel for Qwen3-Next (#1307)

* Make sure we pick the reduced tensor from the right GPU

* Minor
Kawrakow, 2026-02-24 15:22:30 +01:00 (committed by GitHub)
parent cfb6747776
commit 7065488135
4 changed files with 30 additions and 10 deletions


@@ -2049,9 +2049,10 @@ static bool llm_load_tensors(
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
             split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
         }
-        auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
+        //auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
         // assign the repeating layers
         for (int i = i_gpu_start; i < n_layer; ++i) {
+            auto buft_layer = llama_default_buffer_type_offload(model, model.default_layer_device[i]);
             if (split_mode == LLAMA_SPLIT_MODE_ATTN) {
                 int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count,
                         float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
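
For context on the LLAMA_SPLIT_MODE_ATTN branch above: the `std::upper_bound` over the cumulative split fractions picks the GPU whose slice of the offloaded layers contains layer `i`. A minimal standalone sketch of that mapping; the split values and layer counts below are made-up placeholders, not taken from the model:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // Cumulative split fractions per GPU (ending at 1.0); hypothetical example
    // with 3 GPUs taking 25%, 25% and 50% of the offloaded layers.
    std::vector<float> splits = {0.25f, 0.50f, 1.00f};
    int device_count   = (int)splits.size();
    int i_gpu_start    = 8;    // first offloaded layer (made up)
    int n_layer        = 48;   // total layers (made up)
    int act_gpu_layers = n_layer - i_gpu_start;

    for (int i = i_gpu_start; i < n_layer; ++i) {
        // Same idea as the hunk above: the first split fraction strictly greater
        // than the layer's relative position selects the GPU for that layer.
        int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count,
                float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
        printf("layer %2d -> GPU %d\n", i, layer_gpu);
    }
    return 0;
}
```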
@@ -2065,7 +2066,7 @@ static bool llm_load_tensors(
         if (n_gpu_layers > n_layer) {
             model.buft_output = {
                 split_buft,
-                llama_default_buffer_type_offload(model, model.devices[main_gpu])
+                llama_default_buffer_type_offload(model, model.default_layer_device[n_layer])
             };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
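
The second hunk changes where the output tensors land when everything is offloaded: instead of always using main_gpu, they now follow the device recorded for the entry after the last layer, which is what "pick the reduced tensor from the right GPU" refers to. A hedged sketch of that selection, with a hypothetical `default_layer_device` table standing in for the real one:

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical stand-in for model.default_layer_device: one device id per
    // layer plus one extra entry used for the output tensors (values made up).
    int n_layer      = 6;
    int n_gpu_layers = n_layer + 1;   // all layers plus the output are offloaded
    std::vector<int> default_layer_device(n_layer + 1);
    for (int i = 0; i <= n_layer; ++i) {
        default_layer_device[i] = (i < n_layer / 2) ? 0 : 1;  // first half on GPU 0, rest on GPU 1
    }

    int main_gpu = 0;
    // Old behaviour: the output tensors always went to main_gpu.
    // New behaviour (as in the hunk above): they follow default_layer_device[n_layer],
    // i.e. the device that holds the last layers, so the reduced result is read
    // back from the GPU that actually produced it.
    int old_dev = main_gpu;
    int new_dev = (n_gpu_layers > n_layer) ? default_layer_device[n_layer] : -1;  // -1 means CPU

    if (new_dev < 0) {
        printf("output tensors: old = GPU %d, new = CPU\n", old_dev);
    } else {
        printf("output tensors: old = GPU %d, new = GPU %d\n", old_dev, new_dev);
    }
    return 0;
}
```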