Mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-02-28 17:14:17 +00:00
Slightly better graph parallel for Qwen3-Next (#1307)
* Make sure we pick the reduced tensor from the right GPU
* Minor
@@ -2049,9 +2049,10 @@ static bool llm_load_tensors(
         // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
         split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
     }
-    auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
+    //auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
     // assign the repeating layers
     for (int i = i_gpu_start; i < n_layer; ++i) {
+        auto buft_layer = llama_default_buffer_type_offload(model, model.default_layer_device[i]);
         if (split_mode == LLAMA_SPLIT_MODE_ATTN) {
             int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count,
                                              float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
@@ -2065,7 +2066,7 @@ static bool llm_load_tensors(
     if (n_gpu_layers > n_layer) {
         model.buft_output = {
             split_buft,
-            llama_default_buffer_type_offload(model, model.devices[main_gpu])
+            llama_default_buffer_type_offload(model, model.default_layer_device[n_layer])
         };
     } else {
         model.buft_output = llama_default_buffer_type_cpu(true);
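The layer buffer type is now taken from model.default_layer_device[i] per layer (and from model.default_layer_device[n_layer] for the output tensors) instead of always using main_gpu, which appears to be what "pick the reduced tensor from the right GPU" refers to. Below is a minimal, self-contained C++ sketch of the same std::upper_bound mapping from layer index to device that the LLAMA_SPLIT_MODE_ATTN branch uses. The names (splits, default_layer_device, i_gpu_start, act_gpu_layers) mirror the diff, but the concrete split fractions, layer counts, and the final default_layer_device[n_layer] assignment are illustrative assumptions, not code from the repository.

// Sketch only: map each repeating layer to a device index from cumulative
// split fractions, the same idea as the std::upper_bound call in the diff.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical cumulative split fractions: GPU0 owns the first 50% of
    // offloaded layers, GPU1 the rest.
    std::vector<float> splits = {0.5f, 1.0f};
    const int device_count   = (int) splits.size();
    const int n_layer        = 8;  // hypothetical number of repeating layers
    const int i_gpu_start    = 0;  // first offloaded layer
    const int act_gpu_layers = n_layer - i_gpu_start;

    std::vector<int> default_layer_device(n_layer + 1, 0);
    for (int i = i_gpu_start; i < n_layer; ++i) {
        // std::upper_bound returns the first split strictly greater than the
        // layer's fractional position, i.e. the GPU that owns this layer.
        int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count,
                float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
        default_layer_device[i] = layer_gpu;
    }
    // Assumption for illustration: the output tensors follow the device of
    // the last repeating layer, mirroring default_layer_device[n_layer] above.
    default_layer_device[n_layer] = default_layer_device[n_layer - 1];

    for (int i = 0; i <= n_layer; ++i) {
        printf("layer %d -> GPU %d\n", i, default_layer_device[i]);
    }
    return 0;
}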