From a71934998203f7480e7253788408d3720c3569fa Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 1 Dec 2025 19:25:40 +0100
Subject: [PATCH] POC: CUDA tensor parallel (MoE models) (#1022)

* Remove most of split mode row
* WIP
* WIP: also allocate the KV cache using tensor split
* WIP: it runs with wrong results. But it also looks like the backend scheduler is not going to help:
  * It copies mask and input positions to GPU 0
  * => RoPE ops must run on GPU 0
  * => To proceed with attn evaluation, GPU 1 must wait for GPU 0 to finish its entire attn calculation
  * Same with FFN. The rms_norm gets scheduled on GPU 0. Hence, GPU 1 must wait for GPU 0 to finish its entire FFN calculation before it can start (as it needs to copy the result of rms_norm from GPU 0)
  * => Seems useless without writing bespoke TP scheduling
* WIP
* This works, but it is slow
* This is slightly better, but the graph is still not being computed in parallel. Why? Because the scheduler creates graph splits where the result of the computation on one GPU becomes an input for the other split. Hence, to trigger the computation on the second GPU one needs to wait for the computation on the first GPU to finish, even though the two can be done in parallel up to the synchronization point. So, all that is left to do is to trick the scheduler into creating two splits that can be done in parallel, and then have a graph split where the results get combined.
* Playing games with the scheduler. This change tricks it into doing the right thing^TM (see the sketch below). Still quite a bit slower than split mode layer for the 8B LlaMA model. But for the 70B LlaMA it now beats split mode layer for TG: 28 t/s vs 24.4 t/s. PP is 627 t/s vs 744 t/s. In comparison, split mode "row" in mainline gets 484 t/s PP and 19.3 t/s TG.
* Fix attn split. Granularity for Wq, Wo is not just the head size, but head size * gqa_ratio. Else the Wk, Wv tensors end up not being a multiple of the head size when we divide the split determined by Wo by the gqa_ratio (see the sketch below).
* Show memory used per device
* Make it work with partial offload, but no tensor overrides yet, just ngl < num_layers.
* Allow for f16 source in fused_rms_norm
* This results in faster PP. Now PP is faster than split mode layer for L3-70B.
* Rename split mode "row" to split mode "graph"
* Leave FFN partial results as f16
* WIP GLM4.5 - runs with wrong results
* WIP GLM4.5 - this works. PP is already better than split mode layer, but TG for zero context is kind of low - 60 vs 92 t/s. TG becomes better than split mode layer at around 20k tokens. PP at 26k tokens is 1.55X of sm layer.
* Work around compiler bug. It issues a warning that there is an extra semicolon outside of a function, but there isn't. If I remove the anonymous namespace and turn the functions inside into static, the warning disappears, so clearly a compiler bug.
* Make graph reuse work with split mode graph
* Remove more split mode row remnants
* WIP tensor overrides. Runs with wrong results; I don't see where the issue could be.
* This works but is slow. Still does not work for row-interleaved quants.
* Slightly better
* Slightly better
* Row-interleaved quants work
* Better
* Minor
* Guard against using split mode "graph" for unsupported models
* Guards against using merge_qkv with split mode "graph"
* WIP split mode attn. Works for LlaMA models, but not for GLM-4.5. Doesn't seem to improve performance, so I guess no point in trying to fix it.
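A minimal sketch of the scheduler trick described in "Playing games with the scheduler" (illustrative only; the actual tagging presumably happens in src/llama-build-context.cpp, which this patch modifies but which is not excerpted here). The graph builder marks the ADD node that combines the per-GPU partial results by writing a magic value into op_params[0]; ggml_backend_sched_split_graph (see the ggml-backend.cpp hunk below) then starts a new graph split at that node, so the two per-device subgraphs land in independent splits that can run concurrently and are only joined at the marked ADD:

    #include "ggml.h"

    // hypothetical helper, for illustration only
    static struct ggml_tensor * combine_partial_results(struct ggml_context * ctx,
            struct ggml_tensor * part_gpu0, struct ggml_tensor * part_gpu1) {
        struct ggml_tensor * sum = ggml_add(ctx, part_gpu0, part_gpu1);
        // marker checked in ggml_backend_sched_split_graph:
        //   if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) need_new_split = true;
        sum->op_params[0] = 0xff;
        return sum;
    }

And a sketch of the granularity rule from "Fix attn split" (the helper name and the round-robin distribution are assumptions, not the patch's code): Wq/Wo rows are handed out in units of head_size * gqa_ratio, so that dividing a device's Wq/Wo slice by gqa_ratio always yields a Wk/Wv slice that is a whole multiple of the head size:

    #include <cstdint>
    #include <vector>

    // hypothetical helper, for illustration only
    static std::vector<int64_t> split_attn_rows(int64_t n_head, int64_t n_head_kv,
                                                int64_t head_size, int n_device) {
        const int64_t gqa_ratio   = n_head / n_head_kv;     // query heads per kv head
        const int64_t granularity = head_size * gqa_ratio;  // minimal slice of Wq/Wo rows
        std::vector<int64_t> wq_rows(n_device, 0);
        for (int64_t g = 0; g < n_head_kv; ++g) {
            wq_rows[g % n_device] += granularity;           // whole kv-head groups per device
        }
        // Wk/Wv rows per device: wq_rows[i] / gqa_ratio, always a multiple of head_size
        return wq_rows;
    }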
* Split mode graph for qwen3moe * Try to better distribute the splits --------- Co-authored-by: Iwan Kawrakow --- common/common.cpp | 12 +- examples/llama-bench/llama-bench.cpp | 12 +- ggml/include/ggml.h | 7 + ggml/src/ggml-backend.cpp | 19 +- ggml/src/ggml-cuda.cu | 863 ++++++-------------- ggml/src/ggml-cuda/iqk_mmvq_templates.cuh | 25 +- ggml/src/ggml-cuda/norm.cu | 36 +- ggml/src/ggml.c | 14 +- include/llama.h | 3 +- src/llama-build-context.cpp | 947 +++++++++++++++------- src/llama-build-context.h | 32 +- src/llama-context.h | 3 + src/llama-impl.h | 5 + src/llama-load-tensors.cpp | 404 ++++++++- src/llama-model.h | 54 ++ src/llama.cpp | 340 +++++--- 16 files changed, 1694 insertions(+), 1082 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b6ee885c..9f1ce736 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1276,12 +1276,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else if (arg_next == "layer") { params.split_mode = LLAMA_SPLIT_MODE_LAYER; } - else if (arg_next == "row") { - fprintf(stderr, "\n\n=====================================================================================\n"); - fprintf(stderr, " Split mode row is no longer supported\n"); - fprintf(stderr, "=====================================================================================\n\n\n"); - GGML_ABORT("fatal error"); - params.split_mode = LLAMA_SPLIT_MODE_ROW; + else if (arg_next == "attn") { + params.split_mode = LLAMA_SPLIT_MODE_ATTN; + } + else if (arg_next == "graph") { + params.split_mode = LLAMA_SPLIT_MODE_GRAPH; } else { invalid_param = true; @@ -2249,6 +2248,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", "how to split the model across multiple GPUs, one of:\n" " - none: use one GPU only\n" + " - graph: split model tensors and computation graph across GPUs\n" " - layer (default): split layers and KV across GPUs\n" }); options.push_back({ "*", "-ts, --tensor-split SPLIT", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1" }); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 31b769fe..f0f62d46 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -217,7 +217,7 @@ static const char * split_mode_str(llama_split_mode mode) { switch (mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; + case LLAMA_SPLIT_MODE_GRAPH: return "graph"; default: GGML_ABORT("invalid split mode"); } } @@ -334,7 +334,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" --n-cpu-moe (default: none)\n"); printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); - printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); @@ -630,12 +630,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { mode = LLAMA_SPLIT_MODE_NONE; } else if (m == "layer") { mode = LLAMA_SPLIT_MODE_LAYER; - } else if (m == "row") { - fprintf(stderr, "\n\n=======================================================================\n"); - fprintf(stderr, "Split mode 'row' is no longer supported\n"); - fprintf(stderr, "=======================================================================\n\n\n"); - invalid_param = true; - break; + } else if (m == "graph") { + mode = LLAMA_SPLIT_MODE_GRAPH; } else { invalid_param = true; break; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 7b630489..d2a0cf85 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -3021,6 +3021,13 @@ extern "C" { GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + typedef struct { + int n_device; + int split_dim; + struct ggml_tensor * tensor; + struct ggml_tensor ** splits; + } ggml_split_tensor_t; + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 813f4467..f7c1593b 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -43,7 +43,7 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { size_t size = buft->iface.get_alloc_size(buft, tensor); - assert(size >= ggml_nbytes(tensor)); + //assert(size >= ggml_nbytes(tensor)); return size; } return ggml_nbytes(tensor); @@ -1216,8 +1216,10 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co return -1; } + //printf("%s: have %d backends, buffer is %s\n", __func__, sched->n_backends, ggml_backend_buffer_name(buffer)); // find highest prio backend that supports the buffer type and the op for (int i = 0; i < sched->n_backends; i++) { + //printf(" Checking bacckend %d (%s)\n", i, ggml_backend_name(sched->backends[i])); if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) && ggml_backend_supports_op(sched->backends[i], op)) { return i; @@ -1393,6 +1395,7 
@@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // do not overwrite user assignments if (*leaf_backend_id == -1) { *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); + //printf("Pass 1: assigned backend %d to leaf %d, %s\n", *leaf_backend_id, i, graph->leafs[i]->name); } } @@ -1402,6 +1405,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // do not overwrite user assignments if (*node_backend_id == -1) { *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); + //printf("Pass 1: assigned backend %d to node %d, %s(%s)\n", *node_backend_id, i, ggml_op_name(node->op), node->name); #if 0 // src @@ -1445,6 +1449,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { + //printf("(u1) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -1466,6 +1471,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { + //printf("(d1) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -1482,6 +1488,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { + //printf("(u2) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -1498,6 +1505,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { + //printf("(d2) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -1535,6 +1543,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (n_supported > n_supported_best) { n_supported_best = n_supported; *node_backend_id = b; + //printf("Pass 3: assigned backend %d to unassigned node %d, %s\n", b, i, node->name); SET_CAUSE(node, "3.best"); } } @@ -1555,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } if (supported) { + //printf("Pass 3: assigned backend %d to node %d, %s previously assigned to backend %d\n", b, i, node->name, *node_backend_id); *node_backend_id = b; SET_CAUSE(node, "3.upg"); break; @@ -1583,9 +1593,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // views are always on the same backend as the source *src_backend_id = tensor_backend_id(src->view_src); SET_CAUSE(src, "4.vsrc"); + //printf("Pass 4: assigned backend %d to src %d, %s in node %d, %s frpm view_src\n", *src_backend_id, j, src->name, i, node->name); } else { *src_backend_id = 
*cur_backend_id; SET_CAUSE(src, "4.cur"); + //printf("Pass 4: assigned backend %d to src %d, %s in node %d, %s frpm current\n", *src_backend_id, j, src->name, i, node->name); } } } @@ -1620,7 +1632,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // check if we should start a new split based on the sources of the current node bool need_new_split = false; - if (node_backend_id == cur_backend_id && split->n_inputs > 0) { + if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) { + need_new_split = true; + } + else if (node_backend_id == cur_backend_id && split->n_inputs > 0) { for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 0db57b08..2372efdb 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -221,7 +221,8 @@ static ggml_cuda_device_info ggml_cuda_init() { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); + GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", + prop.totalGlobalMem/(1024*1024)); info.default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -567,6 +568,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t } GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + printf("%s(%s -> %s)\n", __func__, src->name, dst->name); if (ggml_backend_buffer_is_cuda(src->buffer)) { ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; @@ -601,7 +603,7 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor, + /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor, @@ -703,59 +705,25 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { // cuda split buffer -static int64_t get_row_rounding(const std::array & tensor_split) { - int64_t row_rounding = 0; - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) { - continue; - } - - const int cc = ggml_cuda_info().devices[id].cc; - row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc)); - } - return row_rounding; -} - -static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { - const int64_t nrows = ggml_nrows(tensor); - const int64_t rounding = get_row_rounding(tensor_split); - - *row_low = id == 0 ? 
0 : nrows*tensor_split[id]; - *row_low -= *row_low % rounding; - - if (id == ggml_backend_cuda_get_device_count() - 1) { - *row_high = nrows; - } else { - *row_high = nrows*tensor_split[id + 1]; - *row_high -= *row_high % rounding; - } -} - -static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); -} - struct ggml_backend_cuda_split_buffer_type_context { - std::array tensor_split; + //std::array tensor_split; }; struct ggml_backend_cuda_split_buffer_context { ~ggml_backend_cuda_split_buffer_context() { - for (ggml_tensor_extra_gpu * extra : tensor_extras) { - for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { - for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { - if (extra->events[id][is] != nullptr) { - CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); - } - } - if (extra->data_device[id] != nullptr) { - CUDA_CHECK(cudaFree(extra->data_device[id])); - } - } - delete extra; - } + //for (ggml_tensor_extra_gpu * extra : tensor_extras) { + // for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { + // for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { + // if (extra->events[id][is] != nullptr) { + // CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + // } + // } + // if (extra->data_device[id] != nullptr) { + // CUDA_CHECK(cudaFree(extra->data_device[id])); + // } + // } + // delete extra; + //} } std::vector tensor_extras; @@ -784,128 +752,189 @@ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buf GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported - - ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; - ctx->tensor_extras.push_back(extra); - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses +GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (!tensor->extra) return; + //printf("%s(%s, %p)\n", __func__, tensor->name, tensor->extra); + auto extra = (ggml_split_tensor_t *)tensor->extra; + GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count()); + for (int i = 0; i < extra->n_device; ++i) { + if (!extra->splits[i]) continue; + auto split = extra->splits[i]; + auto ne0 = split->ne[0]; + auto size = ggml_nbytes(split); + auto padded_size = size; if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + int nblock = (ne0 + MATRIX_ROW_PADDING - 1)/MATRIX_ROW_PADDING; + auto padded_row_size = ggml_row_size(split->type, nblock*MATRIX_ROW_PADDING); + auto row_size 
= ggml_row_size(split->type, ne0); + padded_size += padded_row_size - row_size; } - - // FIXME: do not crash if cudaMalloc fails - // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first - ggml_cuda_set_device(id); + ggml_cuda_set_device(i); char * buf; - CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); - - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); - } - - extra->data_device[id] = buf; - - for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, padded_size, i)); + if (padded_size > size) { + CUDA_CHECK(cudaMemset(buf + size, 0, padded_size - size)); } + //printf(" allocated %zu bytes for tensor %s of type %s, dim = %ld x %ld x %ld. padding: %zu\n", padded_size, split->name, ggml_type_name(split->type), + // split->ne[0], split->ne[1], split->ne[2], padded_size - size); + split->data = buf; + auto ctx = new ggml_backend_cuda_buffer_context(i, buf); + auto buft = ggml_backend_cuda_buffer_type(i); + split->buffer = ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, padded_size); + ggml_backend_buffer_set_usage(split->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); } - tensor->extra = extra; + return; + } -GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + if (!tensor->extra) return; + static std::map k_map = { + { GGML_TYPE_Q4_0_R8 , 8}, + { GGML_TYPE_Q5_0_R4 , 4}, + { GGML_TYPE_Q8_0_R8 , 8}, + { GGML_TYPE_Q2_K_R4 , 4}, + { GGML_TYPE_Q3_K_R4 , 4}, + { GGML_TYPE_Q4_K_R4 , 4}, + { GGML_TYPE_Q5_K_R4 , 4}, + { GGML_TYPE_Q6_K_R4 , 4}, + { GGML_TYPE_IQ2_XXS_R4, 4}, + { GGML_TYPE_IQ2_XS_R4 , 4}, + { GGML_TYPE_IQ3_XXS_R4, 4}, + { GGML_TYPE_IQ1_S_R4 , 4}, + { GGML_TYPE_IQ4_NL_R4 , 4}, + { GGML_TYPE_IQ3_S_R4 , 4}, + { GGML_TYPE_IQ2_S_R4 , 4}, + { GGML_TYPE_IQ4_XS_R8 , 8}, + { GGML_TYPE_IQ1_M_R4 , 4}, + { GGML_TYPE_BF16_R16 , 16}, + { GGML_TYPE_Q6_0_R4 , 4}, + { GGML_TYPE_IQ2_BN_R4 , 4}, + { GGML_TYPE_IQ2_K_R4 , 4}, + { GGML_TYPE_IQ3_K_R4 , 4}, + { GGML_TYPE_IQ4_K_R4 , 4}, + { GGML_TYPE_IQ5_K_R4 , 4}, + { GGML_TYPE_IQ4_KS_R4 , 4}, + { GGML_TYPE_IQ5_KS_R4 , 4}, + { GGML_TYPE_Q8_K_R16 , 4}, + { GGML_TYPE_Q8_KV_R8 , 4}, + { GGML_TYPE_Q8_K_R8 , 8}, + }; + //printf("%s(%s)\n", __func__, tensor->name); + // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + auto extra = (ggml_split_tensor_t *)tensor->extra; + GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count()); - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; + if (extra->split_dim < 0) { + 
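        // split_dim describes how the tensor is distributed over the devices recorded in the
        // ggml_split_tensor_t extra (the struct added to ggml.h above):
        //   split_dim < 0  : the full tensor is replicated on every participating device (this branch);
        //   split_dim == 0 : each device gets a contiguous chunk of ne[0], i.e. a within-row split;
        //                    for row-interleaved quant types the chunk is copied n_interleave rows
        //                    at a time, with n_interleave taken from k_map above;
        //   split_dim == 1 : each device gets a contiguous block of rows (ne[1]).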
GGML_ASSERT(ggml_is_contiguous(tensor)); + auto nbytes = ggml_nbytes(tensor); + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + GGML_ASSERT(split->type == tensor->type); + GGML_ASSERT(ggml_are_same_shape(tensor, split)); + GGML_ASSERT(ggml_nbytes(split) == nbytes); + ggml_cuda_set_device(i); + CUDA_CHECK(cudaMemcpyAsync(split->data, data, nbytes, cudaMemcpyHostToDevice, cudaStreamPerThread)); } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + else if (extra->split_dim == 0) { + int n_interleave = 1; + if (auto it = k_map.find(tensor->type); it != k_map.end()) n_interleave = it->second; + //if (tensor->type >= GGML_TYPE_Q4_0_R8) { + // GGML_ABORT("Dim 0 copy of row-interleaved quants is not supported yet"); + //} + auto tt = ggml_internal_get_type_traits(tensor->type); + std::vector host_buffer; + GGML_ASSERT(ggml_is_contiguous(tensor)); + int nrows = ggml_nrows(tensor); + auto bs = tt.blck_size; + auto ts = tt.type_size; + auto row_size = ggml_row_size(tensor->type, tensor->ne[0]); + int ne = 0; + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + GGML_ASSERT(split->ne[1]%n_interleave == 0); + ggml_cuda_set_device(i); + GGML_ASSERT(split->type == tensor->type); + GGML_ASSERT((int)ggml_nrows(split) == nrows); + GGML_ASSERT(split->ne[0] % bs == 0); + auto source_offset = n_interleave*(tt.row_meta_size + (ne / bs) * ts); + auto split_row_size = ggml_row_size(split->type, split->ne[0]); + if (host_buffer.size() < nrows*split_row_size) host_buffer.resize(nrows*split_row_size); + for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) { + for (int64_t i01 = 0; i01 < split->ne[1]; i01 += n_interleave) { + auto dst = host_buffer.data() + (i02*split->ne[1] + i01)*split_row_size; + auto src = (const char *)data + i02*tensor->nb[2] + i01*tensor->nb[1]; + if (tt.row_meta_size > 0) { + memcpy(dst, src, tt.row_meta_size*n_interleave); + } + memcpy(dst + tt.row_meta_size*n_interleave, src + source_offset, n_interleave*(split_row_size - tt.row_meta_size)); + } + } + CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), nrows*split_row_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + ne += split->ne[0]; } - - const char * buf_host = (const char *)data + offset_split; - CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + } + else if (extra->split_dim == 1) { + if (tensor->ne[2] > 1) { + auto row_size = ggml_row_size(tensor->type, tensor->ne[0]); + std::vector host_buffer; + int ne1 = 0; + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + ggml_cuda_set_device(i); + auto size = ggml_nbytes(split); + if (host_buffer.size() < size) host_buffer.resize(size); + for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) { + auto dst = host_buffer.data() + i02*split->ne[1]*row_size; + auto src = (const char *)data + i02*tensor->nb[2] + ne1*tensor->nb[1]; + memcpy(dst, src, split->ne[1]*row_size); + } + CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + ne1 += split->ne[1]; + } + } else { + int n_interleave = 1; + if (auto it 
= k_map.find(tensor->type); it != k_map.end()) n_interleave = it->second; + size_t cur_offset = 0; + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + GGML_ASSERT(split->ne[1]%n_interleave == 0); + ggml_cuda_set_device(i); + auto size = ggml_nbytes(split); + const char * buf_host = (const char *)data + cur_offset; + CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + cur_offset += size; + } + } + } + else { + fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim == 0); + GGML_ABORT("fatal error"); } - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + for (int i = 0; i < extra->n_device; ++i) { + if (!extra->splits[i]) continue; CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } + } -GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, const ggml_tensor * tensor, + [[maybe_unused]] void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf_host = (char *)data + offset_split; - CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); - } - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); - } + GGML_ABORT("not implemented"); } GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -954,31 +983,26 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_b GGML_UNUSED(buft); } -GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; +GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size([[maybe_unused]] ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + if (!tensor->extra) return 0; + auto extra = (ggml_split_tensor_t *)tensor->extra; + GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count()); size_t total_size = 0; - - const int64_t ne0 = tensor->ne[0]; - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - int64_t row_low, 
row_high; - get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - total_size += ggml_nbytes_split(tensor, nrows_split); - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + total_size += ggml_nbytes(split); + auto ne0 = split->ne[0]; if (ne0 % MATRIX_ROW_PADDING != 0) { - total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + auto nblock = (ne0 + MATRIX_ROW_PADDING - 1)/MATRIX_ROW_PADDING; + auto row_size = ggml_row_size(split->type, ne0); + auto padded_row_size = ggml_row_size(split->type, nblock*MATRIX_ROW_PADDING); + total_size += padded_row_size - row_size; } } - return total_size; + } GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { @@ -996,40 +1020,12 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { - static std::mutex mutex; - std::lock_guard lock(mutex); - - static std::map, struct ggml_backend_buffer_type> buft_map; - - std::array tensor_split_arr = {}; - - bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; }); - if (all_zero) { - tensor_split_arr = ggml_cuda_info().default_tensor_split; - } else { - float split_sum = 0.0f; - for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { - tensor_split_arr[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { - tensor_split_arr[i] /= split_sum; - } - } - - auto it = buft_map.find(tensor_split_arr); - if (it != buft_map.end()) { - return &it->second; - } - - struct ggml_backend_buffer_type buft { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * /*tensor_split*/) { + static ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, - /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, + /* .context = */ new ggml_backend_cuda_split_buffer_type_context{}, //{tensor_split_arr}, }; - - auto result = buft_map.emplace(tensor_split_arr, buft); - return &result.first->second; + return &buft; } // host buffer type @@ -1437,6 +1433,7 @@ static void ggml_cuda_op_mul_mat_cublas( GGML_UNUSED(src1_padded_row_size); } +#if 0 static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { static bool peer_access_enabled = false; @@ -1488,6 +1485,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { GGML_UNUSED(main_device); } +#endif static cudaError_t ggml_cuda_Memcpy2DPeerAsync( void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { @@ -1553,20 +1551,6 @@ static void ggml_cuda_op_mul_mat( const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); - GGML_ASSERT(!(split && ne02 > 1)); - GGML_ASSERT(!(split && ne03 > 1)); - GGML_ASSERT(!(split && ne02 < ne12)); - - ggml_tensor_extra_gpu * src0_extra = split ? 
(ggml_tensor_extra_gpu *) src0->extra : nullptr; - - - std::array tensor_split; - if (split) { - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; - tensor_split = buft_ctx->tensor_split; - } - struct dev_data { int cc; @@ -1595,31 +1579,12 @@ static void ggml_cuda_op_mul_mat( dev[id].row_low = 0; dev[id].row_high = ne01; - // for multi GPU, get the row boundaries from tensor split - // and round to mul_mat_q tile sizes - if (split) { - const int64_t rounding = get_row_rounding(tensor_split); - - if (id != 0) { - dev[id].row_low = ne01*tensor_split[id]; - if (dev[id].row_low < ne01) { - dev[id].row_low -= dev[id].row_low % rounding; - } - } - - if (id != ggml_backend_cuda_get_device_count() - 1) { - dev[id].row_high = ne01*tensor_split[id + 1]; - if (dev[id].row_high < ne01) { - dev[id].row_high -= dev[id].row_high % rounding; - } - } - } } bool quantization_done = false; for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { + if (id != ctx.device || dev[id].row_low == dev[id].row_high) { continue; } @@ -1632,7 +1597,7 @@ static void ggml_cuda_op_mul_mat( cudaStream_t stream = ctx.stream(id, 0); if (src0_is_contiguous) { - dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data; + dev[id].src0_dd = (char *) src0->data; } else { // If src0 is not contiguous it will be copied to a temporary buffer, it may then be necessary to clear padding. const size_t nbytes_data = ggml_nbytes(src0); @@ -1676,20 +1641,13 @@ static void ggml_cuda_op_mul_mat( if (dst_on_device) { dev[id].dst_dd = (float *) dst->data; } else { - const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst); + const size_t size_dst_ddf = ggml_nelements(dst); dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf); } } - // if multiple devices are used they need to wait for the main device - // here an event is recorded that signals that the main device has finished calculating the input data - if (split && used_devices > 1) { - ggml_cuda_set_device(ctx.device); - CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream())); - } - - const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; - if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) { + const int64_t src1_col_stride = ne11; + if (quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) { //printf("invoking fast path for %s x %s\n", src0->name, src1->name); int id = ctx.device; char * src0_dd_i = dev[id].src0_dd; @@ -1704,11 +1662,11 @@ static void ggml_cuda_op_mul_mat( } for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { - const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0; + const int64_t is = 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? 
ne11 - src1_col_0 : src1_col_stride; for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { + if (id != ctx.device || dev[id].row_low == dev[id].row_high) { continue; } @@ -1719,11 +1677,6 @@ static void ggml_cuda_op_mul_mat( ggml_cuda_set_device(id); cudaStream_t stream = ctx.stream(id, is); - // wait for main GPU data if necessary - if (split && (id != ctx.device || is != 0)) { - CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0)); - } - for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { const int64_t i03 = i0 / ne12; const int64_t i02 = i0 % ne12; @@ -1796,45 +1749,12 @@ static void ggml_cuda_op_mul_mat( // copy dst to host or other device if necessary if (!dst_on_device) { void * dst_off_device = dst->data; - if (split) { - // src0 = weight matrix is saved as a transposed matrix for better memory layout. - // dst is NOT transposed. - // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. - // Instead they need to be copied to the correct slice in ne0 = dst row index. - // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. - float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); - GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); - dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; - CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync( - dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream)); - } else { - float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); - GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); - dhf_dst_i += src1_col_0*ne0; - CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream)); - } + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0; + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream)); } - // add event for the main device to wait on until other device is done - if (split && (id != ctx.device || is != 0)) { - CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream)); - } - } - } - } - - // main device waits for all other devices to be finished - if (split && ggml_backend_cuda_get_device_count() > 1) { - int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; - is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS; - - ggml_cuda_set_device(ctx.device); - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - if (dev[id].row_low == dev[id].row_high) { - continue; - } - for (int64_t is = 0; is < is_max; ++is) { - CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0)); } } } @@ -2167,7 +2087,6 @@ static int ggml_cuda_mul_mat_q(ggml_backend_cuda_context & ctx, const ggml_tenso static int ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cgraph * cgraph, int node_n) { - const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q. // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data. 
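        // Note: with split mode "graph" the per-device slices of a split weight live in ordinary
        // per-device CUDA buffers (see ggml_backend_cuda_split_buffer_init_tensor above), so the
        // matmul paths here no longer need the split/peer-access bookkeeping this patch removes;
        // the per-device partial results are instead combined at graph level through the ADD node
        // tagged with op_params[0] == 0xff, which the scheduler turns into its own split.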
@@ -2192,26 +2111,11 @@ static int ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor bool any_gpus_with_slow_fp16 = false; - if (split) { - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; - auto & tensor_split = buft_ctx->tensor_split; - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - // skip devices that are not going to do any work: - if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) { - continue; - } + const int cc = ggml_cuda_info().devices[ctx.device].cc; + use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); + any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); - const int cc = ggml_cuda_info().devices[id].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); - } - } else { - const int cc = ggml_cuda_info().devices[ctx.device].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); - } - - if (!split && (use_mul_mat_vec_q || use_mul_mat_q) && src1->ne[2]*src1->ne[3] == 1) { + if ((use_mul_mat_vec_q || use_mul_mat_q) && src1->ne[2]*src1->ne[3] == 1) { return ggml_cuda_mul_mat_q(ctx, src0, src1, dst, cgraph, node_n, use_mul_mat_vec_q); } @@ -2223,13 +2127,13 @@ static int ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // FP32 precision KQ single-batch for batch size 1 without FlashAttention ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst); - } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } else if (any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // FP32 precision KQV single-batch for batch size 1 without FlashAttention ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst); - } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) + } else if (src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch without FlashAttention ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); @@ -2364,7 +2268,6 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * ggml_backend_buffer_is_cuda(src0->buffer) && ggml_backend_buffer_is_cuda(src1->buffer) && ggml_backend_buffer_is_cuda(dst->buffer) && - !ggml_backend_buffer_is_cuda_split(src0->buffer) && src1->type == GGML_TYPE_F32) { int 
device_id = ctx.device; ggml_backend_cuda_buffer_context * src0_ctx = (ggml_backend_cuda_buffer_context *) src0->buffer->context; @@ -2405,8 +2308,7 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * if (next && next->op == GGML_OP_MUL_MAT_ID && next->src[0]->type == src0->type && src1 == next->src[1] && ggml_are_same_shape(src0, next->src[0]) && ggml_backend_buffer_is_cuda(next->src[0]->buffer) && - ggml_backend_buffer_is_cuda(next->buffer) && - !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer)) { + ggml_backend_buffer_is_cuda(next->buffer)) { ggml_backend_cuda_buffer_context * next_src0_ctx = (ggml_backend_cuda_buffer_context *) next->src[0]->buffer->context; ggml_backend_cuda_buffer_context * next_dst_ctx = (ggml_backend_cuda_buffer_context *) next->buffer->context; if (next_src0_ctx->device == device_id && @@ -2432,8 +2334,6 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers"); - cudaStream_t stream = ctx.stream(); const int64_t n_as = ne02; @@ -2572,8 +2472,6 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten ggml_backend_buffer_is_cuda(src0_2->buffer) && ggml_backend_buffer_is_cuda(src1->buffer) && ggml_backend_buffer_is_cuda(dst->buffer) && - !ggml_backend_buffer_is_cuda_split(src0_1->buffer) && - !ggml_backend_buffer_is_cuda_split(src0_2->buffer) && src1->type == GGML_TYPE_F32) { int device_id = ctx.device; ggml_backend_cuda_buffer_context * src0_1_ctx = (ggml_backend_cuda_buffer_context *) src0_1->buffer->context; @@ -2615,7 +2513,6 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten bool fuse_next = next && next->op == GGML_OP_MUL_MAT_ID && ggml_is_quantized(next->src[0]->type) && ggml_backend_buffer_is_cuda(next->src[0]->buffer) && - !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer) && ((ggml_backend_cuda_buffer_context *)next->src[0]->buffer->context)->device == device_id && ggml_backend_buffer_is_cuda(next->buffer) && ((ggml_backend_cuda_buffer_context *)next->buffer->context)->device == device_id; @@ -2673,9 +2570,6 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten } } - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_1->buffer) && "mul_mat_id does not support split buffers"); - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_2->buffer) && "mul_mat_id does not support split buffers"); - GGML_TENSOR_BINARY_OP_LOCALS cudaStream_t stream = ctx.stream(); @@ -2975,8 +2869,6 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor GGML_ASSERT(src1->ne[2] == 1); GGML_ASSERT(src1->ne[3] == 1); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_1->buffer)); - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_2->buffer)); auto stream = ctx.stream(); @@ -3052,10 +2944,6 @@ static inline bool ops_are_same_device(const ggml_cgraph * cgraph, int first, in } static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst, const ggml_cgraph * cgraph, int & i) { - // why is this here instead of mul_mat? 
- if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) { - ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); - } #if IK_PRINT_TIMING int64_t tim1 = ggml_time_us(); @@ -3065,7 +2953,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg auto fusion = ctx.fusion; - //printf("%4d %s(%s)\n", i, ggml_op_name(dst->op), dst->name); + //printf("%4d %s(%s) on device %d. time = %ld\n", i, ggml_op_name(dst->op), dst->name, ctx.device, ggml_time_us()); switch (dst->op) { case GGML_OP_ARGMAX: ggml_cuda_argmax(ctx, dst); @@ -3101,6 +2989,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg cgraph->nodes[i+2]->op == GGML_OP_FUSED_RMS_NORM && ggml_is_contiguous(dst->src[0]) && ggml_is_contiguous(dst->src[1]) && + dst->src[0]->type == GGML_TYPE_F32 && // with split mode "attn" we can end up having f16 ggml_are_same_shape(dst->src[0], dst->src[1]) && dst == cgraph->nodes[i+1]->src[0] && ggml_is_contiguous(cgraph->nodes[i+1]->src[1]) && @@ -3547,7 +3436,43 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ #ifdef GGML_CUDA_NO_PEER_COPY return false; #else - CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); + if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + // + // The goal here is to reduce traffic between GPUs, which is entirely non-negligible + // for prompt processing. + // We cast the tensor to be copied to f16, copy the f16 data peer-to-peer + // and then cast back to f32 on the destination side. + // The cost for converting to/from f16 is much lower than the cost of copying + // two times more data over PCI-E (well, at least the 30 GB/s PCI-E I have). + // But for some reason the following is not working. + // Can somebody tell me why? 
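        // (For scale, with an illustrative 4096 x 4096 f32 activation: 64 MiB per copy, i.e. roughly
        //  2.2 ms over a ~30 GB/s link as f32 vs. roughly 1.1 ms as f16, while the f32 <-> f16
        //  conversions run at device memory bandwidth and are comparatively cheap.)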
+ // + ggml_cuda_pool_alloc tmp_src(cuda_ctx_src->pool(), ggml_nelements(src)); + ggml_cuda_pool_alloc tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst)); + + auto src_f16 = *src; + src_f16.type = GGML_TYPE_F16; + for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2; + src_f16.data = tmp_src.get(); + + auto dst_f16 = *dst; + dst_f16.type = GGML_TYPE_F16; + for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2; + dst_f16.data = tmp_dst.get(); + + ggml_cuda_set_device(cuda_ctx_src->device); + ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true); + CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream())); + + CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream())); + + ggml_cuda_set_device(cuda_ctx_dst->device); + CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream())); + ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true); + + } else { + CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); + } #endif } @@ -3571,6 +3496,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_cuda_set_device(cuda_ctx->device); CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream())); GGML_UNUSED(backend); @@ -3600,7 +3526,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG - GGML_CUDA_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__); + GGML_CUDA_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer %s\n", __func__, node->src[0]->name); #endif } @@ -3790,6 +3716,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx // TODO const bool integrated = false; //ggml_cuda_info().devices[cuda_ctx->device].integrated; + //printf("======================== %s: graph with %d nodes on device %d. time = %ld\n", __func__, cgraph->n_nodes, cuda_ctx->device, ggml_time_us()); while (!graph_evaluated_or_captured) { // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. // With the use of CUDA graphs, the execution will be performed by the graph launch. 
@@ -3823,8 +3750,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { assert(node->src[j]->buffer); - //assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || - // ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); } } #else @@ -3953,277 +3878,6 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t return GGML_STATUS_SUCCESS; } -/* -GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - ggml_cuda_set_device(cuda_ctx->device); - -#ifdef USE_CUDA_GRAPH - static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); - - // Objects required for CUDA Graph - if (cuda_ctx->cuda_graph == nullptr) { - cuda_ctx->cuda_graph.reset(new ggml_cuda_graph()); - } - - bool use_cuda_graph = true; - bool cuda_graph_update_required = false; - // vector of pointers to CUDA cpy kernels, which are required to identify - // kernel parameters which need updated in the graph for each token - std::vector ggml_cuda_cpy_fn_ptrs; - - if (cuda_ctx->cuda_graph->graph == nullptr) { - if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { - cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__); -#endif - } - } - - // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly, - // or previous graph capture failure. - // Also disable for multi-gpu for now. 
TO DO investigate - if (disable_cuda_graphs_due_to_env - || cuda_ctx->cuda_graph->disable_due_to_gpu_arch - || cuda_ctx->cuda_graph->disable_due_to_too_many_updates - || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) { - use_cuda_graph = false; - } - - if (use_cuda_graph) { - if (cuda_ctx->cuda_graph->instance == nullptr) { - cuda_graph_update_required = true; - } - - // Check if the graph size has changed - if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { - cuda_graph_update_required = true; - cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes); - } - - // Loop over nodes in GGML graph to determine if CUDA graph update is required - // and store properties to allow this comparison for the next token - for (int i = 0; i < cgraph->n_nodes; i++) { - bool has_matching_properties = true; - if (!cuda_graph_update_required) { - has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); - } - if (!has_matching_properties) { - cuda_graph_update_required = true; - } - set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); - } - - // Loop over nodes in GGML graph to obtain info needed for CUDA graph - cuda_ctx->cuda_graph->updated_kernel_arg.clear(); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - - if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { - use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); -#endif - } - - if (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_MOE_FUSED_UP_GATE) { - use_cuda_graph = false; // This node type is not supported by CUDA graph capture -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); -#endif - } - - if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { - // disable CUDA graphs for batch size > 1 for now. - // Changes in batch size or context size can cause changes to the grid size of some kernels. - use_cuda_graph = false; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); -#endif - } - if (node->op == GGML_OP_MULTI_ADD && node->ne[1] > 1) { - // disable CUDA graphs for batch size > 1 for now. - // Changes in batch size or context size can cause changes to the grid size of some kernels. - use_cuda_graph = false; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); -#endif - } - - if (node->op == GGML_OP_CPY) { - // store the copy op parameter which changes with each token. - cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data)); - // store a pointer to each copy op CUDA kernel to identify it later - void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]); - if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) { - ggml_cuda_cpy_fn_ptrs.push_back(ptr); - } - } - - if (!use_cuda_graph) { - break; - } - } - - // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. 
- if (use_cuda_graph && cuda_graph_update_required) { - cuda_ctx->cuda_graph->number_consecutive_updates++; - } else { - cuda_ctx->cuda_graph->number_consecutive_updates = 0; - } - - if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { - cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); -#endif - } - } - - if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture - CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); - } - -#else - bool use_cuda_graph = false; - bool cuda_graph_update_required = false; -#endif // USE_CUDA_GRAPH - - bool graph_evaluated_or_captured = false; - - while (!graph_evaluated_or_captured) { - // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. - // With the use of CUDA graphs, the execution will be performed by the graph launch. - if (!use_cuda_graph || cuda_graph_update_required) { - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - ggml_tensor * next = i < cgraph->n_nodes-1 ? cgraph->nodes[i+1] : nullptr; - - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; - } - -#ifndef NDEBUG - assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - if (node->src[j] != nullptr) { - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); - } - } -#endif - - bool skip_next = false; - bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, next, skip_next); - if (!ok) { - GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); - } - GGML_ASSERT(ok); - if (skip_next) ++i; - } - } - -#ifdef USE_CUDA_GRAPH - if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture - if (cuda_ctx->cuda_graph->graph != nullptr) { - CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph)); - cuda_ctx->cuda_graph->graph = nullptr; - } - CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph)); - -#if 0 - if (disable_cuda_graphs_due_to_failed_capture) { - use_cuda_graph = false; - cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__); -#endif - } else { - graph_evaluated_or_captured = true; // CUDA graph has been captured - } -#endif - graph_evaluated_or_captured = true; // CUDA graph has been captured - } else { - graph_evaluated_or_captured = true; // ggml graph has been directly evaluated - } - } - - if (use_cuda_graph) { - if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph. 
- CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); - } - - // Perform update to graph (if required for this token), and change copy parameter (required for every token) - - if (cuda_graph_update_required) { - // Extract nodes from graph - // First call with null argument gets number of nodes in graph - CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes)); - // Subsequent call with non-null argument gets nodes - cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes); - cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes); - if (cuda_ctx->cuda_graph->num_nodes > 0) { - CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes)); - - // Loop over nodes, and extract kernel parameters from each node - for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { - cudaGraphNodeType node_type; - CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type)); - if (node_type == cudaGraphNodeTypeKernel) { - cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime - if (stat == cudaErrorInvalidDeviceFunction) { - // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node. - // We don't need to update blas nodes, so clear error and move on. - cudaGetLastError(); - } else { - GGML_ASSERT(stat == cudaSuccess); - } - } - } - } - } - - // One of the arguments to the copy kernel is updated for each token, hence we need to - // replace that argument with the updated value in the CUDA graph - if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured - int k = 0; - for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { - if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) { - char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++); - cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr; - CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i])); - } - } - } - - // Update graph executable - cudaGraphExecUpdateResultInfo result_info; - cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); - if (stat == cudaErrorGraphExecUpdateFailure) { -#ifndef NDEBUG - GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__); -#endif - // The pre-existing graph exec cannot be updated due to violated constraints - // so instead clear error and re-instantiate - cudaGetLastError(); - CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance)); - cuda_ctx->cuda_graph->instance = nullptr; - CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); - } else { - GGML_ASSERT(stat == cudaSuccess); - } - // Launch graph - CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream())); -#else - graph_evaluated_or_captured = true; -#endif // USE_CUDA_GRAPH - } - - return GGML_STATUS_SUCCESS; -} -*/ - GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; switch (op->op) { @@ -4487,6 +4141,7 @@ GGML_CALL static bool 
ggml_backend_cuda_supports_op(ggml_backend_t backend, cons } GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + //printf("%s(%s, %s): %d, %d\n", __func__, ggml_backend_name(backend), ggml_backend_buft_name(buft), ggml_backend_buft_is_cuda_split(buft), ggml_backend_buft_is_cuda(buft)); if (ggml_backend_buft_is_cuda_split(buft)) { return true; } diff --git a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh index 95a80d54..a21bafb5 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh +++ b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh @@ -18,17 +18,8 @@ struct ggml_cuda_type_traits { static constexpr int qi = 4; }; -// Reminder: -// constexpr int qk = ggml_cuda_type_traits::qk; -// constexpr int qi = ggml_cuda_type_traits::qi; -// constexpr int vdr = get_vdr_mmvq(type); - -// QI4_XS = 256/(4*2) = 32 -// vdr = 4, qi = 32 -> qi/vdr = 8, kqs = 4*(tid%8), blocks_per_iter = 4*1*32/32 = 4 -// vdr = 2, qi = 32 -> qi/vdr =16, kqs = 2*(tid%16), blocks_per_iter = 2*1*32/32 = 2 -namespace { template -__device__ void iqk_mul_mat_vec_q_kerne( +static __device__ void iqk_mul_mat_vec_q_kerne( const void * __restrict__ vx, const void * __restrict__ vy, const float * bias, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size) { @@ -110,7 +101,7 @@ __device__ void iqk_mul_mat_vec_q_kerne( } template -__device__ void iqk_fused_mul_mat_vec_q_kernel( +static __device__ void iqk_fused_mul_mat_vec_q_kernel( const void * __restrict__ vup, const void * __restrict__ vgate, const void * __restrict__ vy, float * __restrict__ dst, const float * __restrict__ bias_u, const float * __restrict__ bias_g, const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size, @@ -228,7 +219,7 @@ template -void iqk_mul_mat_vec_q_cuda(const mmvq_args & args, cudaStream_t stream) { +static void iqk_mul_mat_vec_q_cuda(const mmvq_args & args, cudaStream_t stream) { GGML_ASSERT(args.ncols_x % ggml_blck_size(type) == 0); //GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); @@ -428,7 +419,7 @@ void iqk_mul_mat_vec_q_cuda(const mmvq_args & args, cudaStream_t stream) { } } -__device__ __forceinline__ void get_int_from_table_16_shift(const uint32_t & q4, uint16_t shift, const uint8_t * all_values, +static __device__ __forceinline__ void get_int_from_table_16_shift(const uint32_t & q4, uint16_t shift, const uint8_t * all_values, int & val1, int & val2) { uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; @@ -476,7 +467,7 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con #endif } -__device__ __forceinline__ int int_from_table(const uint8_t * a8, const uint8_t * values) { +static __device__ __forceinline__ int int_from_table(const uint8_t * a8, const uint8_t * values) { uint16_t v1 = values[a8[0]] | (values[a8[1]] << 8); uint16_t v2 = values[a8[2]] | (values[a8[3]] << 8); return v1 | (v2 << 16); @@ -506,8 +497,6 @@ __device__ __forceinline__ int int_from_table(const uint8_t * a8, const uint8_t #define VDR_IQ3_K_Q8_1_MMVQ 4 #define VDR_IQ3_K_Q8_1_MMQ 4 -} // namespace - extern void mul_mat_vec_iq2_k_q8_1_cuda(const mmvq_args & args, cudaStream_t stream); extern void mul_mat_vec_iq3_k_q8_1_cuda(const mmvq_args & args, cudaStream_t stream); extern void mul_mat_vec_iq4_k_q8_1_cuda(const mmvq_args & args, cudaStream_t stream); diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 
98d33ebc..c4619e60 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -176,15 +176,15 @@ static __global__ void rms_norm_f32_nc( } } -template -static __global__ void fused_rms_norm_f32(const float * x, const float * y, float * dst, const int ncols, const float eps) { +template +static __global__ void fused_rms_norm_f32(const src_t * x, const float * y, float * dst, const int ncols, const float eps) { const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row*ncols + col]; + const float xi = (float)x[row*ncols + col]; tmp += xi * xi; } @@ -206,13 +206,13 @@ static __global__ void fused_rms_norm_f32(const float * x, const float * y, floa const float scale = rsqrtf(mean + eps); for (int col = tid; col < ncols; col += block_size) { - dst[row*ncols + col] = scale * y[col] * x[row*ncols + col]; + dst[row*ncols + col] = scale * y[col] * (float)x[row*ncols + col]; } } -template +template static __global__ void fused_rms_norm_f32_nc( - const float * x, const float * y, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, + const src_t * x, const float * y, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps) { const int nrows = gridDim.x; const int nchannels = gridDim.y; @@ -229,7 +229,7 @@ static __global__ void fused_rms_norm_f32_nc( float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[col]; + const float xi = (float)x[col]; tmp += xi * xi; } @@ -257,7 +257,7 @@ static __global__ void fused_rms_norm_f32_nc( const float scale = rsqrtf(mean + eps); for (int col = tid; col < ncols; col += block_size) { - dst[col] = scale * y[col] * x[col]; + dst[col] = scale * y[col] * (float)x[col]; } } @@ -307,7 +307,8 @@ static void rms_norm_f32_nc_cuda( } } -static void fused_rms_norm_f32_cuda(const float * x, const float * y, float * dst, +template +static void fused_rms_norm_f32_cuda(const src_t * x, const float * y, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { constexpr int kBlockSize = 256; GGML_ASSERT(ncols % WARP_SIZE == 0); @@ -331,8 +332,9 @@ static void fused_rms_norm_f32_cuda(const float * x, const float * y, float * ds } } +template static void fused_rms_norm_f32_nc_cuda( - const float * x, const float * y, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, + const src_t * x, const float * y, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) { const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { @@ -432,7 +434,7 @@ void ggml_cuda_op_fused_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT(src0->ne[0] == src1->ne[0]); @@ -445,14 +447,22 @@ void ggml_cuda_op_fused_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * if (ggml_is_contiguous(src0)) { const int64_t nrows = ggml_nrows(src0); - 
fused_rms_norm_f32_cuda(src0_d, src1_d, dst_d, ne00, nrows, eps, stream); + if (src0->type == GGML_TYPE_F32) { + fused_rms_norm_f32_cuda(src0_d, src1_d, dst_d, ne00, nrows, eps, stream); + } else { + fused_rms_norm_f32_cuda((const half *)src0_d, src1_d, dst_d, ne00, nrows, eps, stream); + } } else { auto ts0 = ggml_type_size(src0->type); GGML_ASSERT(src0->nb[0] == ts0); auto s01 = src0->nb[1] / ts0; auto s02 = src0->nb[2] / ts0; auto s03 = src0->nb[3] / ts0; - fused_rms_norm_f32_nc_cuda(src0_d, src1_d, dst_d, ne00, src0->ne[1], src0->ne[2], src0->ne[3], s01, s02, s03, eps, stream); + if (src0->type == GGML_TYPE_F32) { + fused_rms_norm_f32_nc_cuda(src0_d, src1_d, dst_d, ne00, src0->ne[1], src0->ne[2], src0->ne[3], s01, s02, s03, eps, stream); + } else { + fused_rms_norm_f32_nc_cuda((const half *)src0_d, src1_d, dst_d, ne00, src0->ne[1], src0->ne[2], src0->ne[3], s01, s02, s03, eps, stream); + } } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index cee25268..cb9c7562 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7284,7 +7284,19 @@ static struct ggml_tensor * ggml_fused_rms_norm_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result; + if (inplace) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + result = ggml_view_tensor(ctx, a); + } else { + if (a->type == GGML_TYPE_F32) { + result = ggml_dup_tensor(ctx, a); + } else { + result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], a->ne[1], a->ne[2], a->ne[3]); + } + } + + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &eps, sizeof(eps)); diff --git a/include/llama.h b/include/llama.h index ea63574f..3c9b331c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -275,7 +275,8 @@ extern "C" { enum llama_split_mode { LLAMA_SPLIT_MODE_NONE = 0, // single GPU LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_ATTN = 2, // splits self-attention computations across GPUs + LLAMA_SPLIT_MODE_GRAPH = 3, // splits computations across GPUs }; diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 9588d1b9..123fc185 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -621,7 +621,8 @@ ggml_tensor * llm_build_context::llm_build_norm( ggml_tensor * llm_build_context::llm_build_ffn( ggml_context * ctx, llama_context & lctx, - ggml_tensor * cur, + ggml_tensor * ffn_norm, + ggml_tensor * input, ggml_tensor * up, ggml_tensor * up_b, ggml_tensor * up_s, @@ -634,14 +635,83 @@ ggml_tensor * llm_build_context::llm_build_ffn( ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il) { + const llm_build_cb & cb, int il, ggml_cgraph * graph) { + + if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s && + up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR && + (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) { + auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : + type_op == LLM_FFN_RELU ? 
GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU; + auto u = (ggml_split_tensor_t *)up->extra; + auto g = (ggml_split_tensor_t *)gate->extra; + auto d = (ggml_split_tensor_t *)down->extra; + GGML_ASSERT(u->n_device == g->n_device && u->n_device == d->n_device); + std::vector ffn; + ffn.reserve(u->n_device); + for (int id = 0; id < u->n_device; ++id) { + int il_cb = 1000*(id+1) + il; + auto split_u = u->splits[id]; + auto split_g = g->splits[id]; + auto split_d = d->splits[id]; + GGML_ASSERT((!split_u && !split_g && split_d) || (split_u && split_g && split_d)); + if (!split_u) continue; + auto cur = input; + if (ffn_norm && ffn_norm->extra) { + auto norm = (ggml_split_tensor_t *)ffn_norm->extra; + cur = llm_build_norm(ctx, input, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il); + cb(cur, "ffn_inp_normed", il_cb); + } + else if (input->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx, input, GGML_TYPE_F32); + } + cur = ggml_fused_up_gate(ctx, split_u, split_g, cur, unary_op); + cb(cur, "ffn_up_gate", il_cb); + cur = llm_build_lora_mm(lctx, ctx, split_d, cur); + cb(cur, "ffn_down", il_cb); + if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) { + // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } + if (cur->ne[1] >= 32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F16); + } + if (graph) { + ggml_build_forward_expand(graph, cur); + } + ffn.push_back(cur); + } + if (ffn.size() == 1) return ffn.front(); + auto cur = ggml_add(ctx, ffn[0], ffn[1]); + cb(cur, "combine_ffn", il); + cur->op_params[0] = 0xff; + for (int id = 2; id < int(ffn.size()); ++id) { + cur = ggml_add(ctx, cur, ffn[id]); + cb(cur, "combine_ffn", il); + } + if (ffn.size() > 2) { + cur->op_params[0] = 0xff; + } + //if (cur->type != GGML_TYPE_F32) { + // cur = ggml_cast(ctx, cur, GGML_TYPE_F32); + //} + + return cur; + } + + if (ffn_norm) { + input = llm_build_norm(ctx, input, lctx.model.hparams, ffn_norm, NULL, LLM_NORM_RMS, cb, il); + cb(input, "ffn_norm", il); + } + else if (input->type != GGML_TYPE_F32) { + input = ggml_cast(ctx, input, GGML_TYPE_F32); + } if (lctx.cparams.fused_up_gate && up && gate && !up_b && !up_s && !gate_b && !gate_s && type_gate == LLM_FFN_PAR && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) { auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU; - cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op); + auto cur = ggml_fused_up_gate(ctx, up, gate, input, unary_op); cb(cur, "ffn_up_gate", il); if (down) { cur = llm_build_lora_mm(lctx, ctx, down, cur); @@ -663,7 +733,7 @@ ggml_tensor * llm_build_context::llm_build_ffn( return cur; } - struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? 
llm_build_lora_mm(lctx, ctx, up, input) : input; cb(tmp, "ffn_up", il); if (up_b) { @@ -676,6 +746,7 @@ ggml_tensor * llm_build_context::llm_build_ffn( cb(tmp, "ffn_up_s", il); } + auto cur = input; if (gate) { switch (type_gate) { case LLM_FFN_SEQ: @@ -983,6 +1054,183 @@ llm_expert_gating_func_type gating_op, } +ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx, + ggml_tensor * ffn_norm, + ggml_tensor * input, + ggml_tensor * gate_inp, ggml_tensor * gate_inp_b, + ggml_tensor * up_exps, ggml_tensor * up_exps_b, + ggml_tensor * gate_exps, ggml_tensor * gate_exps_b, + ggml_tensor * down_exps, ggml_tensor * down_exps_b, + ggml_tensor * exp_probs_b, + ggml_tensor * up_shexp, ggml_tensor * up_b_shexp, + ggml_tensor * gate_shexp, ggml_tensor * gate_b_shexp, + ggml_tensor * down_shexp, ggml_tensor * down_b_shexp, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, +llm_expert_gating_func_type gating_op, + llm_ffn_op_type type_op_shexp, + const llm_build_cb & cb, int il, ggml_cgraph * graph) { + + auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra; + auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra; + auto split_down_exps = (ggml_split_tensor_t *)down_exps->extra; + auto split_up_shexp = up_shexp ? (ggml_split_tensor_t *)up_shexp->extra : nullptr; + auto split_gate_shexp = gate_shexp ? (ggml_split_tensor_t *)gate_shexp->extra : nullptr; + auto split_down_shexp = down_shexp ? (ggml_split_tensor_t *)down_shexp->extra : nullptr; + auto split_up_b_shexp = up_b_shexp ? (ggml_split_tensor_t *)up_b_shexp : nullptr; + auto split_gate_b_shexp = gate_b_shexp ? (ggml_split_tensor_t *)gate_b_shexp : nullptr; + auto split_down_b_shexp = down_b_shexp ? (ggml_split_tensor_t *)down_b_shexp : nullptr; + if (!split_up_exps && !split_gate_exps && !split_down_exps) { + auto cur = input; + if (ffn_norm) { + auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm; + cur = llm_build_norm(ctx, input, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il); + cb(cur, "ffn_inp_normed", il); + } + else if (cur->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F32); + } + auto the_gate_inp = gate_inp->extra ? ((ggml_split_tensor_t *)gate_inp->extra)->splits[lctx.model.main_gpu] : gate_inp; + auto the_gate_inp_b = gate_inp_b ? gate_inp_b->extra ? ((ggml_split_tensor_t *)gate_inp_b->extra)->splits[lctx.model.main_gpu] : gate_inp_b : nullptr; + auto the_exp_probs_b = exp_probs_b ? exp_probs_b->extra ? ((ggml_split_tensor_t *)exp_probs_b->extra)->splits[lctx.model.main_gpu] : exp_probs_b : nullptr; + //int n_before = graph->n_nodes; + auto routed_out = llm_build_moe_ffn(ctx, lctx, cur, + the_gate_inp, the_gate_inp_b, + up_exps, up_exps_b, + gate_exps, gate_exps_b, + down_exps, down_exps_b, + the_exp_probs_b, + n_expert, n_expert_used, + type_op, norm_w, scale_w, w_scale, + gating_op, cb, il, graph); + cb(routed_out, "routed_out", il); + ggml_build_forward_expand(graph, routed_out); + //printf("Using non-split llm_build_moe_ffn for layer %d. 
n_before = %d, n_now = %d\n", il, n_before, graph->n_nodes); + + if (up_shexp && gate_shexp && down_shexp) { + if (split_up_shexp) { + //printf("Using split ffn for shared experts in layer %d\n", il); + std::vector results(split_up_shexp->n_device); + GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_shexp->n_device); + GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_shexp->n_device); + GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_shexp->n_device); + for (int id = 0; id < split_up_shexp->n_device; ++id) { + int il_cb = 1000*id + il; + auto the_ffn_norm = ffn_norm ? ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[id] : ffn_norm : nullptr; + auto shared_out = llm_build_ffn(ctx, lctx, the_ffn_norm, input, + split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr, + split_gate_shexp->splits[id], split_gate_b_shexp ? split_gate_b_shexp->splits[id] : nullptr, nullptr, + split_down_shexp->splits[id], split_down_b_shexp ? split_down_b_shexp->splits[id] : nullptr, nullptr, + nullptr, type_op_shexp, LLM_FFN_PAR, cb, il); + cb(shared_out, "ffn_shexp_out", il_cb); + if (shared_out->ne[1] > 32) { + shared_out = ggml_cast(ctx, shared_out, GGML_TYPE_F16); + } + results[id] = shared_out; + } + cur = ggml_add(ctx, results[0], results[1]); + cur->op_params[0] = 0xff; + cb(cur, "ffn_shared_combined", il); + for (int id = 2; id < int(results.size()); ++id) { + cur = ggml_add(ctx, cur, results[id]); + cb(cur, "ffn_shared_combined", il); + } + if (routed_out->ne[1] > 32) { + auto routed_out_f16 = ggml_cast(ctx, routed_out, GGML_TYPE_F16); + cur = ggml_add(ctx, routed_out_f16, cur); + } else { + cur = ggml_add(ctx, routed_out, cur); + } + cb(cur, "ffn_out", il); + } else { + //printf("Using non-split ffn for shared experts in layer %d\n", il); + auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur, + up_shexp, up_b_shexp, nullptr, + gate_shexp, gate_b_shexp, nullptr, + down_shexp, down_b_shexp, nullptr, + nullptr, type_op_shexp, LLM_FFN_PAR, cb, il); + cb(shared_out, "ffn_shexp_out", il); + cur = ggml_add(ctx, routed_out, shared_out); + cb(cur, "ffn_out", il); + } + } else { + cur = routed_out; + } + if (cur != routed_out) { + ggml_build_forward_expand(graph, cur); + } + return cur; + } + GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps); + GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device); + std::vector results(split_up_exps->n_device); + GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) || + ( split_up_shexp && split_gate_shexp && split_down_shexp)); + auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra; + GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device); + auto split_exp_probs_b = exp_probs_b ? 
(ggml_split_tensor_t *)exp_probs_b->extra : nullptr; + GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device); + for (int id = 0; id < split_up_exps->n_device; ++id) { + int il_cb = 1000*(id + 1) + il; + auto cur = input; + if (ffn_norm) { + auto split_ffn_norm = (ggml_split_tensor_t *)ffn_norm->extra; + GGML_ASSERT(split_ffn_norm && split_ffn_norm->n_device == split_up_exps->n_device); + cur = llm_build_norm(ctx, input, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il); + cb(cur, "ffn_inp_normed", il_cb); + } + else if (cur->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F32); + } + auto routed_out = llm_build_moe_ffn(ctx, lctx, cur, + split_gate_inp->splits[id], gate_inp_b, + split_up_exps->splits[id], up_exps_b, + split_gate_exps->splits[id], gate_exps_b, + split_down_exps->splits[id], down_exps_b, + split_exp_probs_b ? split_exp_probs_b->splits[id] : nullptr, + n_expert, n_expert_used, + type_op, norm_w, scale_w, w_scale, + gating_op, cb, il, graph); + cb(routed_out, "routed_out", il_cb); + + if (split_up_shexp) { + GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_exps->n_device); + GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_exps->n_device); + GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_exps->n_device); + auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur, + split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr, + split_gate_shexp->splits[id], split_gate_b_shexp ? split_gate_b_shexp->splits[id] : nullptr, nullptr, + split_down_shexp->splits[id], split_down_b_shexp ? split_down_b_shexp->splits[id] : nullptr, nullptr, + nullptr, type_op_shexp, LLM_FFN_PAR, cb, il); + cb(shared_out, "ffn_shexp_out", il_cb); + + cur = ggml_add(ctx, routed_out, shared_out); + cb(cur, "ffn_out", il_cb); + } else { + cur = routed_out; + } + if (cur->ne[1] >= 32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F16); + cb(cur, "ffn_out_f16", il_cb); + } + results[id] = cur; + } + if (results.size() == 1) return results.front(); + + auto cur = ggml_add(ctx, results[0], results[1]); + cur->op_params[0] = 0xff; + cb(cur, "ffn_combined", il); + for (int id = 2; id < int(results.size()); ++id) { + cur = ggml_add(ctx, cur, results[id]); + cb(cur, "ffn_combined", il); + } + return cur; +} + static ggml_tensor * llm_build_kqv( struct ggml_context * ctx, struct llama_context & lctx, @@ -1243,7 +1491,7 @@ std::tuple llm_build_context::llm_buil ggml_tensor * wq, ggml_tensor * bq, ggml_tensor * wk, ggml_tensor * bk, ggml_tensor * wv, ggml_tensor * bv, - float attention_scale, int il) { + float attention_scale, int il) const { auto Qcur = llm_build_lora_mm(lctx, ctx0, wq, cur); cb(Qcur, "Qcur", il); auto Kcur = llm_build_lora_mm(lctx, ctx0, wk, cur); @@ -1282,7 +1530,7 @@ std::tuple llm_build_context::llm_buil ggml_tensor * wq, ggml_tensor * bq, ggml_tensor * wk, ggml_tensor * bk, ggml_tensor * wv, ggml_tensor * bv, - ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) { + ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) const { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); if (wqkv) { @@ -1299,12 +1547,12 @@ std::tuple llm_build_context::llm_buil cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); if (q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il); + Qcur = 
llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il); cb(Qcur, "Qcur_normed", il); ggml_build_forward_expand(gf, Qcur); } if (k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il); + Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il); cb(Kcur, "Kcur_normed", il); ggml_build_forward_expand(gf, Kcur); } @@ -1336,12 +1584,12 @@ std::tuple llm_build_context::llm_buil cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); if (q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il); + Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il); cb(Qcur, "Qcur_normed", il); ggml_build_forward_expand(gf, Qcur); } if (k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il); + Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il); cb(Kcur, "Kcur_normed", il); ggml_build_forward_expand(gf, Kcur); } @@ -1351,21 +1599,79 @@ std::tuple llm_build_context::llm_buil } auto [Q, K, V] = llm_build_mul_mat_qkv(gf, cur, wq, bq, wk, bk, wv, bv, attention_scale, il); - auto Qcur = ggml_reshape_3d(ctx0, Q, n_embd_head, n_head, n_tokens); + auto Qcur = ggml_reshape_3d(ctx0, Q, n_embd_head, Q->ne[0]/n_embd_head, n_tokens); if (q_norm) { Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il); cb(Qcur, "Qcur_normed", il); } - auto Kcur = ggml_reshape_3d(ctx0, K, n_embd_head, n_head_kv, n_tokens); + auto Kcur = ggml_reshape_3d(ctx0, K, n_embd_head, K->ne[0]/n_embd_head, n_tokens); if (k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il); + Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il); cb(Kcur, "Kcur_normed", il); } auto Vcur = V; return {Qcur, Kcur, Vcur}; } +static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml_tensor * cur, ggml_tensor * output, const llm_build_cb & cb) { + // lm_head + if (output->extra) { + auto split_output = (ggml_split_tensor_t *)output->extra; + std::vector o; + o.reserve(split_output->n_device); + for (int id = 0; id < split_output->n_device; ++id) { + auto split = split_output->splits[id]; + if (!split) continue; + o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur)); + cb(o.back(), "output", id); + } + if (o.size() == 1) cur = o.front(); + cur = ggml_concat(ctx, o[0], o[1], 0); + for (int id = 2; id < int(o.size()); ++id) { + cur = ggml_concat(ctx, cur, o[id], 0); + } + } else { + cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur); + } + return cur; +} + +static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml_tensor * cur, ggml_tensor * output, ggml_tensor * output_norm, const llm_build_cb & cb) { + // lm_head + if (output->extra) { + auto split_output = (ggml_split_tensor_t *)output->extra; + auto split_output_norm = output_norm && output_norm->extra ? (ggml_split_tensor_t *)output_norm->extra : nullptr; + std::vector o; + o.reserve(split_output->n_device); + for (int id = 0; id < split_output->n_device; ++id) { + auto split = split_output->splits[id]; + if (!split) continue; + if (output_norm) { + auto the_norm = split_output_norm ? 
split_output_norm->splits[id] : output_norm; + auto cur_normed = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, the_norm, NULL, LLM_NORM_RMS, cb, -1); + cb(cur_normed, "output_normed", 1000*(id+1)); + o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur_normed)); + } else { + o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur)); + } + cb(o.back(), "output", id); + } + if (o.size() == 1) cur = o.front(); + cur = ggml_concat(ctx, o[0], o[1], 0); + for (int id = 2; id < int(o.size()); ++id) { + cur = ggml_concat(ctx, cur, o[id], 0); + } + } else { + if (output_norm) { + cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1); + cb(cur, "output_normed", -1); + } + cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur); + } + return cur; +} + ggml_cgraph * llm_build_context::build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -1405,15 +1711,22 @@ ggml_cgraph * llm_build_context::build_llama() { bool use_rope = model.arch == LLM_ARCH_LLAMA4 ? (il + 1) % hparams.n_no_rope_layer_step != 0 : true; auto this_KQ_mask = hparams.n_swa > 0 && hparams.n_swa_pattern > 0 && il % hparams.n_swa_pattern < (hparams.n_swa_pattern - 1) ? KQ_mask_swa : KQ_mask; + int this_n_swa = this_KQ_mask == KQ_mask_swa ? hparams.n_swa : 0; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // rope freq factors for llama3; may return nullptr for llama2 and other models + //auto rope_factors = build_rope_factors(il); // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + if (use_rope) { + cur = build_std_attention(gf, inpL, inp_pos, nullptr, this_KQ_mask, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il); + } + else { + + auto rope_factors = build_rope_factors(il); + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, model.layers[il].wqkv, model.layers[il].bqkv, @@ -1450,7 +1763,7 @@ ggml_cgraph * llm_build_context::build_llama() { cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, this_KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il, nullptr, - this_KQ_mask == KQ_mask_swa ? 
hparams.n_swa : 0); + this_n_swa); } if (il == n_layer - 1) { @@ -1475,15 +1788,12 @@ ggml_cgraph * llm_build_context::build_llama() { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { // non-MoE - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf); cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_LLAMA4) { // llama4 MoE @@ -1503,7 +1813,7 @@ ggml_cgraph * llm_build_context::build_llama() { cb, il, gf); // Shared experts - ggml_tensor * shexp_out = llm_build_ffn(ctx0, lctx, ffn_inp_normed, + ggml_tensor * shexp_out = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp_normed, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -1551,11 +1861,8 @@ ggml_cgraph * llm_build_context::build_llama() { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb); // For Granite architecture if (hparams.f_logit_scale) { @@ -1664,10 +1971,8 @@ ggml_cgraph * llm_build_context::build_deci() { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1778,10 +2083,7 @@ ggml_cgraph * llm_build_context::build_baichuan() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -1873,10 +2175,7 @@ ggml_cgraph * llm_build_context::build_xverse() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -1986,7 +2285,7 @@ ggml_cgraph * llm_build_context::build_falcon() { // feed forward { - cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result + cur = llm_build_ffn(ctx0, lctx, nullptr, attn_norm, // !! 
use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2105,7 +2404,7 @@ ggml_cgraph * llm_build_context::build_grok() { cb(moe_out, "ffn_moe_out", il); if (model.layers[il].ffn_up) { - ggml_tensor* ffn_out = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor* ffn_out = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2344,10 +2643,7 @@ ggml_cgraph * llm_build_context::build_starcoder() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2424,10 +2720,7 @@ ggml_cgraph * llm_build_context::build_refact() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2604,21 +2897,21 @@ ggml_cgraph * llm_build_context::build_bert() { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2704,10 +2997,7 @@ ggml_cgraph * llm_build_context::build_bloom() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2828,9 +3118,7 @@ ggml_cgraph * llm_build_context::build_mpt() { // feed forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2946,7 +3234,7 @@ ggml_cgraph * llm_build_context::build_stablelm() { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, 
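+                    // nullptr ffn_norm: stablelm applies its own norm above (or skips it for the parallel-residual case),
+                    // so the fused RMS-norm path inside llm_build_ffn is intentionally not used here.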
model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3049,10 +3337,7 @@ ggml_cgraph * llm_build_context::build_qwen() { // feed-forward forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3144,10 +3429,7 @@ ggml_cgraph * llm_build_context::build_qwen2() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3247,10 +3529,7 @@ ggml_cgraph * llm_build_context::build_qwen2vl() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3372,7 +3651,7 @@ ggml_cgraph * llm_build_context::build_qwen2moe() { ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -3478,10 +3757,7 @@ ggml_cgraph * llm_build_context::build_qwen3() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3514,9 +3790,6 @@ ggml_cgraph * llm_build_context::build_qwen3() { ggml_cgraph * llm_build_context::build_qwen3moe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3532,46 +3805,18 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - auto rope_cache = cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? 
- ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + //cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); + //cb(cur, "attn_norm", il); - // self_attention - { - auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, - model.layers[il].wqkv, nullptr, - model.layers[il].wqk, nullptr, - model.layers[il].wq, nullptr, model.layers[il].wk, nullptr, model.layers[il].wv, nullptr, - model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0, il); - - if (rope_cache) { - Qcur = ggml_rope_fast(ctx0, Qcur, rope_cache); - Kcur = ggml_rope_fast(ctx0, Kcur, rope_cache); - } else { - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - } - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } + cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il); if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -3579,23 +3824,19 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = - llm_build_moe_ffn(ctx0, lctx, cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLM_EXPERT_GATING_FUNC_SOFTMAX, - cb, il, gf); - cb(cur, "ffn_moe_out", il); + cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, + model.layers[il].ffn_gate_inp, nullptr, + model.layers[il].ffn_up_exps, nullptr, + model.layers[il].ffn_gate_exps, nullptr, + model.layers[il].ffn_down_exps, nullptr, + model.layers[il].ffn_exp_probs_b, + nullptr, nullptr, // we don't have shared expert biases? 
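+                        // Qwen3-MoE defines no shared experts, so the shexp tensors and biases are all nullptr here
+                        // and llm_build_std_moe_ffn only splits the routed experts across devices.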
+ nullptr, nullptr, + nullptr, nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, false, 0.0f, + LLM_EXPERT_GATING_FUNC_SOFTMAX, + LLM_FFN_SILU, cb, il, gf); cur = ggml_add(ctx0, cur, ffn_inp); cur = lctx.cvec.apply_to(ctx0, cur, il); @@ -3607,11 +3848,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3712,12 +3949,7 @@ ggml_cgraph * llm_build_context::build_qwen3vl() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3986,7 +4218,7 @@ ggml_cgraph * llm_build_context::build_phi2() { // FF { - ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output, + ffn_output = llm_build_ffn(ctx0, lctx, nullptr, attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4099,14 +4331,11 @@ ggml_cgraph * llm_build_context::build_phi3() { cur = ggml_add(ctx0, cur, residual); residual = cur; - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // FF // special-case: the up and gate tensors are merged into a single tensor // TOOD: support into llm_build_ffn { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4195,7 +4424,7 @@ ggml_cgraph * llm_build_context::build_plamo() { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4292,10 +4521,7 @@ ggml_cgraph * llm_build_context::build_gpt2() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4394,10 +4620,7 @@ ggml_cgraph * llm_build_context::build_codeshell() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4485,10 +4708,7 @@ ggml_cgraph * llm_build_context::build_orion() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, 
LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4578,10 +4798,7 @@ ggml_cgraph * llm_build_context::build_internlm2() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4691,10 +4908,7 @@ ggml_cgraph * llm_build_context::build_minicpm() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4793,12 +5007,9 @@ ggml_cgraph * llm_build_context::build_gemma() { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, sa_out, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4903,12 +5114,9 @@ ggml_cgraph * llm_build_context::build_gemma2() { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, sa_out, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5034,11 +5242,8 @@ ggml_cgraph * llm_build_context::build_gemma3() { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // feed-forward network - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, sa_out, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5132,11 +5337,7 @@ ggml_cgraph * llm_build_context::build_starcoder2() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5392,7 +5593,7 @@ ggml_cgraph * llm_build_context::build_command_r() { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp, 
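+                // nullptr ffn_norm: ffn_inp is the already-normalized input of command-r's parallel attention/FFN block,
+                // so no norm is fused into the FFN here.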
model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5523,7 +5724,7 @@ ggml_cgraph * llm_build_context::build_olmo() { cur = llm_build_norm(ctx0, ffn_inp, hparams, NULL, NULL, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5637,10 +5838,7 @@ ggml_cgraph * llm_build_context::build_openelm() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5744,7 +5942,7 @@ ggml_cgraph * llm_build_context::build_gptneox() { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5772,7 +5970,7 @@ ggml_cgraph * llm_build_context::build_gptneox() { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5865,10 +6063,7 @@ ggml_cgraph * llm_build_context::build_arctic() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6398,7 +6593,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() { cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6423,7 +6618,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6482,22 +6677,26 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { // output token IDs (for last layer cropping) struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - auto rope_cache = cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? + auto rope_cache = model.split_mode != LLAMA_SPLIT_MODE_GRAPH && cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? 
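+        // With split mode "graph" the shared rope cache is skipped; attention then goes through build_std_attention,
+        // which presumably applies RoPE per device instead of reading a single precomputed cache.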
ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; + float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; for (int il = 0; il < n_transformer_layers; ++il) { struct ggml_tensor * inpSA = inpL; - // Pre-attention norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - // self-attention - { + if (rope_cache == nullptr) { + cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, kq_scale, 0.0f, 0, il); + } else { + // Pre-attention norm + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, model.layers[il].wqkv, model.layers[il].bqkv, model.layers[il].wqk, model.layers[il].bqk, @@ -6539,46 +6738,57 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // Post-attention norm - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "post_attn_norm", il); - if ((uint32_t) il < hparams.n_layer_dense_lead) { // dense FFN - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf); cb(cur, "ffn_out", il); } else { - // MoE FFN - struct ggml_tensor * routed_out = llm_build_moe_ffn(ctx0, lctx, cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, + cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, + model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b, + model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b, + model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b, + model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b, model.layers[il].ffn_exp_probs_b, + model.layers[il].ffn_up_shexp, nullptr, // we don't have shared expert biases? 
+ model.layers[il].ffn_gate_shexp, nullptr, + model.layers[il].ffn_down_shexp, nullptr, n_expert, n_expert_used, - LLM_FFN_SILU, hparams.expert_weights_norm, - true, hparams.expert_weights_scale, - (enum llm_expert_gating_func_type) hparams.expert_gating_func, - cb, il, gf); - cb(routed_out, "routed_out", il); + LLM_FFN_SILU, hparams.expert_weights_norm, true, hparams.expert_weights_scale, + (llm_expert_gating_func_type) hparams.expert_gating_func, + LLM_FFN_SILU, cb, il, gf); - { - struct ggml_tensor * shared_out = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(shared_out, "ffn_shexp_out", il); + //// Post-attention norm + //cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); + //cb(cur, "post_attn_norm", il); + //// MoE FFN + //auto routed_out = llm_build_moe_ffn(ctx0, lctx, cur, + // model.layers[il].ffn_gate_inp, + // model.layers[il].ffn_up_exps, + // model.layers[il].ffn_gate_exps, + // model.layers[il].ffn_down_exps, + // model.layers[il].ffn_exp_probs_b, + // n_expert, n_expert_used, + // LLM_FFN_SILU, hparams.expert_weights_norm, + // true, hparams.expert_weights_scale, + // (enum llm_expert_gating_func_type) hparams.expert_gating_func, + // cb, il, gf); + //cb(routed_out, "routed_out", il); - cur = ggml_add(ctx0, routed_out, shared_out); - cb(cur, "ffn_out", il); - } + //auto shared_out = llm_build_ffn(ctx0, lctx, nullptr, cur, + // model.layers[il].ffn_up_shexp, NULL, NULL, + // model.layers[il].ffn_gate_shexp, NULL, NULL, + // model.layers[il].ffn_down_shexp, NULL, NULL, + // NULL, + // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + //cb(shared_out, "ffn_shexp_out", il); + + //cur = ggml_add(ctx0, routed_out, shared_out); + //cb(cur, "ffn_out", il); } // residual and context vector @@ -6592,12 +6802,8 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { cur = inpL; - // final norm - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - // lm head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6828,10 +7034,7 @@ ggml_cgraph * llm_build_context::build_bitnet_158() { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -6945,7 +7148,7 @@ ggml_cgraph * llm_build_context::build_cohere2() { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + cur = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -7063,11 +7266,8 @@ ggml_cgraph * llm_build_context::build_t5_encoder() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm_enc, NULL, 
LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm_enc, ffn_inp, model.layers[il].ffn_up_enc, NULL, NULL, model.layers[il].ffn_gate_enc, NULL, NULL, model.layers[il].ffn_down_enc, NULL, NULL, @@ -7251,11 +7451,8 @@ ggml_cgraph * llm_build_context::build_t5_decoder() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7349,10 +7546,7 @@ ggml_cgraph * llm_build_context::build_jais() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7454,10 +7648,7 @@ ggml_cgraph * llm_build_context::build_chatglm() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7580,12 +7771,8 @@ ggml_cgraph * llm_build_context::build_glm4() { // FF { - // Pre-MLP norm - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // MLP - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7698,7 +7885,7 @@ ggml_cgraph * llm_build_context::build_dots1() { cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7721,7 +7908,7 @@ ggml_cgraph * llm_build_context::build_dots1() { cb(moe_out, "ffn_moe_out", il); { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -7842,10 +8029,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7967,10 +8151,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() { bool is_moe_layer = static_cast(il) >= hparams.n_layer_dense_lead && (il + 1) % 
hparams.n_moe_layer_step == 0; if (!is_moe_layer) { - cur = llm_build_norm(ctx0, ffn_inp,hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7998,7 +8179,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() { // Shared expert (if present) if (hparams.n_ff_shexp > 0) { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -8115,7 +8296,7 @@ ggml_cgraph * llm_build_context::build_hunyuan_moe() { cb(cur, "ffn_norm", il); // feed-forward network (non-MoE) - ggml_tensor * cur_mlp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * cur_mlp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -8363,7 +8544,7 @@ ggml_cgraph * llm_build_context::build_bailingmoe2() { cb(cur, "ffn_norm", il); if (static_cast(il) < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -8386,7 +8567,7 @@ ggml_cgraph * llm_build_context::build_bailingmoe2() { cb, il, gf); cb(moe_out, "ffn_moe_out", il); - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -8606,10 +8787,7 @@ ggml_cgraph* llm_build_context::build_smollm3() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9010,3 +9188,210 @@ ggml_cgraph * llm_build_context::llama_build_graph( return result; } + +ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in, + ggml_tensor * KQ_mask, ggml_tensor * sinks, float KQ_scale, float f_attn_scale, int n_swa, int il) { + if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn && + model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) { + if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) { + ggml_split_tensor_t * attn_norm = model.layers[il].attn_norm ? 
(ggml_split_tensor_t *)model.layers[il].attn_norm->extra : nullptr; + auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra; + auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra; + auto wv = (ggml_split_tensor_t *)model.layers[il].wv->extra; + auto wo = (ggml_split_tensor_t *)model.layers[il].wo->extra; + GGML_ASSERT(wq->n_device == wk->n_device && wq->n_device == wv->n_device && wq->n_device == wo->n_device); + auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra; + auto vl = (ggml_split_tensor_t *)kv_self.v_l[il]->extra; + GGML_ASSERT(wq->n_device == kl->n_device && wq->n_device == vl->n_device); + ggml_split_tensor_t *bq = nullptr, *bo = nullptr, *bk = nullptr, *bv = nullptr; + if (model.layers[il].bq && model.layers[il].bq->extra) { + bq = (ggml_split_tensor_t *)model.layers[il].bq->extra; + GGML_ASSERT(bq->n_device == wq->n_device); + } + if (model.layers[il].bo && model.layers[il].bo->extra) { + bo = (ggml_split_tensor_t *)model.layers[il].bo->extra; + GGML_ASSERT(bo->n_device == wq->n_device); + } + if (model.layers[il].bk && model.layers[il].bk->extra) { + bk = (ggml_split_tensor_t *)model.layers[il].bk->extra; + GGML_ASSERT(bk->n_device == wq->n_device); + } + if (model.layers[il].bv && model.layers[il].bv->extra) { + bv = (ggml_split_tensor_t *)model.layers[il].bv->extra; + GGML_ASSERT(bv->n_device == wq->n_device); + } + std::vector attn; attn.reserve(wq->n_device); + for (int id = 0; id < wq->n_device; ++id) { + int il_cb = 1000*(id+1) + il; + auto split_wq = wq->splits[id]; + auto split_wk = wk->splits[id]; + auto split_wv = wv->splits[id]; + auto split_wo = wo->splits[id]; + auto split_kl = kl->splits[id]; + auto split_vl = vl->splits[id]; + GGML_ASSERT((!split_wq && !split_wk && !split_wv && !split_wo && !split_kl && !split_vl) || + (split_wq && split_wk && split_wv && split_wo && split_kl && split_vl)); + if (!split_wq) continue; + auto cur = input; + if (attn_norm) { + auto split_norm = attn_norm->splits[id]; + cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il_cb); + } + else if (cur->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); + } + auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ? + ((ggml_split_tensor_t *)model.layers[il].attn_q_norm->extra)->splits[id] : model.layers[il].attn_q_norm : nullptr; + auto the_k_norm = model.layers[il].attn_k_norm ? model.layers[il].attn_k_norm->extra ? + ((ggml_split_tensor_t *)model.layers[il].attn_k_norm->extra)->splits[id] : model.layers[il].attn_k_norm : nullptr; + auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, nullptr, nullptr, nullptr, nullptr, + split_wq, bq ? bq->splits[id] : nullptr, + split_wk, bk ? bk->splits[id] : nullptr, + split_wv, bv ? 
bv->splits[id] : nullptr, + the_q_norm, the_k_norm, f_attn_scale, il_cb); + auto rope_factors = rope_factors_in; + if (!rope_factors && model.layers[il].rope_freqs && model.layers[il].rope_freqs->extra) { + auto extra = (ggml_split_tensor_t *)model.layers[il].rope_freqs->extra; + rope_factors = extra->splits[id]; + } + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il_cb); + cb(Kcur, "Kcur", il_cb); + ggml_build_forward_expand(gf, Qcur); + ggml_build_forward_expand(gf, Kcur); + ggml_build_forward_expand(gf, Vcur); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_head_kv = split_wk->ne[1] / n_embd_head_k; + + GGML_ASSERT(kv_self.size == cparams.n_ctx); + + auto idx = 2*wq->n_device*il + 2*id; + GGML_ASSERT(idx+1 < (int)lctx.cache_copies.size()); + auto k_row_size = ggml_row_size(split_kl->type, n_embd_head_k); + ggml_tensor * k_cache_view = ggml_view_2d(ctx0, split_kl, n_embd_head_k, n_tokens*n_head_kv, + k_row_size, k_row_size*n_head_kv*kv_head); + + lctx.cache_copies[idx+0].cpy = ggml_cpy(ctx0, Kcur, k_cache_view); + lctx.cache_copies[idx+0].step = k_row_size*n_head_kv; + + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(gf, lctx.cache_copies[idx+0].cpy); + + struct ggml_tensor * v_cache_view = nullptr; + + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx0, split_vl, n_tokens*split_wv->ne[1], + kv_head*ggml_row_size(split_vl->type, split_wv->ne[1])); + lctx.cache_copies[idx+1].step = ggml_row_size(split_vl->type, split_wv->ne[1]); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, split_vl, n_tokens, split_wv->ne[1], + ( n_ctx)*ggml_element_size(split_vl), + (kv_head)*ggml_element_size(split_vl)); + lctx.cache_copies[idx+1].step = ggml_element_size(split_vl); + + Vcur = ggml_transpose(ctx0, Vcur); + } + cb(v_cache_view, "v_cache_view", il_cb); + + lctx.cache_copies[idx+1].cpy = ggml_cpy(ctx0, Vcur, v_cache_view); + ggml_build_forward_expand(gf, lctx.cache_copies[idx+1].cpy); + + auto q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + cb(q, "q", il_cb); + + auto k = ggml_view_3d(ctx0, split_kl, n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(split_kl->type, n_embd_head_k)*n_head_kv, //n_embd_k_gqa), + ggml_row_size(split_kl->type, n_embd_head_k), 0); + cb(k, "k", il_cb); + + auto v = ggml_view_3d(ctx0, split_vl, n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(split_vl->type, split_wv->ne[1]), + ggml_row_size(split_vl->type, n_embd_head_v), 0); + cb(v, "v", il_cb); + +#ifdef GGML_USE_VULKAN + constexpr bool use_f32_precision = true; +#else + constexpr bool use_f32_precision = false; +#endif + cur = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask, KQ_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); + cb(cur, "flash_attn", il_cb); + ggml_flash_attn_ext_add_sinks(cur, sinks); + if (n_swa > 0) { + ((int32_t *)cur->op_params)[4] = n_swa; + } + + // Some models produced NaNs/gibberish when FA is computed with f16 precision on CUDA + if (use_f32_precision || model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || + (model.arch == LLM_ARCH_DEEPSEEK2 && q->ne[1] <= 8) || model.arch == LLM_ARCH_COHERE2 || model.arch == LLM_ARCH_GLM4 || + model.arch == LLM_ARCH_GLM4_MOE) { + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + } + + cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens); + cb(cur, "flash_attn_reshaped", il_cb); + + cur = llm_build_lora_mm(lctx, ctx0, split_wo, cur); + if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) { + // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } + cb(cur, "kqv_wo", il_cb); + if (bo) { + cur = ggml_add(ctx0, cur, bo->splits[id]); + cb(cur, "kqv_wo_biased", il_cb); + } + if (cur->ne[1] >= 32) { + cur = ggml_cast(ctx0, cur, GGML_TYPE_F16); + } + ggml_build_forward_expand(gf, cur); + attn.push_back(cur); + } + if (attn.size() == 1) return attn.front(); + auto cur = ggml_add(ctx0, attn[0], attn[1]); + cb(cur, "combine_attn", il); + cur->op_params[0] = 0xff; + for (int id = 2; id < (int)attn.size(); ++id) { + cur = ggml_add(ctx0, cur, attn[id]); + cb(cur, "combine_attn", il); + } + // TODO: for more than 2 GPUs, do we need to add another forced graph split? + //if (attn.size() > 2) { + // cur->op_params[0] = 0xff; + //} + return cur; + } + } + + auto cur = input; + if (model.layers[il].attn_norm) { + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + } + + auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, + model.layers[il].wqkv, model.layers[il].bqkv, + model.layers[il].wqk, model.layers[il].bqk, + model.layers[il].wq, model.layers[il].bq, model.layers[il].wk, model.layers[il].bk, model.layers[il].wv, model.layers[il].bv, + model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, f_attn_scale, il); + + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors_in, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, rope_factors_in, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa); + + return cur; +} diff --git a/src/llama-build-context.h b/src/llama-build-context.h index a96a49d4..328b51ce 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ -148,7 +148,7 @@ struct llm_build_context { ggml_tensor * wq, ggml_tensor * bq, ggml_tensor * wk, ggml_tensor * bk, ggml_tensor * wv, ggml_tensor * bv, - float attention_scale, int il); + float attention_scale, int il) const; std::tuple llm_build_mul_mat_qkv(ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * wqkv, ggml_tensor * bqkv, @@ -156,7 +156,7 @@ struct llm_build_context { ggml_tensor * wq, ggml_tensor * bq, ggml_tensor * wk, ggml_tensor * bk, ggml_tensor * wv, ggml_tensor * bv, - ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, 
int il); + ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) const; ggml_cgraph * build_llama(); @@ -317,7 +317,7 @@ struct llm_build_context { float kq_scale, const llm_build_cb & cb, int il, ggml_tensor * sinks = nullptr, int n_swa = 0); - static ggml_tensor * llm_build_ffn(ggml_context * ctx, llama_context & lctx, + static ggml_tensor * llm_build_ffn(ggml_context * ctx, llama_context & lctx, ggml_tensor * ffn_norm, ggml_tensor * cur, ggml_tensor * up, ggml_tensor * up_b, @@ -331,7 +331,7 @@ struct llm_build_context { ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il); + const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr); static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx, ggml_tensor * cur, @@ -375,6 +375,27 @@ llm_expert_gating_func_type gating_op, gating_op, cb, il, graph); } + static ggml_tensor * llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx, + ggml_tensor * ffn_norm, + ggml_tensor * input, + ggml_tensor * gate_inp, ggml_tensor * gate_inp_b, + ggml_tensor * up_exps, ggml_tensor * up_exps_b, + ggml_tensor * gate_exps, ggml_tensor * gate_exps_b, + ggml_tensor * down_exps, ggml_tensor * down_exps_b, + ggml_tensor * exp_probs_b, + ggml_tensor * up_shexp, ggml_tensor * up_b_shexp, + ggml_tensor * gate_shexp, ggml_tensor * gate_b_shexp, + ggml_tensor * down_shexp, ggml_tensor * down_b_shexp, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, +llm_expert_gating_func_type gating_op, + llm_ffn_op_type type_op_shexp, + const llm_build_cb & cb, int il, ggml_cgraph * graph); + static ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids); static ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx); @@ -383,4 +404,7 @@ llm_expert_gating_func_type gating_op, static ggml_cgraph * llama_build_graph(llama_context & lctx, const llama_batch & batch, bool worst_case); + ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * rope_factors, + ggml_tensor * KQ_mask, ggml_tensor * sinks, float KQ_scale, float f_attn_scale, int n_swa, int il); + }; diff --git a/src/llama-context.h b/src/llama-context.h index bb21d880..61ad51e5 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -57,6 +57,9 @@ struct llama_kv_cache { std::vector k_l; // per layer std::vector v_l; + std::vector split_k_l; + std::vector split_v_l; + std::vector ctxs; std::vector bufs; diff --git a/src/llama-impl.h b/src/llama-impl.h index 38844506..b6d84369 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -224,3 +224,8 @@ struct gguf_context; std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i); ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer); + +struct llama_split_tensor { + std::vector tensor_splits; + ggml_split_tensor_t ggml; +}; diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 5f671fe7..a0f0f0bf 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #define LLAMA_API_INTERNAL @@ -139,7 +140,7 @@ struct create_tensors_helper : public create_tensors_helper_interface { ggml_context ** actual_ctx = nullptr); void create_default_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool norm_bias); - void create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = 
true); + void create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = true, bool use_ctx_split = false); void create_std_attn(int i, const LLM_TN & tn, llama_layer & layer, int n_embd, int n_embd_gqa, ggml_context * ctx_split); void create_std_ffn(int i, const LLM_TN & tn, llama_layer & layer, int n_ff, int n_embd, ggml_context * ctx_split); @@ -153,12 +154,15 @@ struct create_tensors_helper : public create_tensors_helper_interface { std::map buft_layer_count; std::map ctx_map; + ggml_context * split_ctx = nullptr; size_t ctx_size; ggml_context * ctx_input; ggml_context * ctx_output; ggml_context * ctx_output_split; + std::unordered_set split_tensors; + inline ggml_context * ctx_for_buft(ggml_backend_buffer_type_t buft) { if (auto it = ctx_map.find(buft); it != ctx_map.end()) return it->second; @@ -179,6 +183,14 @@ struct create_tensors_helper : public create_tensors_helper_interface { create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_model & _model) : ml(_ml), model(_model) { +#if 0 + for (int i = 0; i < model.hparams.n_layer; ++i) { + printf("Layer %2d: %s %s\n", i, ggml_backend_buft_name(model.buft_layer[i].buft_matrix), ggml_backend_buft_name(model.buft_layer[i].buft)); + } + printf("Output: %s %s\n", ggml_backend_buft_name(model.buft_output.buft_matrix), ggml_backend_buft_name(model.buft_output.buft)); + printf(" Input: %s %s\n", ggml_backend_buft_name(model.buft_input.buft_matrix), ggml_backend_buft_name(model.buft_input.buft)); +#endif + const int n_layer = model.hparams.n_layer; buft_layer_count[model.buft_input.buft]++; buft_layer_count[model.buft_input.buft_matrix]++; @@ -192,6 +204,11 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output ctx_size += ggml_tensor_overhead()*n_layer*3; // for moe merged tensors + if (model.splits.size() > 1) { + ctx_size += ggml_tensor_overhead()*n_layer*4; // for KV cache + ctx_size *= (model.splits.size() + 1); + } + for (auto & it : buft_layer_count) { struct ggml_init_params params = { /*.mem_size =*/ ctx_size, @@ -205,10 +222,95 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod ctx_map[it.first] = ctx; model.ctxs.push_back(ctx); } + if (model.split_buft) { + if (auto it = ctx_map.find(model.split_buft); it != ctx_map.end()) { + split_ctx = it->second; + } + } +#if 0 + printf("=======================================================================\n"); + auto n_device = model.device_count(); + printf(" Model has %d devices:\n", n_device); + for (int device = 0; device < n_device; ++device) { + auto buft = model.default_buffer_type_offload(device); + if (buft) { + printf(" %d %s\n", device, ggml_backend_buft_name(buft)); + } else { + printf(" Oops: null buft for device %d\n", device); + } + } + if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) { + printf("model.splits:"); + for (auto s : model.splits) printf(" %g", s); + printf("\n"); + } +#endif +} + +static std::vector create_split(int nr, int granularity, const std::vector & splits, const std::vector & mem_used) { + GGML_ASSERT(nr % granularity == 0); + GGML_ASSERT(!splits.empty()); + if (granularity < 0) return std::vector(splits.size(), nr); + GGML_ASSERT(mem_used.size() == splits.size()); + size_t tot_memory_used = 1; + for (auto & mem : mem_used) tot_memory_used += mem; + int nchunk = nr / granularity; + std::vector result(splits.size()); + float last_split = 0; + int
sum = 0; + for (int i = 0; i < (int)splits.size(); ++i) { + float p = splits[i] - last_split; + p += (p - 1.f*mem_used[i]/tot_memory_used); + result[i] = roundf(p*nchunk); + if (result[i] < 0) result[i] = 0; + sum += result[i]; + last_split = splits[i]; + } + while (sum > nchunk) { + last_split = 0; + float best_err = std::numeric_limits::max(); + int ibest = -1; + for (int i = 0; i < (int)splits.size(); ++i) { + if (result[i] > 0) { + float p = splits[i] - last_split; + float n_want = p*nchunk; + float err = std::abs(n_want - result[i] + 1); + //float err = std::abs(n_want - result[i] + 1) + std::abs(p - 1.f*mem_used[i]/tot_memory_used)*nchunk; + if (err < best_err) { + best_err = err; ibest = i; + } + } + last_split = splits[i]; + } + GGML_ASSERT(ibest >= 0 && result[ibest] > 0); + --result[ibest]; + --sum; + } + while (sum < nchunk) { + last_split = 0; + float best_err = std::numeric_limits::max(); + int ibest = -1; + for (int i = 0; i < (int)splits.size(); ++i) { + float p = splits[i] - last_split; + float n_want = p*nchunk; + float err = std::abs(n_want - result[i] - 1); + //float err = std::abs(n_want - result[i] - 1) + std::abs(p - 1.f*mem_used[i]/tot_memory_used)*nchunk; + if (err < best_err) { + best_err = err; ibest = i; + } + last_split = splits[i]; + } + GGML_ASSERT(ibest >= 0); + ++result[ibest]; + ++sum; + } + for (auto & r : result) r *= granularity; + return result; } ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector & ne, int flags, ggml_context ** actual_context) { + //auto requested_ctx = ctx; if (ml.tensor_buft_overrides) { for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { std::regex pattern(overrides->pattern); @@ -220,7 +322,12 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std } } if (actual_context) *actual_context = ctx; - return ml.create_tensor(ctx, name, ne, flags); + auto tensor = ml.create_tensor(ctx, name, ne, flags); + if (tensor && ctx == split_ctx) { + //printf("%s: adding tensor %s to split tensors\n", __func__, tensor->name); + split_tensors.insert(tensor); + } + return tensor; } #define LOADING_PRELUDE \ @@ -251,17 +358,18 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std bool use_mmap_buffer = true; -void create_tensors_helper::create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm) { +void create_tensors_helper::create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm, bool use_ctx_split) { model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + auto out_ctx = use_ctx_split ? 
ctx_output_split : ctx_output; if (has_norm) { - model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); } - model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + model.output = create_tensor(out_ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -280,7 +388,7 @@ void create_tensors_helper::create_std_ffn(int i, const LLM_TN & tn, llama_layer bool create_tensors_helper::create_llama_tensors(const LLM_TN & tn) { LOADING_PRELUDE - create_embd_output(tn, n_embd, n_vocab); + create_embd_output(tn, n_embd, n_vocab, true, true); for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); @@ -288,7 +396,7 @@ bool create_tensors_helper::create_llama_tensors(const LLM_TN & tn) { auto & layer = model.layers[i]; - layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); use_mmap_buffer &= !merge_qkv(tn, i, 1); @@ -297,12 +405,12 @@ bool create_tensors_helper::create_llama_tensors(const LLM_TN & tn) { // optional bias tensors layer.bo = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm = create_tensor(model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.rope_freqs = create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); if (n_expert == 0) { - create_std_ffn(i, tn, layer, n_ff, n_embd, ctx_split); + create_std_ffn(i, tn, layer, n_ff, n_embd, model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? 
ctx_split : ctx_layer); // optional MLP bias layer.ffn_gate_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -1043,11 +1151,11 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) { // output { - model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -1057,18 +1165,19 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) { auto & layer = model.layers[i]; - layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); use_mmap_buffer &= !merge_qkv(tn, i, 0); layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); - layer.attn_k_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}); - layer.attn_q_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}); + layer.attn_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}); + layer.attn_q_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}); - layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + auto ffn_ctx = model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer; + layer.ffn_norm = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + layer.ffn_gate_inp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); if (n_expert == 0) { throw std::runtime_error("n_expert must be > 0 for QWEN3MOE"); @@ -1080,9 +1189,9 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) { // MoE branch const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; - layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); - layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}); - layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + layer.ffn_gate_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + layer.ffn_down_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}); + layer.ffn_up_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); } return use_mmap_buffer; } @@ -1734,7 +1843,7 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) { GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers"); GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers"); - create_embd_output(tn, n_embd, n_vocab); + create_embd_output(tn, n_embd, n_vocab, true, true); for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); @@ -1748,7 +1857,7 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) { auto & layer = model.layers[i]; - layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); + layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); // GLM-style attention with bias terms if (!flags) { @@ -1765,12 +1874,17 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) { layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags); // K/Q norm tensors (optional for GLM-4.5 355B variant) - layer.attn_q_norm = create_tensor(ctx_layer, + layer.attn_q_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags); - layer.attn_k_norm = create_tensor(ctx_layer, + layer.attn_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags); - layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags); + auto ffn_ctx = model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer; + + // Why are we adding an additional tensor type? 
+ // attn_post_norm is the exact same thing as ffn_norm + //layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags); + layer.ffn_norm = create_tensor(ffn_ctx, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags); // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE @@ -1778,35 +1892,35 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) { if (use_moe) { // MoE layers - layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags); + layer.ffn_gate_inp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags); // gate bias - layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags); + layer.ffn_exp_probs_b = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags); // MoE branch const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; - layer.ffn_gate_exps = create_tensor(ctx_split, + layer.ffn_gate_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags); - layer.ffn_down_exps = create_tensor(ctx_split, + layer.ffn_down_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags); - layer.ffn_up_exps = create_tensor(ctx_split, + layer.ffn_up_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags); // Shared expert if (n_expert_shared > 0) { const int64_t n_ff_shexp = n_ff_exp * n_expert_shared; - layer.ffn_gate_shexp = create_tensor(ctx_split, + layer.ffn_gate_shexp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); - layer.ffn_down_shexp = create_tensor(ctx_split, + layer.ffn_down_shexp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags); - layer.ffn_up_shexp = create_tensor(ctx_split, + layer.ffn_up_shexp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); } } else { // Dense layers (first k layers) - GLM uses separate gate/up projections - layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags); - layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags); - layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags); + layer.ffn_gate = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags); + layer.ffn_down = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags); + layer.ffn_up = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags); } // --- NextN / MTP tensors (preserved but unused), on the final layer --- if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { @@ -2629,18 +2743,77 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); if (bias) { auto flags = bias == 1 ? 
llama_model_loader::TENSOR_NOT_REQUIRED : 0; - layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {layer.wq->ne[1]}, flags); - layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {layer.wk->ne[1]}, flags); - layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {layer.wv->ne[1]}, flags); + layer.bq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "bias", i), {layer.wq->ne[1]}, flags); + layer.bk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "bias", i), {layer.wk->ne[1]}, flags); + layer.bv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "bias", i), {layer.wv->ne[1]}, flags); } } return fused_qkv; } +static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor * tensor, llama_split_tensor & split_tensor, + const std::vector & splits, std::vector & mem_used) { + GGML_ASSERT(split_dim <= 1); + GGML_ASSERT(splits.size() > 1); + std::string name{tensor->name}; + split_tensor.tensor_splits.resize(splits.size()); + if (split_dim < 0) { + for (int i = 0; i < int(splits.size()); ++i) { + if (splits[i] > 0) { + split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2]); + auto name_i = name + '.' + std::to_string(i); + ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str()); + } else { + split_tensor.tensor_splits[i] = nullptr; + } + } + } + else if (split_dim == 1) { + for (int i = 0; i < int(splits.size()); ++i) { + if (splits[i] > 0) { + split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, tensor->ne[0], splits[i], tensor->ne[2]); + auto name_i = name + '.' + std::to_string(i); + ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str()); + } else { + split_tensor.tensor_splits[i] = nullptr; + } + } + } else { + for (int i = 0; i < int(splits.size()); ++i) { + if (splits[i] > 0) { + split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, splits[i], tensor->ne[1], tensor->ne[2]); + auto name_i = name + '.' 
+ std::to_string(i); + ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str()); + } else { + split_tensor.tensor_splits[i] = nullptr; + } + } + } + split_tensor.ggml.n_device = splits.size(); + split_tensor.ggml.split_dim = split_dim; + split_tensor.ggml.splits = split_tensor.tensor_splits.data(); + tensor->extra = (void *)&split_tensor.ggml; + GGML_ASSERT(mem_used.size() >= splits.size()); + for (int i = 0; i < split_tensor.ggml.n_device; ++i) { + if (split_tensor.ggml.splits[i]) { + //auto nbytes = ggml_nbytes(split_tensor.ggml.splits[i]); + //printf("mem_used(%s): %8.2f, total: %8.2f\n", split_tensor.ggml.splits[i]->name, nbytes/1024./1024., (mem_used[i] + nbytes)/1024./1024.); + mem_used[i] += ggml_nbytes(split_tensor.ggml.splits[i]); + } + } +} + bool create_tensors_helper::create_tensors() { const auto tn = LLM_TN(model.arch); bool use_mmap_buffer = true; + if (ml.merge_qkv && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) { + LLAMA_LOG_WARN("\n========================================================\n"); + LLAMA_LOG_WARN("merge_qkv is not compatible with split mode 'graph'\n"); + LLAMA_LOG_WARN(" => turning off merge_qkv\n"); + LLAMA_LOG_WARN("========================================================\n\n"); + ml.merge_qkv = false; + } switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: @@ -2761,6 +2934,157 @@ bool create_tensors_helper::create_tensors() { default: throw std::runtime_error("unknown architecture"); } + if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) { + std::vector mem_used(model.splits.size(), 0); + const auto & hparams = model.hparams; + int gqa_ratio = hparams.n_head() / hparams.n_head_kv(); + //printf("GQA ratio: %d\n", gqa_ratio); + for (int il = 0; il < int(model.layers.size()); ++il) { + if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) { + LLAMA_LOG_INFO("%s: not splitting layer %d because buffer type is host\n", __func__, il); + continue; + } + auto & layer = model.layers[il]; + auto ctx_split = ctx_for_layer_split(il); + if (layer.attn_norm) { + auto split = create_split(ggml_nrows(layer.attn_norm), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.attn_norm, layer.split_attn_norm, split, mem_used); + } + if (layer.rope_freqs) { + auto split = create_split(ggml_nrows(layer.rope_freqs), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.rope_freqs, layer.split_rope_freqs, split, mem_used); + } + if (layer.wo && layer.wq && layer.wk && layer.wv) { + int attn_granularity = hparams.n_embd_head_k * gqa_ratio; + if (ggml_is_quantized(layer.wo->type)) { + auto tt = ggml_internal_get_type_traits(layer.wo->type); + if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size; + } + GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0); + auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits, mem_used); + prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split, mem_used); + if (layer.bo) { + prepare_split_tensors(-1, ctx_split, layer.bo, layer.split_bo, split, mem_used); + } + if (layer.bq) { + prepare_split_tensors(0, ctx_split, layer.bq, layer.split_bq, split, mem_used); + } + if (layer.attn_q_norm) { + prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, split, mem_used); + } + for (auto & s : split) s /= gqa_ratio; + prepare_split_tensors(1,
ctx_split, layer.wk, layer.split_wk, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split, mem_used); + if (layer.bk) { + prepare_split_tensors(0, ctx_split, layer.bk, layer.split_bk, split, mem_used); + } + if (layer.bv) { + prepare_split_tensors(0, ctx_split, layer.bv, layer.split_bv, split, mem_used); + } + if (layer.attn_k_norm) { + prepare_split_tensors(-1, ctx_split, layer.attn_k_norm, layer.split_k_norm, split, mem_used); + } + } + + if (layer.ffn_norm) { + if (auto it = split_tensors.find(layer.ffn_norm); it != split_tensors.end()) { + auto split = create_split(ggml_nrows(layer.ffn_norm), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.ffn_norm, layer.split_ffn_norm, split, mem_used); + } + } + + if (layer.ffn_down && layer.ffn_up && layer.ffn_gate) { + bool use_split = split_tensors.find(layer.ffn_down) != split_tensors.end() && + split_tensors.find(layer.ffn_gate) != split_tensors.end() && + split_tensors.find(layer.ffn_up) != split_tensors.end(); + if (use_split) { + int ffn_granularity = 16; + if (ggml_is_quantized(layer.ffn_down->type)) { + auto tt = ggml_internal_get_type_traits(layer.ffn_down->type); + if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size; + } + auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, model.splits, mem_used); + prepare_split_tensors(0, ctx_split, layer.ffn_down, layer.split_ffn_down, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_up, layer.split_ffn_up, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split, mem_used); + } + } + + //bool any_ffn_split = false; + if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) { + bool use_split = split_tensors.find(layer.ffn_down_shexp) != split_tensors.end() && + split_tensors.find(layer.ffn_gate_shexp) != split_tensors.end() && + split_tensors.find(layer.ffn_up_shexp) != split_tensors.end(); + if (use_split) { + //any_ffn_split = true; + int ffn_granularity = 16; + if (ggml_is_quantized(layer.ffn_down_shexp->type)) { + auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type); + if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size; + } + auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits, mem_used); + prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used); + } + } + + if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) { + bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() && + split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() && + split_tensors.find(layer.ffn_up_exps) != split_tensors.end(); + + if (use_split) { + //any_ffn_split = true; + int ffn_granularity = 16; + if (ggml_is_quantized(layer.ffn_down_exps->type)) { + auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type); + if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size; + } + auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits, mem_used); + //printf("split(%2d):", il); for (auto & s : split) printf(" %d", s); printf("\n"); + prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used); + prepare_split_tensors(1, 
ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split, mem_used); + } + } + + if (layer.ffn_gate_inp) { + if (auto it = split_tensors.find(layer.ffn_gate_inp); it != split_tensors.end()) { + auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.ffn_gate_inp, layer.split_ffn_gate_inp, shared_split, mem_used); + } + } + if (layer.ffn_exp_probs_b) { + if (auto it = split_tensors.find(layer.ffn_exp_probs_b); it != split_tensors.end()) { + auto shared_split = create_split(ggml_nrows(layer.ffn_exp_probs_b), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.ffn_exp_probs_b, layer.split_ffn_exp_probs_b, shared_split, mem_used); + } + } + } + + if (model.output) { + if (auto it = split_tensors.find(model.output); it != split_tensors.end()) { + if (ggml_backend_buft_is_host(model.buft_output.buft_matrix)) { + LLAMA_LOG_INFO("%s: not splitting output tensor because buffer is host\n", __func__); + } else { + auto ctx_split = ctx_map[model.buft_output.buft_matrix]; + auto split = create_split(model.output->ne[1], 16, model.splits, mem_used); + prepare_split_tensors(1, ctx_split, model.output, model.split_output, split, mem_used); + if (auto it = split_tensors.find(model.output_norm); it != split_tensors.end() && !ggml_backend_buft_is_host(model.buft_output.buft_matrix)) { + auto ctx_split = ctx_map[model.buft_output.buft_matrix]; + prepare_split_tensors(-1, ctx_split, model.output_norm, model.split_output_norm, split, mem_used); + } + } + } + } + + LLAMA_LOG_INFO("Estimated model buffer size per device:\n"); + for (int i = 0; i < int(mem_used.size()); ++i) { + LLAMA_LOG_INFO(" Device %d: %8.2f MiB\n", i, mem_used[i]/1024./1024.); + } + } return use_mmap_buffer; } diff --git a/src/llama-model.h b/src/llama-model.h index 342d6df6..d6188721 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -183,6 +183,24 @@ struct llama_layer { struct ggml_tensor * bqk = nullptr; struct ggml_tensor * bkv = nullptr; + llama_split_tensor split_attn_norm; + llama_split_tensor split_wq; + llama_split_tensor split_wk; + llama_split_tensor split_wv; + llama_split_tensor split_wo; + llama_split_tensor split_wqkv; + llama_split_tensor split_wqk; + llama_split_tensor split_wkv; + llama_split_tensor split_bq; + llama_split_tensor split_bk; + llama_split_tensor split_bv; + llama_split_tensor split_bo; + llama_split_tensor split_bqkv; + llama_split_tensor split_bqk; + llama_split_tensor split_bkv; + llama_split_tensor split_q_norm; + llama_split_tensor split_k_norm; + // relative position bias struct ggml_tensor * attn_rel_b = nullptr; struct ggml_tensor * attn_rel_b_enc = nullptr; @@ -205,12 +223,22 @@ struct llama_layer { struct ggml_tensor * ffn_down_enc = nullptr; struct ggml_tensor * ffn_up_enc = nullptr; + llama_split_tensor split_ffn_up; + llama_split_tensor split_ffn_gate; + llama_split_tensor split_ffn_down; + llama_split_tensor split_ffn_norm; + // ff MoE struct ggml_tensor * ffn_gate_inp = nullptr; struct ggml_tensor * ffn_gate_exps = nullptr; struct ggml_tensor * ffn_down_exps = nullptr; struct ggml_tensor * ffn_up_exps = nullptr; + llama_split_tensor split_ffn_gate_inp; + llama_split_tensor split_ffn_up_exps; + llama_split_tensor split_ffn_gate_exps; + llama_split_tensor split_ffn_down_exps; + // ff MoE bias struct ggml_tensor * ffn_gate_inp_b = nullptr; struct ggml_tensor * ffn_gate_exps_b =
nullptr; @@ -226,6 +254,15 @@ struct llama_layer { struct ggml_tensor * ffn_down_shexp = nullptr; struct ggml_tensor * ffn_up_shexp = nullptr; + llama_split_tensor split_ffn_up_shexp; + llama_split_tensor split_ffn_gate_shexp; + llama_split_tensor split_ffn_down_shexp; + + llama_split_tensor split_ffn_gate_inp_b; + llama_split_tensor split_ffn_gate_exps_b; + llama_split_tensor split_ffn_down_exps_b; + llama_split_tensor split_ffn_up_exps_b; + // ff bias struct ggml_tensor * ffn_gate_b = nullptr; struct ggml_tensor * ffn_down_b = nullptr; // b2 @@ -233,6 +270,12 @@ struct llama_layer { struct ggml_tensor * ffn_act = nullptr; struct ggml_tensor * ffn_exp_probs_b = nullptr; + llama_split_tensor split_ffn_gate_b; + llama_split_tensor split_ffn_down_b; + llama_split_tensor split_ffn_up_b; + llama_split_tensor split_ffn_act; + llama_split_tensor split_ffn_exp_probs_b; + // mamba proj struct ggml_tensor * ssm_in = nullptr; struct ggml_tensor * ssm_x = nullptr; @@ -253,6 +296,8 @@ struct llama_layer { struct ggml_tensor * rope_short = nullptr; struct ggml_tensor * rope_freqs = nullptr; + llama_split_tensor split_rope_freqs; + // bitnet scale struct ggml_tensor * wq_scale = nullptr; struct ggml_tensor * wk_scale = nullptr; @@ -298,6 +343,9 @@ struct llama_model { struct ggml_tensor * output_b; struct ggml_tensor * output_norm_enc; + llama_split_tensor split_output; + llama_split_tensor split_output_norm; + std::vector layers; llama_split_mode split_mode; @@ -358,6 +406,12 @@ struct llama_model { } void set_tensor_overrides(const llama_model_params& params); + + int device_count() const; + ggml_backend_buffer_type_t default_buffer_type_offload(int device) const; + + std::vector splits; + ggml_backend_buffer_type_t split_buft = nullptr; }; struct llama_lora_weight { diff --git a/src/llama.cpp b/src/llama.cpp index 587ff62f..6ea2b4cf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -108,6 +108,7 @@ #include #include #include +#include #include #include #include @@ -460,18 +461,18 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ GGML_UNUSED(gpu); } -static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) { +static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu) { ggml_backend_buffer_type_t buft = nullptr; #ifdef GGML_USE_CUDA if (ggml_backend_cuda_get_device_count() > 1) { - buft = ggml_backend_cuda_split_buffer_type(tensor_split); + buft = ggml_backend_cuda_split_buffer_type(model.splits.data()); } #endif #ifdef GGML_USE_SYCL if (ggml_backend_sycl_get_device_count() > 1) { - buft = ggml_backend_sycl_split_buffer_type(tensor_split); + buft = ggml_backend_sycl_split_buffer_type(model.splits.data()); } #endif @@ -480,7 +481,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo } return buft; - GGML_UNUSED(tensor_split); +} + +int llama_model::device_count() const { + return llama_get_device_count(*this); +} + +ggml_backend_buffer_type_t llama_model::default_buffer_type_offload(int device) const { + return llama_default_buffer_type_offload(*this, device); } static size_t llama_get_device_memory(const llama_model & model, int device) { @@ -548,23 +556,49 @@ bool llama_context::can_reuse_graph(const llama_batch & u_batch) { } bool llama_context::update_cache_copies() { - int n_layer = cache_copies.size()/2; + int n_layer = model.hparams.n_layer - model.hparams.nextn_predict_layers; 
//cache_copies.size()/2; if ((int)kv_self.k_l.size() != n_layer) return false; if (!(kv_self.v_l.empty() || (int)kv_self.v_l.size() == n_layer)) return false; - for (int il = 0; il < n_layer; ++il) { - auto& c = cache_copies[2*il+0]; - if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) return false; - c.cpy->view_offs = kv_self.head*c.step; - c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs; - c.cpy->data = c.cpy->src[1]->data; - } - if (kv_self.v_l.empty()) return true; - for (int il = 0; il < n_layer; ++il) { - auto& c = cache_copies[2*il+1]; - if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) return false; - c.cpy->view_offs = kv_self.head*c.step; - c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs; - c.cpy->data = c.cpy->src[1]->data; + if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) { + for (int il = 0; il < n_layer; ++il) { + auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra; + auto vl = !kv_self.v_l.empty() && kv_self.v_l[il] ? (ggml_split_tensor_t *)kv_self.v_l[il]->extra : nullptr; + GGML_ASSERT(kl && (!kv_self.v_l[il] || vl)); + if (vl) { + GGML_ASSERT(kl->n_device == vl->n_device); + } + for (int id = 0; id < kl->n_device; ++id) { + auto& c = cache_copies[2*model.splits.size()*il + 2*id + 0]; + if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kl->splits[id]) return false; + c.cpy->view_offs = kv_self.head*c.step; + c.cpy->src[1]->data = (char *)kl->splits[id]->data + c.cpy->view_offs; + c.cpy->data = c.cpy->src[1]->data; + } + if (!vl) continue; + for (int id = 0; id < vl->n_device; ++id) { + auto& c = cache_copies[2*model.splits.size()*il + 2*id + 1]; + if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != vl->splits[id]) return false; + c.cpy->view_offs = kv_self.head*c.step; + c.cpy->src[1]->data = (char *)vl->splits[id]->data + c.cpy->view_offs; + c.cpy->data = c.cpy->src[1]->data; + } + } + } else { + for (int il = 0; il < n_layer; ++il) { + auto& c = cache_copies[2*il+0]; + if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) return false; + c.cpy->view_offs = kv_self.head*c.step; + c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs; + c.cpy->data = c.cpy->src[1]->data; + } + if (kv_self.v_l.empty()) return true; + for (int il = 0; il < n_layer; ++il) { + auto& c = cache_copies[2*il+1]; + if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) return false; + c.cpy->view_offs = kv_self.head*c.step; + c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs; + c.cpy->data = c.cpy->src[1]->data; + } } return true; } @@ -572,7 +606,11 @@ bool llama_context::update_cache_copies() { llama_context::llama_context(const llama_model & model) : model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) { const auto & hparams = model.hparams; - cache_copies.resize(2*hparams.n_layer); + if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) { + cache_copies.resize(2*model.splits.size()*hparams.n_layer); + } else { + cache_copies.resize(2*hparams.n_layer); + } } llama_context::~llama_context() { @@ -626,42 +664,35 @@ static bool llama_kv_cache_init( } } + bool split_cache = false; + if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.arch != 
LLM_ARCH_DEEPSEEK2 && offload) { + cache.split_k_l.reserve(n_layer); + cache.split_v_l.reserve(n_layer); + split_cache = true; + } + // count used buffer types std::map buft_layer_count; if (offload) { for (int64_t i = 0; i < n_layer; ++i) { - buft_layer_count[model.buft_layer[i].buft]++; + if (split_cache) { + buft_layer_count[model.buft_layer[i].buft_matrix]++; + } else { + buft_layer_count[model.buft_layer[i].buft]++; + } } } else { buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; } - //if (cparams.fused_moe_up_gate) { - // int nbad = 0; - // for (int i = 0; i < (int) n_layer; i++) { - // auto& layer = model.layers[i]; - // if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) { - // ++nbad; - // } - // } - // if (nbad > 0) { - // if (nbad == (int)n_layer) { - // LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n"); - // const_cast(cparams).fused_moe_up_gate = false; - // } - // else { - // LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n", - // nbad, (int)n_layer); - // } - // } - //} - // create a context for each buffer type std::map ctx_map; for (auto & it : buft_layer_count) { int n_layers = it.second; + size_t ctx_mem_size = 5u*n_layers*ggml_tensor_overhead(); + if (split_cache) ctx_mem_size += 2*model.splits.size()*n_layers*ggml_tensor_overhead(); struct ggml_init_params params = { - /*.mem_size =*/ 5u*n_layers*ggml_tensor_overhead(), + /*.mem_size =*/ ctx_mem_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -698,24 +729,25 @@ static bool llama_kv_cache_init( } } - cache.k_l.reserve(n_layer); bool needs_v_cache = true; + cache.k_l.reserve(n_layer); if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) { needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn; } if (needs_v_cache) cache.v_l.reserve(n_layer); + std::vector mem_split(model.splits.size(), 0); + int n_mla = 0; for (int i = 0; i < (int) n_layer; i++) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); const uint32_t n_head_kv = hparams.n_head_kv(i); const uint32_t n_embd_head_k= hparams.n_embd_head_k; - - struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); + struct ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? 
ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); ggml_tensor * k; ggml_tensor * v; - if (cparams.mla_attn) { + if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) { // DeepSeek MLA const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; @@ -740,10 +772,53 @@ static bool llama_kv_cache_init( else { k = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, n_head_kv*kv_size); v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); - ggml_format_name(k, "cache_k_l%d", i); - ggml_format_name(v, "cache_v_l%d", i); + auto k_name = std::string{"cache_k_l"} + std::to_string(i); + auto v_name = std::string{"cache_v_l"} + std::to_string(i); + ggml_set_name(k, k_name.c_str()); + ggml_set_name(v, v_name.c_str()); + //ggml_format_name(k, "cache_k_l%d", i); + //ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); + if (split_cache) { + auto K = model.layers[i].wk; + auto V = model.layers[i].wv; + if (K && V && K->extra && V->extra) { + auto extra_K = (const ggml_split_tensor_t *)K->extra; + auto extra_V = (const ggml_split_tensor_t *)V->extra; + auto & split_k_l = cache.split_k_l.emplace_back(); + auto & split_v_l = cache.split_v_l.emplace_back(); + split_k_l.tensor_splits.resize(extra_K->n_device, nullptr); + split_v_l.tensor_splits.resize(extra_V->n_device, nullptr); + for (int is = 0; is < extra_K->n_device; ++is) { + auto split = extra_K->splits[is]; + if (!split) continue; + split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, split->ne[1]/n_embd_head_k * kv_size); + auto split_name = k_name + '.' + std::to_string(is); + ggml_set_name(split_k_l.tensor_splits[is], split_name.c_str()); + mem_split[is] += ggml_nbytes(split_k_l.tensor_splits[is]); + } + split_k_l.ggml.n_device = extra_K->n_device; + split_k_l.ggml.split_dim = 0; + split_k_l.ggml.splits = split_k_l.tensor_splits.data(); + for (int is = 0; is < extra_V->n_device; ++is) { + auto split = extra_V->splits[is]; + if (!split) continue; + split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size); + auto split_name = v_name + '.' 
+ std::to_string(is); + ggml_set_name(split_v_l.tensor_splits[is], split_name.c_str()); + mem_split[is] += ggml_nbytes(split_v_l.tensor_splits[is]); + } + split_v_l.ggml.n_device = extra_V->n_device; + split_v_l.ggml.split_dim = 0; + split_v_l.ggml.splits = split_v_l.tensor_splits.data(); + k->extra = (void *)&split_k_l.ggml; + v->extra = (void *)&split_v_l.ggml; + } + //} else { + // printf("Oops: don't have yet K and V for layer %d\n", i); + //} + } } } if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) { @@ -756,15 +831,46 @@ static bool llama_kv_cache_init( for (auto it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); - return false; + int ntensor = 0; + for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + ++ntensor; + } + if (ntensor > 0) { + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); + return false; + } + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + cache.bufs.push_back(buf); } - ggml_backend_buffer_clear(buf, 0); - LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - cache.bufs.push_back(buf); } + if (split_cache) { + LLAMA_LOG_INFO("%s: KV cache size per device:\n", __func__); + for (int i = 0; i < int(mem_split.size()); ++i) printf(" Device %d: %g MiB\n", i, mem_split[i]/1024./1024.); + } + +#if 0 + for (int il = 0; il < n_layer; ++il) { + if (cache.k_l[il]->extra) { + printf("Layer %2d, K-buffer: %p:", il, (void *)cache.k_l[il]->buffer); + auto split_kl = (ggml_split_tensor_t *)cache.k_l[il]->extra; + for (int id = 0; id < split_kl->n_device; ++id) { + if (split_kl->splits[id]) printf(" %p,%p", (void *)split_kl->splits[id]->data, (void *)split_kl->splits[id]->buffer); + } + printf("\n"); + } + if (cache.v_l[il]->extra) { + printf("Layer %2d, V-buffer: %p:", il, (void *)cache.v_l[il]->buffer); + auto split_vl = (ggml_split_tensor_t *)cache.v_l[il]->extra; + for (int id = 0; id < split_vl->n_device; ++id) { + if (split_vl->splits[id]) printf(" %p,%p", (void *)split_vl->splits[id]->data, (void *)split_vl->splits[id]->buffer); + } + printf("\n"); + } + } +#endif return true; } @@ -1617,6 +1723,16 @@ static void ggml_backend_add_from_device(llama_context* ctx, ggml_backend_t back } } +static bool is_model_split_supported(const llama_model & model) { + static std::unordered_set k_supported = { + LLM_ARCH_LLAMA, + LLM_ARCH_QWEN3MOE, + LLM_ARCH_GLM4_MOE, + }; + auto it = k_supported.find(model.arch); + return it != k_supported.end(); +} + // Returns false if cancelled by progress_callback static bool llm_load_tensors( llama_model_loader & ml, @@ -1634,6 +1750,16 @@ static bool llm_load_tensors( auto & hparams = model.hparams; + if (split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) { + if (!is_model_split_supported(model)) { + LLAMA_LOG_WARN("\n=======================================================\n"); + LLAMA_LOG_WARN("Split mode 'graph' is not supported for this model\n"); + LLAMA_LOG_WARN(" => changing split mode 
to 'layer'\n"); + LLAMA_LOG_WARN("=======================================================\n\n"); + split_mode = LLAMA_SPLIT_MODE_LAYER; + } + } + model.split_mode = split_mode; model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; @@ -1652,10 +1778,7 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } - if (split_mode == LLAMA_SPLIT_MODE_LAYER) { - // calculate the split points - // int device_count = llama_get_device_count(model); - int device_count = model.devices.size(); + if (int device_count = model.devices.size(); device_count > 1) { bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); std::vector splits(device_count); if (all_zero) { @@ -1676,46 +1799,47 @@ static bool llm_load_tensors( for (int i = 0; i < device_count; ++i) { splits[i] /= split_sum; } + model.splits = std::move(splits); + } else { + model.splits = { 1.0f }; + } + + int device_count = model.splits.size(); + // assign the repeating layers to the devices according to the splits + int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); + if (split_mode == LLAMA_SPLIT_MODE_LAYER) { - // assign the repeating layers to the devices according to the splits - int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); for (int i = i_gpu_start; i < n_layer; ++i) { - int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin(); -#ifndef NDEBUG - ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]); - const char* name = ggml_backend_buft_name(buft); - LLAMA_LOG_DEBUG("load_tensors: layers %3d assigned to backend %s\n", i, - name); -#endif + int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin(); model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[layer_gpu]); } // assign the output layer if (n_gpu_layers > n_layer) { - int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin(); -#ifndef NDEBUG - ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]); - const char* name = ggml_backend_buft_name(buft); - LLAMA_LOG_DEBUG("load_tensors: output layers assigned to backend %s\n", - name); -#endif + int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - model.splits.begin(); model.buft_output = llama_default_buffer_type_offload(model, model.devices[layer_gpu]); } else { model.buft_output = llama_default_buffer_type_cpu(true); } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_MODE_ROW) { - split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], tensor_split); + if ((split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) { + split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu]); + model.split_buft = split_buft; } else { // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]); } + auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]); // assign the repeating layers for (int i = i_gpu_start; i 
< n_layer; ++i) { - model.buft_layer[i] = { - split_buft, - llama_default_buffer_type_offload(model, model.devices[main_gpu]) - }; + if (split_mode == LLAMA_SPLIT_MODE_ATTN) { + int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, + float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin(); + model.buft_layer[i] = { split_buft, llama_default_buffer_type_offload(model, model.devices[layer_gpu]) }; + printf("Layer %d: assigning buft_layer to GPU %d\n", i, layer_gpu); + } else { + model.buft_layer[i] = { split_buft, buft_layer }; + } } // assign the output layer if (n_gpu_layers > n_layer) { @@ -1807,24 +1931,33 @@ static bool llm_load_tensors( } #endif else { - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (buf == nullptr) { - throw std::runtime_error("unable to allocate backend buffer"); + int ntensor = 0; + for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + ++ntensor; } - model.bufs.push_back(buf); - if (use_mlock && ggml_backend_buffer_is_host(buf)) { - model.mlock_bufs.emplace_back(new llama_mlock); - auto & mlock_buf = model.mlock_bufs.back(); - mlock_buf->init (ggml_backend_buffer_get_base(buf)); - mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); - } - for (uint32_t idx = 0; idx < ml.files.size(); idx++) { - bufs.emplace(idx, buf); + if (ntensor > 0) { + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf == nullptr) { + LLAMA_LOG_ERROR("Failed to allocate buffer type %s\n", ggml_backend_buft_name(buft)); + throw std::runtime_error("unable to allocate backend buffer"); + } + model.bufs.push_back(buf); + if (use_mlock && ggml_backend_buffer_is_host(buf)) { + model.mlock_bufs.emplace_back(new llama_mlock); + auto & mlock_buf = model.mlock_bufs.back(); + mlock_buf->init (ggml_backend_buffer_get_base(buf)); + mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); + } + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + bufs.emplace(idx, buf); + } } } if (bufs.empty()) { - throw std::runtime_error("failed to allocate buffer"); + LLAMA_LOG_WARN("No tensors in buffer type %s\n", ggml_backend_buft_name(buft)); + continue; + //throw std::runtime_error("failed to allocate buffer (1)"); } for (auto & buf : bufs) { @@ -4326,8 +4459,8 @@ struct llama_context * llama_new_context_with_model( ggml_backend_add_from_device(ctx, ctx->backend_metal); } #elif defined(GGML_USE_CUDA) - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE) { + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu, cparams.cuda_params); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -4337,7 +4470,7 @@ struct llama_context * llama_new_context_with_model( ggml_backend_add_from_device(ctx, backend); } else { - // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_GRAPH require a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device, cparams.cuda_params); if (backend == nullptr) { @@ -4346,12 +4479,11 @@ struct llama_context * 
llama_new_context_with_model( return nullptr; } ggml_backend_add_from_device(ctx, backend); - } } #elif defined(GGML_USE_VULKAN) - if (model->split_mode == LLAMA_SPLIT_MODE_ROW) { - LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__); + if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH || model->split_mode == LLAMA_SPLIT_MODE_ATTN) { + LLAMA_LOG_ERROR("%s: split mode 'graph' or 'attn' not supported. Failed to initialize Vulkan backend\n", __func__); llama_free(ctx); return nullptr; } @@ -4375,8 +4507,8 @@ struct llama_context * llama_new_context_with_model( } } #elif defined(GGML_USE_SYCL) - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) { ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu); @@ -4407,9 +4539,9 @@ struct llama_context * llama_new_context_with_model( ggml_backend_add_from_device(ctx, backend); } #elif defined(GGML_USE_CANN) - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used // TODO: ggml_backend_cann is not support split tensor now, just leave code here. - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) { ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);