diff --git a/common/common.cpp b/common/common.cpp index b6ee885c..9f1ce736 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1276,12 +1276,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else if (arg_next == "layer") { params.split_mode = LLAMA_SPLIT_MODE_LAYER; } - else if (arg_next == "row") { - fprintf(stderr, "\n\n=====================================================================================\n"); - fprintf(stderr, " Split mode row is no longer supported\n"); - fprintf(stderr, "=====================================================================================\n\n\n"); - GGML_ABORT("fatal error"); - params.split_mode = LLAMA_SPLIT_MODE_ROW; + else if (arg_next == "attn") { + params.split_mode = LLAMA_SPLIT_MODE_ATTN; + } + else if (arg_next == "graph") { + params.split_mode = LLAMA_SPLIT_MODE_GRAPH; } else { invalid_param = true; @@ -2249,6 +2248,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", "how to split the model across multiple GPUs, one of:\n" " - none: use one GPU only\n" + " - graph: split model tensors and computation graph across GPUs\n" " - layer (default): split layers and KV across GPUs\n" }); options.push_back({ "*", "-ts, --tensor-split SPLIT", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" }); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 31b769fe..f0f62d46 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -217,7 +217,7 @@ static const char * split_mode_str(llama_split_mode mode) { switch (mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; + case LLAMA_SPLIT_MODE_GRAPH: return "graph"; default: GGML_ABORT("invalid split mode"); } } @@ -334,7 +334,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" --n-cpu-moe (default: none)\n"); printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); - printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); @@ -630,12 +630,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { mode = LLAMA_SPLIT_MODE_NONE; } else if (m == "layer") { mode = LLAMA_SPLIT_MODE_LAYER; - } else if (m == "row") { - fprintf(stderr, "\n\n=======================================================================\n"); - fprintf(stderr, "Split mode 'row' is no longer supported\n"); - fprintf(stderr, "=======================================================================\n\n\n"); - invalid_param = true; - break; + } else if (m == "graph") { + mode = LLAMA_SPLIT_MODE_GRAPH; } else { invalid_param = true; break; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 7b630489..d2a0cf85 100644 --- a/ggml/include/ggml.h 
+++ b/ggml/include/ggml.h @@ -3021,6 +3021,13 @@ extern "C" { GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + typedef struct { + int n_device; + int split_dim; + struct ggml_tensor * tensor; + struct ggml_tensor ** splits; + } ggml_split_tensor_t; + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 813f4467..f7c1593b 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -43,7 +43,7 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { size_t size = buft->iface.get_alloc_size(buft, tensor); - assert(size >= ggml_nbytes(tensor)); + //assert(size >= ggml_nbytes(tensor)); return size; } return ggml_nbytes(tensor); @@ -1216,8 +1216,10 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co return -1; } + //printf("%s: have %d backends, buffer is %s\n", __func__, sched->n_backends, ggml_backend_buffer_name(buffer)); // find highest prio backend that supports the buffer type and the op for (int i = 0; i < sched->n_backends; i++) { + //printf(" Checking bacckend %d (%s)\n", i, ggml_backend_name(sched->backends[i])); if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) && ggml_backend_supports_op(sched->backends[i], op)) { return i; @@ -1393,6 +1395,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // do not overwrite user assignments if (*leaf_backend_id == -1) { *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); + //printf("Pass 1: assigned backend %d to leaf %d, %s\n", *leaf_backend_id, i, graph->leafs[i]->name); } } @@ -1402,6 +1405,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // do not overwrite user assignments if (*node_backend_id == -1) { *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); + //printf("Pass 1: assigned backend %d to node %d, %s(%s)\n", *node_backend_id, i, ggml_op_name(node->op), node->name); #if 0 // src @@ -1445,6 +1449,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { + //printf("(u1) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -1466,6 +1471,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { + //printf("(d1) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -1482,6 +1488,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { + //printf("(u2) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -1498,6 +1505,7 @@ static void 
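Aside: the `ggml_split_tensor_t` added to ggml.h above is the new descriptor that ties a full weight tensor to its per-device slices and the dimension along which it was cut. As a rough illustration only (standalone C++, with a simplified stand-in for `ggml_tensor` and a hypothetical `make_dim1_split` helper that is not part of this patch), filling such a descriptor for a dim-1 split could look like this:

```cpp
// Standalone sketch (not ggml code): models how a ggml_split_tensor_t-style
// descriptor could describe a weight split along dim 1 across n_device GPUs.
// Tensor and make_dim1_split() are simplified, hypothetical stand-ins.
#include <cstdint>
#include <cstdio>
#include <vector>

struct Tensor {            // stand-in for ggml_tensor: just a 2D shape
    int64_t ne[2];
};

struct SplitTensor {       // mirrors the fields of ggml_split_tensor_t
    int                 n_device;
    int                 split_dim;
    Tensor *            tensor;   // the full (logical) tensor
    std::vector<Tensor> splits;   // one per-device slice
};

// Divide rows (dim 1) as evenly as possible across devices.
static SplitTensor make_dim1_split(Tensor & full, int n_device) {
    SplitTensor st{n_device, /*split_dim=*/1, &full, {}};
    int64_t rows_left = full.ne[1];
    for (int d = 0; d < n_device; ++d) {
        int64_t rows = rows_left / (n_device - d);   // remaining rows spread over remaining devices
        st.splits.push_back(Tensor{{full.ne[0], rows}});
        rows_left -= rows;
    }
    return st;
}

int main() {
    Tensor w{{4096, 11008}};
    SplitTensor st = make_dim1_split(w, 3);
    for (int d = 0; d < st.n_device; ++d) {
        printf("device %d: %lld x %lld\n", d, (long long)st.splits[d].ne[0], (long long)st.splits[d].ne[1]);
    }
    return 0;
}
```

In the patch itself the slices are attached through `tensor->extra`, which is how the CUDA split-buffer functions later in this diff find them.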
ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { + //printf("(d2) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -1535,6 +1543,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (n_supported > n_supported_best) { n_supported_best = n_supported; *node_backend_id = b; + //printf("Pass 3: assigned backend %d to unassigned node %d, %s\n", b, i, node->name); SET_CAUSE(node, "3.best"); } } @@ -1555,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } if (supported) { + //printf("Pass 3: assigned backend %d to node %d, %s previously assigned to backend %d\n", b, i, node->name, *node_backend_id); *node_backend_id = b; SET_CAUSE(node, "3.upg"); break; @@ -1583,9 +1593,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // views are always on the same backend as the source *src_backend_id = tensor_backend_id(src->view_src); SET_CAUSE(src, "4.vsrc"); + //printf("Pass 4: assigned backend %d to src %d, %s in node %d, %s frpm view_src\n", *src_backend_id, j, src->name, i, node->name); } else { *src_backend_id = *cur_backend_id; SET_CAUSE(src, "4.cur"); + //printf("Pass 4: assigned backend %d to src %d, %s in node %d, %s frpm current\n", *src_backend_id, j, src->name, i, node->name); } } } @@ -1620,7 +1632,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // check if we should start a new split based on the sources of the current node bool need_new_split = false; - if (node_backend_id == cur_backend_id && split->n_inputs > 0) { + if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) { + need_new_split = true; + } + else if (node_backend_id == cur_backend_id && split->n_inputs > 0) { for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 0db57b08..2372efdb 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -221,7 +221,8 @@ static ggml_cuda_device_info ggml_cuda_init() { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); + GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n", id, prop.name, prop.major, prop.minor, device_vmm ? 
"yes" : "no", + prop.totalGlobalMem/(1024*1024)); info.default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -567,6 +568,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t } GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + printf("%s(%s -> %s)\n", __func__, src->name, dst->name); if (ggml_backend_buffer_is_cuda(src->buffer)) { ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; @@ -601,7 +603,7 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor, + /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor, @@ -703,59 +705,25 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { // cuda split buffer -static int64_t get_row_rounding(const std::array & tensor_split) { - int64_t row_rounding = 0; - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) { - continue; - } - - const int cc = ggml_cuda_info().devices[id].cc; - row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc)); - } - return row_rounding; -} - -static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { - const int64_t nrows = ggml_nrows(tensor); - const int64_t rounding = get_row_rounding(tensor_split); - - *row_low = id == 0 ? 
0 : nrows*tensor_split[id]; - *row_low -= *row_low % rounding; - - if (id == ggml_backend_cuda_get_device_count() - 1) { - *row_high = nrows; - } else { - *row_high = nrows*tensor_split[id + 1]; - *row_high -= *row_high % rounding; - } -} - -static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); -} - struct ggml_backend_cuda_split_buffer_type_context { - std::array tensor_split; + //std::array tensor_split; }; struct ggml_backend_cuda_split_buffer_context { ~ggml_backend_cuda_split_buffer_context() { - for (ggml_tensor_extra_gpu * extra : tensor_extras) { - for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { - for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { - if (extra->events[id][is] != nullptr) { - CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); - } - } - if (extra->data_device[id] != nullptr) { - CUDA_CHECK(cudaFree(extra->data_device[id])); - } - } - delete extra; - } + //for (ggml_tensor_extra_gpu * extra : tensor_extras) { + // for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { + // for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { + // if (extra->events[id][is] != nullptr) { + // CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + // } + // } + // if (extra->data_device[id] != nullptr) { + // CUDA_CHECK(cudaFree(extra->data_device[id])); + // } + // } + // delete extra; + //} } std::vector tensor_extras; @@ -784,128 +752,189 @@ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buf GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported - - ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; - ctx->tensor_extras.push_back(extra); - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses +GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (!tensor->extra) return; + //printf("%s(%s, %p)\n", __func__, tensor->name, tensor->extra); + auto extra = (ggml_split_tensor_t *)tensor->extra; + GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count()); + for (int i = 0; i < extra->n_device; ++i) { + if (!extra->splits[i]) continue; + auto split = extra->splits[i]; + auto ne0 = split->ne[0]; + auto size = ggml_nbytes(split); + auto padded_size = size; if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + int nblock = (ne0 + MATRIX_ROW_PADDING - 1)/MATRIX_ROW_PADDING; + auto padded_row_size = ggml_row_size(split->type, nblock*MATRIX_ROW_PADDING); + auto row_size 
= ggml_row_size(split->type, ne0); + padded_size += padded_row_size - row_size; } - - // FIXME: do not crash if cudaMalloc fails - // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first - ggml_cuda_set_device(id); + ggml_cuda_set_device(i); char * buf; - CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); - - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); - } - - extra->data_device[id] = buf; - - for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, padded_size, i)); + if (padded_size > size) { + CUDA_CHECK(cudaMemset(buf + size, 0, padded_size - size)); } + //printf(" allocated %zu bytes for tensor %s of type %s, dim = %ld x %ld x %ld. padding: %zu\n", padded_size, split->name, ggml_type_name(split->type), + // split->ne[0], split->ne[1], split->ne[2], padded_size - size); + split->data = buf; + auto ctx = new ggml_backend_cuda_buffer_context(i, buf); + auto buft = ggml_backend_cuda_buffer_type(i); + split->buffer = ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, padded_size); + ggml_backend_buffer_set_usage(split->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); } - tensor->extra = extra; + return; + } -GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + if (!tensor->extra) return; + static std::map k_map = { + { GGML_TYPE_Q4_0_R8 , 8}, + { GGML_TYPE_Q5_0_R4 , 4}, + { GGML_TYPE_Q8_0_R8 , 8}, + { GGML_TYPE_Q2_K_R4 , 4}, + { GGML_TYPE_Q3_K_R4 , 4}, + { GGML_TYPE_Q4_K_R4 , 4}, + { GGML_TYPE_Q5_K_R4 , 4}, + { GGML_TYPE_Q6_K_R4 , 4}, + { GGML_TYPE_IQ2_XXS_R4, 4}, + { GGML_TYPE_IQ2_XS_R4 , 4}, + { GGML_TYPE_IQ3_XXS_R4, 4}, + { GGML_TYPE_IQ1_S_R4 , 4}, + { GGML_TYPE_IQ4_NL_R4 , 4}, + { GGML_TYPE_IQ3_S_R4 , 4}, + { GGML_TYPE_IQ2_S_R4 , 4}, + { GGML_TYPE_IQ4_XS_R8 , 8}, + { GGML_TYPE_IQ1_M_R4 , 4}, + { GGML_TYPE_BF16_R16 , 16}, + { GGML_TYPE_Q6_0_R4 , 4}, + { GGML_TYPE_IQ2_BN_R4 , 4}, + { GGML_TYPE_IQ2_K_R4 , 4}, + { GGML_TYPE_IQ3_K_R4 , 4}, + { GGML_TYPE_IQ4_K_R4 , 4}, + { GGML_TYPE_IQ5_K_R4 , 4}, + { GGML_TYPE_IQ4_KS_R4 , 4}, + { GGML_TYPE_IQ5_KS_R4 , 4}, + { GGML_TYPE_Q8_K_R16 , 4}, + { GGML_TYPE_Q8_KV_R8 , 4}, + { GGML_TYPE_Q8_K_R8 , 8}, + }; + //printf("%s(%s)\n", __func__, tensor->name); + // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + auto extra = (ggml_split_tensor_t *)tensor->extra; + GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count()); - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; + if (extra->split_dim < 0) { + 
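For reference, the padding applied in `ggml_backend_cuda_split_buffer_init_tensor` above rounds `ne0` up to the next multiple of `MATRIX_ROW_PADDING` (512 elements) and pads only the last row. A minimal standalone sketch of that arithmetic, assuming plain f32 rows so that a row is simply `ne0 * sizeof(float)`; the real code uses `ggml_row_size`, which also handles quantized block sizes and row metadata:

```cpp
// Standalone sketch of the padded-size computation used for split slices.
// Assumes f32 rows; ggml_row_size() in the real code handles quantized types too.
#include <cstdint>
#include <cstdio>

static const int64_t MATRIX_ROW_PADDING = 512;   // same constant name as in ggml-cuda

static size_t row_size_f32(int64_t ne0) { return (size_t)ne0 * sizeof(float); }

static size_t padded_alloc_size(int64_t ne0, int64_t nrows) {
    size_t size = row_size_f32(ne0) * (size_t)nrows;
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        // round ne0 up to the next multiple of MATRIX_ROW_PADDING and pad only the last row
        int64_t nblock = (ne0 + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING;
        size += row_size_f32(nblock * MATRIX_ROW_PADDING) - row_size_f32(ne0);
    }
    return size;
}

int main() {
    printf("%zu\n", padded_alloc_size(/*ne0=*/1000, /*nrows=*/4));  // 16000 data bytes + 96 bytes of padding
    return 0;
}
```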
GGML_ASSERT(ggml_is_contiguous(tensor)); + auto nbytes = ggml_nbytes(tensor); + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + GGML_ASSERT(split->type == tensor->type); + GGML_ASSERT(ggml_are_same_shape(tensor, split)); + GGML_ASSERT(ggml_nbytes(split) == nbytes); + ggml_cuda_set_device(i); + CUDA_CHECK(cudaMemcpyAsync(split->data, data, nbytes, cudaMemcpyHostToDevice, cudaStreamPerThread)); } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + else if (extra->split_dim == 0) { + int n_interleave = 1; + if (auto it = k_map.find(tensor->type); it != k_map.end()) n_interleave = it->second; + //if (tensor->type >= GGML_TYPE_Q4_0_R8) { + // GGML_ABORT("Dim 0 copy of row-interleaved quants is not supported yet"); + //} + auto tt = ggml_internal_get_type_traits(tensor->type); + std::vector host_buffer; + GGML_ASSERT(ggml_is_contiguous(tensor)); + int nrows = ggml_nrows(tensor); + auto bs = tt.blck_size; + auto ts = tt.type_size; + auto row_size = ggml_row_size(tensor->type, tensor->ne[0]); + int ne = 0; + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + GGML_ASSERT(split->ne[1]%n_interleave == 0); + ggml_cuda_set_device(i); + GGML_ASSERT(split->type == tensor->type); + GGML_ASSERT((int)ggml_nrows(split) == nrows); + GGML_ASSERT(split->ne[0] % bs == 0); + auto source_offset = n_interleave*(tt.row_meta_size + (ne / bs) * ts); + auto split_row_size = ggml_row_size(split->type, split->ne[0]); + if (host_buffer.size() < nrows*split_row_size) host_buffer.resize(nrows*split_row_size); + for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) { + for (int64_t i01 = 0; i01 < split->ne[1]; i01 += n_interleave) { + auto dst = host_buffer.data() + (i02*split->ne[1] + i01)*split_row_size; + auto src = (const char *)data + i02*tensor->nb[2] + i01*tensor->nb[1]; + if (tt.row_meta_size > 0) { + memcpy(dst, src, tt.row_meta_size*n_interleave); + } + memcpy(dst + tt.row_meta_size*n_interleave, src + source_offset, n_interleave*(split_row_size - tt.row_meta_size)); + } + } + CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), nrows*split_row_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + ne += split->ne[0]; } - - const char * buf_host = (const char *)data + offset_split; - CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + } + else if (extra->split_dim == 1) { + if (tensor->ne[2] > 1) { + auto row_size = ggml_row_size(tensor->type, tensor->ne[0]); + std::vector host_buffer; + int ne1 = 0; + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + ggml_cuda_set_device(i); + auto size = ggml_nbytes(split); + if (host_buffer.size() < size) host_buffer.resize(size); + for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) { + auto dst = host_buffer.data() + i02*split->ne[1]*row_size; + auto src = (const char *)data + i02*tensor->nb[2] + ne1*tensor->nb[1]; + memcpy(dst, src, split->ne[1]*row_size); + } + CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + ne1 += split->ne[1]; + } + } else { + int n_interleave = 1; + if (auto it 
= k_map.find(tensor->type); it != k_map.end()) n_interleave = it->second; + size_t cur_offset = 0; + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + GGML_ASSERT(split->ne[1]%n_interleave == 0); + ggml_cuda_set_device(i); + auto size = ggml_nbytes(split); + const char * buf_host = (const char *)data + cur_offset; + CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + cur_offset += size; + } + } + } + else { + fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim); + GGML_ABORT("fatal error"); + } - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + for (int i = 0; i < extra->n_device; ++i) { + if (!extra->splits[i]) continue; CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } + } -GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, const ggml_tensor * tensor, + [[maybe_unused]] void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf_host = (char *)data + offset_split; - CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); - } - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); - } + GGML_ABORT("not implemented"); } GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -954,31 +983,26 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_b GGML_UNUSED(buft); } -GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; +GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size([[maybe_unused]] ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + if (!tensor->extra) return 0; + auto extra = (ggml_split_tensor_t *)tensor->extra; + GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count()); size_t total_size = 0; - - const int64_t ne0 = tensor->ne[0]; - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - int64_t row_low, 
row_high; - get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - total_size += ggml_nbytes_split(tensor, nrows_split); - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + total_size += ggml_nbytes(split); + auto ne0 = split->ne[0]; if (ne0 % MATRIX_ROW_PADDING != 0) { - total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + auto nblock = (ne0 + MATRIX_ROW_PADDING - 1)/MATRIX_ROW_PADDING; + auto row_size = ggml_row_size(split->type, ne0); + auto padded_row_size = ggml_row_size(split->type, nblock*MATRIX_ROW_PADDING); + total_size += padded_row_size - row_size; } } - return total_size; + } GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { @@ -996,40 +1020,12 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { - static std::mutex mutex; - std::lock_guard lock(mutex); - - static std::map, struct ggml_backend_buffer_type> buft_map; - - std::array tensor_split_arr = {}; - - bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; }); - if (all_zero) { - tensor_split_arr = ggml_cuda_info().default_tensor_split; - } else { - float split_sum = 0.0f; - for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { - tensor_split_arr[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { - tensor_split_arr[i] /= split_sum; - } - } - - auto it = buft_map.find(tensor_split_arr); - if (it != buft_map.end()) { - return &it->second; - } - - struct ggml_backend_buffer_type buft { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * /*tensor_split*/) { + static ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, - /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, + /* .context = */ new ggml_backend_cuda_split_buffer_type_context{}, //{tensor_split_arr}, }; - - auto result = buft_map.emplace(tensor_split_arr, buft); - return &result.first->second; + return &buft; } // host buffer type @@ -1437,6 +1433,7 @@ static void ggml_cuda_op_mul_mat_cublas( GGML_UNUSED(src1_padded_row_size); } +#if 0 static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { static bool peer_access_enabled = false; @@ -1488,6 +1485,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { GGML_UNUSED(main_device); } +#endif static cudaError_t ggml_cuda_Memcpy2DPeerAsync( void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { @@ -1553,20 +1551,6 @@ static void ggml_cuda_op_mul_mat( const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); - GGML_ASSERT(!(split && ne02 > 1)); - GGML_ASSERT(!(split && ne03 > 1)); - GGML_ASSERT(!(split && ne02 < ne12)); - - ggml_tensor_extra_gpu * src0_extra = split ? 
(ggml_tensor_extra_gpu *) src0->extra : nullptr; - - - std::array tensor_split; - if (split) { - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; - tensor_split = buft_ctx->tensor_split; - } - struct dev_data { int cc; @@ -1595,31 +1579,12 @@ static void ggml_cuda_op_mul_mat( dev[id].row_low = 0; dev[id].row_high = ne01; - // for multi GPU, get the row boundaries from tensor split - // and round to mul_mat_q tile sizes - if (split) { - const int64_t rounding = get_row_rounding(tensor_split); - - if (id != 0) { - dev[id].row_low = ne01*tensor_split[id]; - if (dev[id].row_low < ne01) { - dev[id].row_low -= dev[id].row_low % rounding; - } - } - - if (id != ggml_backend_cuda_get_device_count() - 1) { - dev[id].row_high = ne01*tensor_split[id + 1]; - if (dev[id].row_high < ne01) { - dev[id].row_high -= dev[id].row_high % rounding; - } - } - } } bool quantization_done = false; for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { + if (id != ctx.device || dev[id].row_low == dev[id].row_high) { continue; } @@ -1632,7 +1597,7 @@ static void ggml_cuda_op_mul_mat( cudaStream_t stream = ctx.stream(id, 0); if (src0_is_contiguous) { - dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data; + dev[id].src0_dd = (char *) src0->data; } else { // If src0 is not contiguous it will be copied to a temporary buffer, it may then be necessary to clear padding. const size_t nbytes_data = ggml_nbytes(src0); @@ -1676,20 +1641,13 @@ static void ggml_cuda_op_mul_mat( if (dst_on_device) { dev[id].dst_dd = (float *) dst->data; } else { - const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst); + const size_t size_dst_ddf = ggml_nelements(dst); dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf); } } - // if multiple devices are used they need to wait for the main device - // here an event is recorded that signals that the main device has finished calculating the input data - if (split && used_devices > 1) { - ggml_cuda_set_device(ctx.device); - CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream())); - } - - const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; - if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) { + const int64_t src1_col_stride = ne11; + if (quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) { //printf("invoking fast path for %s x %s\n", src0->name, src1->name); int id = ctx.device; char * src0_dd_i = dev[id].src0_dd; @@ -1704,11 +1662,11 @@ static void ggml_cuda_op_mul_mat( } for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { - const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0; + const int64_t is = 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? 
ne11 - src1_col_0 : src1_col_stride; for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { + if (id != ctx.device || dev[id].row_low == dev[id].row_high) { continue; } @@ -1719,11 +1677,6 @@ static void ggml_cuda_op_mul_mat( ggml_cuda_set_device(id); cudaStream_t stream = ctx.stream(id, is); - // wait for main GPU data if necessary - if (split && (id != ctx.device || is != 0)) { - CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0)); - } - for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { const int64_t i03 = i0 / ne12; const int64_t i02 = i0 % ne12; @@ -1796,45 +1749,12 @@ static void ggml_cuda_op_mul_mat( // copy dst to host or other device if necessary if (!dst_on_device) { void * dst_off_device = dst->data; - if (split) { - // src0 = weight matrix is saved as a transposed matrix for better memory layout. - // dst is NOT transposed. - // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. - // Instead they need to be copied to the correct slice in ne0 = dst row index. - // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. - float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); - GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); - dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; - CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync( - dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream)); - } else { - float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); - GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); - dhf_dst_i += src1_col_0*ne0; - CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream)); - } + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0; + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream)); } - // add event for the main device to wait on until other device is done - if (split && (id != ctx.device || is != 0)) { - CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream)); - } - } - } - } - - // main device waits for all other devices to be finished - if (split && ggml_backend_cuda_get_device_count() > 1) { - int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; - is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS; - - ggml_cuda_set_device(ctx.device); - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - if (dev[id].row_low == dev[id].row_high) { - continue; - } - for (int64_t is = 0; is < is_max; ++is) { - CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0)); } } } @@ -2167,7 +2087,6 @@ static int ggml_cuda_mul_mat_q(ggml_backend_cuda_context & ctx, const ggml_tenso static int ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cgraph * cgraph, int node_n) { - const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q. // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data. 
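With the split path removed from `ggml_cuda_op_mul_mat` above, only `ctx.device` ever computes and the per-device row bookkeeping disappears. For readers comparing against the old `-sm row` behaviour, the deleted `get_row_split` logic amounted to the following (a standalone re-sketch for illustration; the real code obtained the rounding value per device from `get_mmq_y_host` rather than hard-coding it):

```cpp
// Standalone illustration of the row partitioning that the removed `-sm row`
// path performed: device id gets rows [row_low, row_high) derived from the
// cumulative tensor_split fractions, rounded down to a tile multiple.
#include <cstdint>
#include <cstdio>
#include <vector>

static void get_row_split(int64_t nrows, const std::vector<float> & tensor_split,
                          int64_t rounding, int id, int64_t * row_low, int64_t * row_high) {
    const int n_dev = (int)tensor_split.size();
    *row_low  = id == 0 ? 0 : (int64_t)(nrows * tensor_split[id]);
    *row_low -= *row_low % rounding;
    if (id == n_dev - 1) {
        *row_high = nrows;
    } else {
        *row_high  = (int64_t)(nrows * tensor_split[id + 1]);
        *row_high -= *row_high % rounding;
    }
}

int main() {
    // cumulative fractions for 2 GPUs with a 3:1 VRAM ratio, tile rounding of 64 rows
    std::vector<float> tensor_split = {0.0f, 0.75f};
    for (int id = 0; id < 2; ++id) {
        int64_t lo, hi;
        get_row_split(4096, tensor_split, 64, id, &lo, &hi);
        printf("device %d: rows [%lld, %lld)\n", id, (long long)lo, (long long)hi);
    }
    return 0;
}
```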
@@ -2192,26 +2111,11 @@ static int ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor bool any_gpus_with_slow_fp16 = false; - if (split) { - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; - auto & tensor_split = buft_ctx->tensor_split; - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - // skip devices that are not going to do any work: - if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) { - continue; - } + const int cc = ggml_cuda_info().devices[ctx.device].cc; + use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); + any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); - const int cc = ggml_cuda_info().devices[id].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); - } - } else { - const int cc = ggml_cuda_info().devices[ctx.device].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); - } - - if (!split && (use_mul_mat_vec_q || use_mul_mat_q) && src1->ne[2]*src1->ne[3] == 1) { + if ((use_mul_mat_vec_q || use_mul_mat_q) && src1->ne[2]*src1->ne[3] == 1) { return ggml_cuda_mul_mat_q(ctx, src0, src1, dst, cgraph, node_n, use_mul_mat_vec_q); } @@ -2223,13 +2127,13 @@ static int ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // FP32 precision KQ single-batch for batch size 1 without FlashAttention ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst); - } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } else if (any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // FP32 precision KQV single-batch for batch size 1 without FlashAttention ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst); - } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) + } else if (src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch without FlashAttention ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); @@ -2364,7 +2268,6 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * ggml_backend_buffer_is_cuda(src0->buffer) && ggml_backend_buffer_is_cuda(src1->buffer) && ggml_backend_buffer_is_cuda(dst->buffer) && - !ggml_backend_buffer_is_cuda_split(src0->buffer) && src1->type == GGML_TYPE_F32) { int 
device_id = ctx.device; ggml_backend_cuda_buffer_context * src0_ctx = (ggml_backend_cuda_buffer_context *) src0->buffer->context; @@ -2405,8 +2308,7 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * if (next && next->op == GGML_OP_MUL_MAT_ID && next->src[0]->type == src0->type && src1 == next->src[1] && ggml_are_same_shape(src0, next->src[0]) && ggml_backend_buffer_is_cuda(next->src[0]->buffer) && - ggml_backend_buffer_is_cuda(next->buffer) && - !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer)) { + ggml_backend_buffer_is_cuda(next->buffer)) { ggml_backend_cuda_buffer_context * next_src0_ctx = (ggml_backend_cuda_buffer_context *) next->src[0]->buffer->context; ggml_backend_cuda_buffer_context * next_dst_ctx = (ggml_backend_cuda_buffer_context *) next->buffer->context; if (next_src0_ctx->device == device_id && @@ -2432,8 +2334,6 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers"); - cudaStream_t stream = ctx.stream(); const int64_t n_as = ne02; @@ -2572,8 +2472,6 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten ggml_backend_buffer_is_cuda(src0_2->buffer) && ggml_backend_buffer_is_cuda(src1->buffer) && ggml_backend_buffer_is_cuda(dst->buffer) && - !ggml_backend_buffer_is_cuda_split(src0_1->buffer) && - !ggml_backend_buffer_is_cuda_split(src0_2->buffer) && src1->type == GGML_TYPE_F32) { int device_id = ctx.device; ggml_backend_cuda_buffer_context * src0_1_ctx = (ggml_backend_cuda_buffer_context *) src0_1->buffer->context; @@ -2615,7 +2513,6 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten bool fuse_next = next && next->op == GGML_OP_MUL_MAT_ID && ggml_is_quantized(next->src[0]->type) && ggml_backend_buffer_is_cuda(next->src[0]->buffer) && - !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer) && ((ggml_backend_cuda_buffer_context *)next->src[0]->buffer->context)->device == device_id && ggml_backend_buffer_is_cuda(next->buffer) && ((ggml_backend_cuda_buffer_context *)next->buffer->context)->device == device_id; @@ -2673,9 +2570,6 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten } } - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_1->buffer) && "mul_mat_id does not support split buffers"); - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_2->buffer) && "mul_mat_id does not support split buffers"); - GGML_TENSOR_BINARY_OP_LOCALS cudaStream_t stream = ctx.stream(); @@ -2975,8 +2869,6 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor GGML_ASSERT(src1->ne[2] == 1); GGML_ASSERT(src1->ne[3] == 1); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_1->buffer)); - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_2->buffer)); auto stream = ctx.stream(); @@ -3052,10 +2944,6 @@ static inline bool ops_are_same_device(const ggml_cgraph * cgraph, int first, in } static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst, const ggml_cgraph * cgraph, int & i) { - // why is this here instead of mul_mat? 
- if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) { - ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); - } #if IK_PRINT_TIMING int64_t tim1 = ggml_time_us(); @@ -3065,7 +2953,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg auto fusion = ctx.fusion; - //printf("%4d %s(%s)\n", i, ggml_op_name(dst->op), dst->name); + //printf("%4d %s(%s) on device %d. time = %ld\n", i, ggml_op_name(dst->op), dst->name, ctx.device, ggml_time_us()); switch (dst->op) { case GGML_OP_ARGMAX: ggml_cuda_argmax(ctx, dst); @@ -3101,6 +2989,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg cgraph->nodes[i+2]->op == GGML_OP_FUSED_RMS_NORM && ggml_is_contiguous(dst->src[0]) && ggml_is_contiguous(dst->src[1]) && + dst->src[0]->type == GGML_TYPE_F32 && // with split mode "attn" we can end up having f16 ggml_are_same_shape(dst->src[0], dst->src[1]) && dst == cgraph->nodes[i+1]->src[0] && ggml_is_contiguous(cgraph->nodes[i+1]->src[1]) && @@ -3547,7 +3436,43 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ #ifdef GGML_CUDA_NO_PEER_COPY return false; #else - CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); + if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + // + // The goal here is to reduce traffic between GPUs, which is entirely non-negligible + // for prompt processing. + // We cast the tensor to be copied to f16, copy the f16 data peer-to-peer + // and then cast back to f32 on the destination side. + // The cost of converting to/from f16 is much lower than the cost of copying + // two times more data over PCI-E (well, at least the 30 GB/s PCI-E I have). + // But for some reason the following is not working. + // Can somebody tell me why? 
+ // + ggml_cuda_pool_alloc tmp_src(cuda_ctx_src->pool(), ggml_nelements(src)); + ggml_cuda_pool_alloc tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst)); + + auto src_f16 = *src; + src_f16.type = GGML_TYPE_F16; + for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2; + src_f16.data = tmp_src.get(); + + auto dst_f16 = *dst; + dst_f16.type = GGML_TYPE_F16; + for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2; + dst_f16.data = tmp_dst.get(); + + ggml_cuda_set_device(cuda_ctx_src->device); + ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true); + CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream())); + + CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream())); + + ggml_cuda_set_device(cuda_ctx_dst->device); + CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream())); + ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true); + + } else { + CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); + } #endif } @@ -3571,6 +3496,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_cuda_set_device(cuda_ctx->device); CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream())); GGML_UNUSED(backend); @@ -3600,7 +3526,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG - GGML_CUDA_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__); + GGML_CUDA_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer %s\n", __func__, node->src[0]->name); #endif } @@ -3790,6 +3716,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx // TODO const bool integrated = false; //ggml_cuda_info().devices[cuda_ctx->device].integrated; + //printf("======================== %s: graph with %d nodes on device %d. time = %ld\n", __func__, cgraph->n_nodes, cuda_ctx->device, ggml_time_us()); while (!graph_evaluated_or_captured) { // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. // With the use of CUDA graphs, the execution will be performed by the graph launch. 
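On the f16 peer-copy experiment in `ggml_backend_cuda_cpy_tensor_async` above (currently disabled behind `if (false && ...)`): the motivation is a simple bandwidth trade-off, which the toy model below illustrates. The conversion-throughput number is an invented assumption for illustration; only the roughly 30 GB/s PCIe figure comes from the comment in the patch.

```cpp
// Back-of-envelope model for the f16 peer-copy idea: copying half as many bytes
// over PCIe can win even after paying for two conversions.
// Bandwidth/throughput numbers are illustrative assumptions, not measurements.
#include <cstdio>

int main() {
    const double n_elem       = 4096.0 * 8192.0;   // e.g. one activation tensor during prompt processing
    const double pcie_gbps    = 30e9;              // ~30 GB/s PCIe, as mentioned in the comment
    const double convert_gbps = 500e9;             // assumed effective f32<->f16 conversion throughput on-GPU

    const double t_f32 = n_elem * 4.0 / pcie_gbps;
    const double t_f16 = n_elem * 2.0 / pcie_gbps             // half the bytes over the link
                       + 2.0 * (n_elem * 4.0 / convert_gbps); // cast on source + cast on destination

    printf("f32 copy: %.3f ms, f16 copy+convert: %.3f ms\n", t_f32 * 1e3, t_f16 * 1e3);
    return 0;
}
```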
@@ -3823,8 +3750,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { assert(node->src[j]->buffer); - //assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || - // ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); } } #else @@ -3953,277 +3878,6 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t return GGML_STATUS_SUCCESS; } -/* -GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - ggml_cuda_set_device(cuda_ctx->device); - -#ifdef USE_CUDA_GRAPH - static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); - - // Objects required for CUDA Graph - if (cuda_ctx->cuda_graph == nullptr) { - cuda_ctx->cuda_graph.reset(new ggml_cuda_graph()); - } - - bool use_cuda_graph = true; - bool cuda_graph_update_required = false; - // vector of pointers to CUDA cpy kernels, which are required to identify - // kernel parameters which need updated in the graph for each token - std::vector ggml_cuda_cpy_fn_ptrs; - - if (cuda_ctx->cuda_graph->graph == nullptr) { - if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { - cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__); -#endif - } - } - - // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly, - // or previous graph capture failure. - // Also disable for multi-gpu for now. 
TO DO investigate - if (disable_cuda_graphs_due_to_env - || cuda_ctx->cuda_graph->disable_due_to_gpu_arch - || cuda_ctx->cuda_graph->disable_due_to_too_many_updates - || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) { - use_cuda_graph = false; - } - - if (use_cuda_graph) { - if (cuda_ctx->cuda_graph->instance == nullptr) { - cuda_graph_update_required = true; - } - - // Check if the graph size has changed - if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { - cuda_graph_update_required = true; - cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes); - } - - // Loop over nodes in GGML graph to determine if CUDA graph update is required - // and store properties to allow this comparison for the next token - for (int i = 0; i < cgraph->n_nodes; i++) { - bool has_matching_properties = true; - if (!cuda_graph_update_required) { - has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); - } - if (!has_matching_properties) { - cuda_graph_update_required = true; - } - set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); - } - - // Loop over nodes in GGML graph to obtain info needed for CUDA graph - cuda_ctx->cuda_graph->updated_kernel_arg.clear(); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - - if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { - use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); -#endif - } - - if (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_MOE_FUSED_UP_GATE) { - use_cuda_graph = false; // This node type is not supported by CUDA graph capture -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); -#endif - } - - if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { - // disable CUDA graphs for batch size > 1 for now. - // Changes in batch size or context size can cause changes to the grid size of some kernels. - use_cuda_graph = false; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); -#endif - } - if (node->op == GGML_OP_MULTI_ADD && node->ne[1] > 1) { - // disable CUDA graphs for batch size > 1 for now. - // Changes in batch size or context size can cause changes to the grid size of some kernels. - use_cuda_graph = false; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); -#endif - } - - if (node->op == GGML_OP_CPY) { - // store the copy op parameter which changes with each token. - cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data)); - // store a pointer to each copy op CUDA kernel to identify it later - void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]); - if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) { - ggml_cuda_cpy_fn_ptrs.push_back(ptr); - } - } - - if (!use_cuda_graph) { - break; - } - } - - // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. 
- if (use_cuda_graph && cuda_graph_update_required) { - cuda_ctx->cuda_graph->number_consecutive_updates++; - } else { - cuda_ctx->cuda_graph->number_consecutive_updates = 0; - } - - if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { - cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); -#endif - } - } - - if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture - CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); - } - -#else - bool use_cuda_graph = false; - bool cuda_graph_update_required = false; -#endif // USE_CUDA_GRAPH - - bool graph_evaluated_or_captured = false; - - while (!graph_evaluated_or_captured) { - // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. - // With the use of CUDA graphs, the execution will be performed by the graph launch. - if (!use_cuda_graph || cuda_graph_update_required) { - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - ggml_tensor * next = i < cgraph->n_nodes-1 ? cgraph->nodes[i+1] : nullptr; - - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; - } - -#ifndef NDEBUG - assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - if (node->src[j] != nullptr) { - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); - } - } -#endif - - bool skip_next = false; - bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, next, skip_next); - if (!ok) { - GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); - } - GGML_ASSERT(ok); - if (skip_next) ++i; - } - } - -#ifdef USE_CUDA_GRAPH - if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture - if (cuda_ctx->cuda_graph->graph != nullptr) { - CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph)); - cuda_ctx->cuda_graph->graph = nullptr; - } - CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph)); - -#if 0 - if (disable_cuda_graphs_due_to_failed_capture) { - use_cuda_graph = false; - cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true; -#ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__); -#endif - } else { - graph_evaluated_or_captured = true; // CUDA graph has been captured - } -#endif - graph_evaluated_or_captured = true; // CUDA graph has been captured - } else { - graph_evaluated_or_captured = true; // ggml graph has been directly evaluated - } - } - - if (use_cuda_graph) { - if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph. 
- CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); - } - - // Perform update to graph (if required for this token), and change copy parameter (required for every token) - - if (cuda_graph_update_required) { - // Extract nodes from graph - // First call with null argument gets number of nodes in graph - CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes)); - // Subsequent call with non-null argument gets nodes - cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes); - cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes); - if (cuda_ctx->cuda_graph->num_nodes > 0) { - CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes)); - - // Loop over nodes, and extract kernel parameters from each node - for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { - cudaGraphNodeType node_type; - CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type)); - if (node_type == cudaGraphNodeTypeKernel) { - cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime - if (stat == cudaErrorInvalidDeviceFunction) { - // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node. - // We don't need to update blas nodes, so clear error and move on. - cudaGetLastError(); - } else { - GGML_ASSERT(stat == cudaSuccess); - } - } - } - } - } - - // One of the arguments to the copy kernel is updated for each token, hence we need to - // replace that argument with the updated value in the CUDA graph - if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured - int k = 0; - for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { - if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) { - char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++); - cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr; - CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i])); - } - } - } - - // Update graph executable - cudaGraphExecUpdateResultInfo result_info; - cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); - if (stat == cudaErrorGraphExecUpdateFailure) { -#ifndef NDEBUG - GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__); -#endif - // The pre-existing graph exec cannot be updated due to violated constraints - // so instead clear error and re-instantiate - cudaGetLastError(); - CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance)); - cuda_ctx->cuda_graph->instance = nullptr; - CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); - } else { - GGML_ASSERT(stat == cudaSuccess); - } - // Launch graph - CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream())); -#else - graph_evaluated_or_captured = true; -#endif // USE_CUDA_GRAPH - } - - return GGML_STATUS_SUCCESS; -} -*/ - GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; switch (op->op) { @@ -4487,6 +4141,7 @@ GGML_CALL static bool 
ggml_backend_cuda_supports_op(ggml_backend_t backend, cons } GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + //printf("%s(%s, %s): %d, %d\n", __func__, ggml_backend_name(backend), ggml_backend_buft_name(buft), ggml_backend_buft_is_cuda_split(buft), ggml_backend_buft_is_cuda(buft)); if (ggml_backend_buft_is_cuda_split(buft)) { return true; } diff --git a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh index 95a80d54..a21bafb5 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh +++ b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh @@ -18,17 +18,8 @@ struct ggml_cuda_type_traits { static constexpr int qi = 4; }; -// Reminder: -// constexpr int qk = ggml_cuda_type_traits::qk; -// constexpr int qi = ggml_cuda_type_traits::qi; -// constexpr int vdr = get_vdr_mmvq(type); - -// QI4_XS = 256/(4*2) = 32 -// vdr = 4, qi = 32 -> qi/vdr = 8, kqs = 4*(tid%8), blocks_per_iter = 4*1*32/32 = 4 -// vdr = 2, qi = 32 -> qi/vdr =16, kqs = 2*(tid%16), blocks_per_iter = 2*1*32/32 = 2 -namespace { template -__device__ void iqk_mul_mat_vec_q_kerne( +static __device__ void iqk_mul_mat_vec_q_kerne( const void * __restrict__ vx, const void * __restrict__ vy, const float * bias, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size) { @@ -110,7 +101,7 @@ __device__ void iqk_mul_mat_vec_q_kerne( } template -__device__ void iqk_fused_mul_mat_vec_q_kernel( +static __device__ void iqk_fused_mul_mat_vec_q_kernel( const void * __restrict__ vup, const void * __restrict__ vgate, const void * __restrict__ vy, float * __restrict__ dst, const float * __restrict__ bias_u, const float * __restrict__ bias_g, const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size, @@ -228,7 +219,7 @@ template -void iqk_mul_mat_vec_q_cuda(const mmvq_args & args, cudaStream_t stream) { +static void iqk_mul_mat_vec_q_cuda(const mmvq_args & args, cudaStream_t stream) { GGML_ASSERT(args.ncols_x % ggml_blck_size(type) == 0); //GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); @@ -428,7 +419,7 @@ void iqk_mul_mat_vec_q_cuda(const mmvq_args & args, cudaStream_t stream) { } } -__device__ __forceinline__ void get_int_from_table_16_shift(const uint32_t & q4, uint16_t shift, const uint8_t * all_values, +static __device__ __forceinline__ void get_int_from_table_16_shift(const uint32_t & q4, uint16_t shift, const uint8_t * all_values, int & val1, int & val2) { uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; @@ -476,7 +467,7 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con #endif } -__device__ __forceinline__ int int_from_table(const uint8_t * a8, const uint8_t * values) { +static __device__ __forceinline__ int int_from_table(const uint8_t * a8, const uint8_t * values) { uint16_t v1 = values[a8[0]] | (values[a8[1]] << 8); uint16_t v2 = values[a8[2]] | (values[a8[3]] << 8); return v1 | (v2 << 16); @@ -506,8 +497,6 @@ __device__ __forceinline__ int int_from_table(const uint8_t * a8, const uint8_t #define VDR_IQ3_K_Q8_1_MMVQ 4 #define VDR_IQ3_K_Q8_1_MMQ 4 -} // namespace - extern void mul_mat_vec_iq2_k_q8_1_cuda(const mmvq_args & args, cudaStream_t stream); extern void mul_mat_vec_iq3_k_q8_1_cuda(const mmvq_args & args, cudaStream_t stream); extern void mul_mat_vec_iq4_k_q8_1_cuda(const mmvq_args & args, cudaStream_t stream); diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 
98d33ebc..c4619e60 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -176,15 +176,15 @@ static __global__ void rms_norm_f32_nc( } } -template -static __global__ void fused_rms_norm_f32(const float * x, const float * y, float * dst, const int ncols, const float eps) { +template +static __global__ void fused_rms_norm_f32(const src_t * x, const float * y, float * dst, const int ncols, const float eps) { const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row*ncols + col]; + const float xi = (float)x[row*ncols + col]; tmp += xi * xi; } @@ -206,13 +206,13 @@ static __global__ void fused_rms_norm_f32(const float * x, const float * y, floa const float scale = rsqrtf(mean + eps); for (int col = tid; col < ncols; col += block_size) { - dst[row*ncols + col] = scale * y[col] * x[row*ncols + col]; + dst[row*ncols + col] = scale * y[col] * (float)x[row*ncols + col]; } } -template +template static __global__ void fused_rms_norm_f32_nc( - const float * x, const float * y, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, + const src_t * x, const float * y, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps) { const int nrows = gridDim.x; const int nchannels = gridDim.y; @@ -229,7 +229,7 @@ static __global__ void fused_rms_norm_f32_nc( float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[col]; + const float xi = (float)x[col]; tmp += xi * xi; } @@ -257,7 +257,7 @@ static __global__ void fused_rms_norm_f32_nc( const float scale = rsqrtf(mean + eps); for (int col = tid; col < ncols; col += block_size) { - dst[col] = scale * y[col] * x[col]; + dst[col] = scale * y[col] * (float)x[col]; } } @@ -307,7 +307,8 @@ static void rms_norm_f32_nc_cuda( } } -static void fused_rms_norm_f32_cuda(const float * x, const float * y, float * dst, +template +static void fused_rms_norm_f32_cuda(const src_t * x, const float * y, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { constexpr int kBlockSize = 256; GGML_ASSERT(ncols % WARP_SIZE == 0); @@ -331,8 +332,9 @@ static void fused_rms_norm_f32_cuda(const float * x, const float * y, float * ds } } +template static void fused_rms_norm_f32_nc_cuda( - const float * x, const float * y, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, + const src_t * x, const float * y, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) { const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { @@ -432,7 +434,7 @@ void ggml_cuda_op_fused_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT(src0->ne[0] == src1->ne[0]); @@ -445,14 +447,22 @@ void ggml_cuda_op_fused_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * if (ggml_is_contiguous(src0)) { const int64_t nrows = ggml_nrows(src0); - 
fused_rms_norm_f32_cuda(src0_d, src1_d, dst_d, ne00, nrows, eps, stream); + if (src0->type == GGML_TYPE_F32) { + fused_rms_norm_f32_cuda(src0_d, src1_d, dst_d, ne00, nrows, eps, stream); + } else { + fused_rms_norm_f32_cuda((const half *)src0_d, src1_d, dst_d, ne00, nrows, eps, stream); + } } else { auto ts0 = ggml_type_size(src0->type); GGML_ASSERT(src0->nb[0] == ts0); auto s01 = src0->nb[1] / ts0; auto s02 = src0->nb[2] / ts0; auto s03 = src0->nb[3] / ts0; - fused_rms_norm_f32_nc_cuda(src0_d, src1_d, dst_d, ne00, src0->ne[1], src0->ne[2], src0->ne[3], s01, s02, s03, eps, stream); + if (src0->type == GGML_TYPE_F32) { + fused_rms_norm_f32_nc_cuda(src0_d, src1_d, dst_d, ne00, src0->ne[1], src0->ne[2], src0->ne[3], s01, s02, s03, eps, stream); + } else { + fused_rms_norm_f32_nc_cuda((const half *)src0_d, src1_d, dst_d, ne00, src0->ne[1], src0->ne[2], src0->ne[3], s01, s02, s03, eps, stream); + } } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index cee25268..cb9c7562 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7284,7 +7284,19 @@ static struct ggml_tensor * ggml_fused_rms_norm_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result; + if (inplace) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + result = ggml_view_tensor(ctx, a); + } else { + if (a->type == GGML_TYPE_F32) { + result = ggml_dup_tensor(ctx, a); + } else { + result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], a->ne[1], a->ne[2], a->ne[3]); + } + } + + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &eps, sizeof(eps)); diff --git a/include/llama.h b/include/llama.h index ea63574f..3c9b331c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -275,7 +275,8 @@ extern "C" { enum llama_split_mode { LLAMA_SPLIT_MODE_NONE = 0, // single GPU LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_ATTN = 2, // splits self-attention computations across GPUs + LLAMA_SPLIT_MODE_GRAPH = 3, // splits computations across GPUs }; diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 9588d1b9..123fc185 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -621,7 +621,8 @@ ggml_tensor * llm_build_context::llm_build_norm( ggml_tensor * llm_build_context::llm_build_ffn( ggml_context * ctx, llama_context & lctx, - ggml_tensor * cur, + ggml_tensor * ffn_norm, + ggml_tensor * input, ggml_tensor * up, ggml_tensor * up_b, ggml_tensor * up_s, @@ -634,14 +635,83 @@ ggml_tensor * llm_build_context::llm_build_ffn( ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il) { + const llm_build_cb & cb, int il, ggml_cgraph * graph) { + + if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s && + up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR && + (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) { + auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : + type_op == LLM_FFN_RELU ? 
GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU; + auto u = (ggml_split_tensor_t *)up->extra; + auto g = (ggml_split_tensor_t *)gate->extra; + auto d = (ggml_split_tensor_t *)down->extra; + GGML_ASSERT(u->n_device == g->n_device && u->n_device == d->n_device); + std::vector ffn; + ffn.reserve(u->n_device); + for (int id = 0; id < u->n_device; ++id) { + int il_cb = 1000*(id+1) + il; + auto split_u = u->splits[id]; + auto split_g = g->splits[id]; + auto split_d = d->splits[id]; + GGML_ASSERT((!split_u && !split_g && split_d) || (split_u && split_g && split_d)); + if (!split_u) continue; + auto cur = input; + if (ffn_norm && ffn_norm->extra) { + auto norm = (ggml_split_tensor_t *)ffn_norm->extra; + cur = llm_build_norm(ctx, input, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il); + cb(cur, "ffn_inp_normed", il_cb); + } + else if (input->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx, input, GGML_TYPE_F32); + } + cur = ggml_fused_up_gate(ctx, split_u, split_g, cur, unary_op); + cb(cur, "ffn_up_gate", il_cb); + cur = llm_build_lora_mm(lctx, ctx, split_d, cur); + cb(cur, "ffn_down", il_cb); + if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) { + // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } + if (cur->ne[1] >= 32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F16); + } + if (graph) { + ggml_build_forward_expand(graph, cur); + } + ffn.push_back(cur); + } + if (ffn.size() == 1) return ffn.front(); + auto cur = ggml_add(ctx, ffn[0], ffn[1]); + cb(cur, "combine_ffn", il); + cur->op_params[0] = 0xff; + for (int id = 2; id < int(ffn.size()); ++id) { + cur = ggml_add(ctx, cur, ffn[id]); + cb(cur, "combine_ffn", il); + } + if (ffn.size() > 2) { + cur->op_params[0] = 0xff; + } + //if (cur->type != GGML_TYPE_F32) { + // cur = ggml_cast(ctx, cur, GGML_TYPE_F32); + //} + + return cur; + } + + if (ffn_norm) { + input = llm_build_norm(ctx, input, lctx.model.hparams, ffn_norm, NULL, LLM_NORM_RMS, cb, il); + cb(input, "ffn_norm", il); + } + else if (input->type != GGML_TYPE_F32) { + input = ggml_cast(ctx, input, GGML_TYPE_F32); + } if (lctx.cparams.fused_up_gate && up && gate && !up_b && !up_s && !gate_b && !gate_s && type_gate == LLM_FFN_PAR && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) { auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU; - cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op); + auto cur = ggml_fused_up_gate(ctx, up, gate, input, unary_op); cb(cur, "ffn_up_gate", il); if (down) { cur = llm_build_lora_mm(lctx, ctx, down, cur); @@ -663,7 +733,7 @@ ggml_tensor * llm_build_context::llm_build_ffn( return cur; } - struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? 
llm_build_lora_mm(lctx, ctx, up, input) : input; cb(tmp, "ffn_up", il); if (up_b) { @@ -676,6 +746,7 @@ ggml_tensor * llm_build_context::llm_build_ffn( cb(tmp, "ffn_up_s", il); } + auto cur = input; if (gate) { switch (type_gate) { case LLM_FFN_SEQ: @@ -983,6 +1054,183 @@ llm_expert_gating_func_type gating_op, } +ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx, + ggml_tensor * ffn_norm, + ggml_tensor * input, + ggml_tensor * gate_inp, ggml_tensor * gate_inp_b, + ggml_tensor * up_exps, ggml_tensor * up_exps_b, + ggml_tensor * gate_exps, ggml_tensor * gate_exps_b, + ggml_tensor * down_exps, ggml_tensor * down_exps_b, + ggml_tensor * exp_probs_b, + ggml_tensor * up_shexp, ggml_tensor * up_b_shexp, + ggml_tensor * gate_shexp, ggml_tensor * gate_b_shexp, + ggml_tensor * down_shexp, ggml_tensor * down_b_shexp, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, +llm_expert_gating_func_type gating_op, + llm_ffn_op_type type_op_shexp, + const llm_build_cb & cb, int il, ggml_cgraph * graph) { + + auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra; + auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra; + auto split_down_exps = (ggml_split_tensor_t *)down_exps->extra; + auto split_up_shexp = up_shexp ? (ggml_split_tensor_t *)up_shexp->extra : nullptr; + auto split_gate_shexp = gate_shexp ? (ggml_split_tensor_t *)gate_shexp->extra : nullptr; + auto split_down_shexp = down_shexp ? (ggml_split_tensor_t *)down_shexp->extra : nullptr; + auto split_up_b_shexp = up_b_shexp ? (ggml_split_tensor_t *)up_b_shexp : nullptr; + auto split_gate_b_shexp = gate_b_shexp ? (ggml_split_tensor_t *)gate_b_shexp : nullptr; + auto split_down_b_shexp = down_b_shexp ? (ggml_split_tensor_t *)down_b_shexp : nullptr; + if (!split_up_exps && !split_gate_exps && !split_down_exps) { + auto cur = input; + if (ffn_norm) { + auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm; + cur = llm_build_norm(ctx, input, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il); + cb(cur, "ffn_inp_normed", il); + } + else if (cur->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F32); + } + auto the_gate_inp = gate_inp->extra ? ((ggml_split_tensor_t *)gate_inp->extra)->splits[lctx.model.main_gpu] : gate_inp; + auto the_gate_inp_b = gate_inp_b ? gate_inp_b->extra ? ((ggml_split_tensor_t *)gate_inp_b->extra)->splits[lctx.model.main_gpu] : gate_inp_b : nullptr; + auto the_exp_probs_b = exp_probs_b ? exp_probs_b->extra ? ((ggml_split_tensor_t *)exp_probs_b->extra)->splits[lctx.model.main_gpu] : exp_probs_b : nullptr; + //int n_before = graph->n_nodes; + auto routed_out = llm_build_moe_ffn(ctx, lctx, cur, + the_gate_inp, the_gate_inp_b, + up_exps, up_exps_b, + gate_exps, gate_exps_b, + down_exps, down_exps_b, + the_exp_probs_b, + n_expert, n_expert_used, + type_op, norm_w, scale_w, w_scale, + gating_op, cb, il, graph); + cb(routed_out, "routed_out", il); + ggml_build_forward_expand(graph, routed_out); + //printf("Using non-split llm_build_moe_ffn for layer %d. 
n_before = %d, n_now = %d\n", il, n_before, graph->n_nodes); + + if (up_shexp && gate_shexp && down_shexp) { + if (split_up_shexp) { + //printf("Using split ffn for shared experts in layer %d\n", il); + std::vector results(split_up_shexp->n_device); + GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_shexp->n_device); + GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_shexp->n_device); + GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_shexp->n_device); + for (int id = 0; id < split_up_shexp->n_device; ++id) { + int il_cb = 1000*id + il; + auto the_ffn_norm = ffn_norm ? ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[id] : ffn_norm : nullptr; + auto shared_out = llm_build_ffn(ctx, lctx, the_ffn_norm, input, + split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr, + split_gate_shexp->splits[id], split_gate_b_shexp ? split_gate_b_shexp->splits[id] : nullptr, nullptr, + split_down_shexp->splits[id], split_down_b_shexp ? split_down_b_shexp->splits[id] : nullptr, nullptr, + nullptr, type_op_shexp, LLM_FFN_PAR, cb, il); + cb(shared_out, "ffn_shexp_out", il_cb); + if (shared_out->ne[1] > 32) { + shared_out = ggml_cast(ctx, shared_out, GGML_TYPE_F16); + } + results[id] = shared_out; + } + cur = ggml_add(ctx, results[0], results[1]); + cur->op_params[0] = 0xff; + cb(cur, "ffn_shared_combined", il); + for (int id = 2; id < int(results.size()); ++id) { + cur = ggml_add(ctx, cur, results[id]); + cb(cur, "ffn_shared_combined", il); + } + if (routed_out->ne[1] > 32) { + auto routed_out_f16 = ggml_cast(ctx, routed_out, GGML_TYPE_F16); + cur = ggml_add(ctx, routed_out_f16, cur); + } else { + cur = ggml_add(ctx, routed_out, cur); + } + cb(cur, "ffn_out", il); + } else { + //printf("Using non-split ffn for shared experts in layer %d\n", il); + auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur, + up_shexp, up_b_shexp, nullptr, + gate_shexp, gate_b_shexp, nullptr, + down_shexp, down_b_shexp, nullptr, + nullptr, type_op_shexp, LLM_FFN_PAR, cb, il); + cb(shared_out, "ffn_shexp_out", il); + cur = ggml_add(ctx, routed_out, shared_out); + cb(cur, "ffn_out", il); + } + } else { + cur = routed_out; + } + if (cur != routed_out) { + ggml_build_forward_expand(graph, cur); + } + return cur; + } + GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps); + GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device); + std::vector results(split_up_exps->n_device); + GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) || + ( split_up_shexp && split_gate_shexp && split_down_shexp)); + auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra; + GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device); + auto split_exp_probs_b = exp_probs_b ? 
(ggml_split_tensor_t *)exp_probs_b->extra : nullptr; + GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device); + for (int id = 0; id < split_up_exps->n_device; ++id) { + int il_cb = 1000*(id + 1) + il; + auto cur = input; + if (ffn_norm) { + auto split_ffn_norm = (ggml_split_tensor_t *)ffn_norm->extra; + GGML_ASSERT(split_ffn_norm && split_ffn_norm->n_device == split_up_exps->n_device); + cur = llm_build_norm(ctx, input, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il); + cb(cur, "ffn_inp_normed", il_cb); + } + else if (cur->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F32); + } + auto routed_out = llm_build_moe_ffn(ctx, lctx, cur, + split_gate_inp->splits[id], gate_inp_b, + split_up_exps->splits[id], up_exps_b, + split_gate_exps->splits[id], gate_exps_b, + split_down_exps->splits[id], down_exps_b, + split_exp_probs_b ? split_exp_probs_b->splits[id] : nullptr, + n_expert, n_expert_used, + type_op, norm_w, scale_w, w_scale, + gating_op, cb, il, graph); + cb(routed_out, "routed_out", il_cb); + + if (split_up_shexp) { + GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_exps->n_device); + GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_exps->n_device); + GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_exps->n_device); + auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur, + split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr, + split_gate_shexp->splits[id], split_gate_b_shexp ? split_gate_b_shexp->splits[id] : nullptr, nullptr, + split_down_shexp->splits[id], split_down_b_shexp ? split_down_b_shexp->splits[id] : nullptr, nullptr, + nullptr, type_op_shexp, LLM_FFN_PAR, cb, il); + cb(shared_out, "ffn_shexp_out", il_cb); + + cur = ggml_add(ctx, routed_out, shared_out); + cb(cur, "ffn_out", il_cb); + } else { + cur = routed_out; + } + if (cur->ne[1] >= 32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F16); + cb(cur, "ffn_out_f16", il_cb); + } + results[id] = cur; + } + if (results.size() == 1) return results.front(); + + auto cur = ggml_add(ctx, results[0], results[1]); + cur->op_params[0] = 0xff; + cb(cur, "ffn_combined", il); + for (int id = 2; id < int(results.size()); ++id) { + cur = ggml_add(ctx, cur, results[id]); + cb(cur, "ffn_combined", il); + } + return cur; +} + static ggml_tensor * llm_build_kqv( struct ggml_context * ctx, struct llama_context & lctx, @@ -1243,7 +1491,7 @@ std::tuple llm_build_context::llm_buil ggml_tensor * wq, ggml_tensor * bq, ggml_tensor * wk, ggml_tensor * bk, ggml_tensor * wv, ggml_tensor * bv, - float attention_scale, int il) { + float attention_scale, int il) const { auto Qcur = llm_build_lora_mm(lctx, ctx0, wq, cur); cb(Qcur, "Qcur", il); auto Kcur = llm_build_lora_mm(lctx, ctx0, wk, cur); @@ -1282,7 +1530,7 @@ std::tuple llm_build_context::llm_buil ggml_tensor * wq, ggml_tensor * bq, ggml_tensor * wk, ggml_tensor * bk, ggml_tensor * wv, ggml_tensor * bv, - ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) { + ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) const { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); if (wqkv) { @@ -1299,12 +1547,12 @@ std::tuple llm_build_context::llm_buil cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); if (q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il); + Qcur = 
llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il); cb(Qcur, "Qcur_normed", il); ggml_build_forward_expand(gf, Qcur); } if (k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il); + Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il); cb(Kcur, "Kcur_normed", il); ggml_build_forward_expand(gf, Kcur); } @@ -1336,12 +1584,12 @@ std::tuple llm_build_context::llm_buil cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); if (q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il); + Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il); cb(Qcur, "Qcur_normed", il); ggml_build_forward_expand(gf, Qcur); } if (k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il); + Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il); cb(Kcur, "Kcur_normed", il); ggml_build_forward_expand(gf, Kcur); } @@ -1351,21 +1599,79 @@ std::tuple llm_build_context::llm_buil } auto [Q, K, V] = llm_build_mul_mat_qkv(gf, cur, wq, bq, wk, bk, wv, bv, attention_scale, il); - auto Qcur = ggml_reshape_3d(ctx0, Q, n_embd_head, n_head, n_tokens); + auto Qcur = ggml_reshape_3d(ctx0, Q, n_embd_head, Q->ne[0]/n_embd_head, n_tokens); if (q_norm) { Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il); cb(Qcur, "Qcur_normed", il); } - auto Kcur = ggml_reshape_3d(ctx0, K, n_embd_head, n_head_kv, n_tokens); + auto Kcur = ggml_reshape_3d(ctx0, K, n_embd_head, K->ne[0]/n_embd_head, n_tokens); if (k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il); + Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il); cb(Kcur, "Kcur_normed", il); } auto Vcur = V; return {Qcur, Kcur, Vcur}; } +static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml_tensor * cur, ggml_tensor * output, const llm_build_cb & cb) { + // lm_head + if (output->extra) { + auto split_output = (ggml_split_tensor_t *)output->extra; + std::vector o; + o.reserve(split_output->n_device); + for (int id = 0; id < split_output->n_device; ++id) { + auto split = split_output->splits[id]; + if (!split) continue; + o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur)); + cb(o.back(), "output", id); + } + if (o.size() == 1) cur = o.front(); + cur = ggml_concat(ctx, o[0], o[1], 0); + for (int id = 2; id < int(o.size()); ++id) { + cur = ggml_concat(ctx, cur, o[id], 0); + } + } else { + cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur); + } + return cur; +} + +static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml_tensor * cur, ggml_tensor * output, ggml_tensor * output_norm, const llm_build_cb & cb) { + // lm_head + if (output->extra) { + auto split_output = (ggml_split_tensor_t *)output->extra; + auto split_output_norm = output_norm && output_norm->extra ? (ggml_split_tensor_t *)output_norm->extra : nullptr; + std::vector o; + o.reserve(split_output->n_device); + for (int id = 0; id < split_output->n_device; ++id) { + auto split = split_output->splits[id]; + if (!split) continue; + if (output_norm) { + auto the_norm = split_output_norm ? 
split_output_norm->splits[id] : output_norm; + auto cur_normed = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, the_norm, NULL, LLM_NORM_RMS, cb, -1); + cb(cur_normed, "output_normed", 1000*(id+1)); + o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur_normed)); + } else { + o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur)); + } + cb(o.back(), "output", id); + } + if (o.size() == 1) cur = o.front(); + cur = ggml_concat(ctx, o[0], o[1], 0); + for (int id = 2; id < int(o.size()); ++id) { + cur = ggml_concat(ctx, cur, o[id], 0); + } + } else { + if (output_norm) { + cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1); + cb(cur, "output_normed", -1); + } + cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur); + } + return cur; +} + ggml_cgraph * llm_build_context::build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -1405,15 +1711,22 @@ ggml_cgraph * llm_build_context::build_llama() { bool use_rope = model.arch == LLM_ARCH_LLAMA4 ? (il + 1) % hparams.n_no_rope_layer_step != 0 : true; auto this_KQ_mask = hparams.n_swa > 0 && hparams.n_swa_pattern > 0 && il % hparams.n_swa_pattern < (hparams.n_swa_pattern - 1) ? KQ_mask_swa : KQ_mask; + int this_n_swa = this_KQ_mask == KQ_mask_swa ? hparams.n_swa : 0; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // rope freq factors for llama3; may return nullptr for llama2 and other models + //auto rope_factors = build_rope_factors(il); // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + if (use_rope) { + cur = build_std_attention(gf, inpL, inp_pos, nullptr, this_KQ_mask, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il); + } + else { + + auto rope_factors = build_rope_factors(il); + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, model.layers[il].wqkv, model.layers[il].bqkv, @@ -1450,7 +1763,7 @@ ggml_cgraph * llm_build_context::build_llama() { cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, this_KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il, nullptr, - this_KQ_mask == KQ_mask_swa ? 
hparams.n_swa : 0); + this_n_swa); } if (il == n_layer - 1) { @@ -1475,15 +1788,12 @@ ggml_cgraph * llm_build_context::build_llama() { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { // non-MoE - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf); cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_LLAMA4) { // llama4 MoE @@ -1503,7 +1813,7 @@ ggml_cgraph * llm_build_context::build_llama() { cb, il, gf); // Shared experts - ggml_tensor * shexp_out = llm_build_ffn(ctx0, lctx, ffn_inp_normed, + ggml_tensor * shexp_out = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp_normed, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -1551,11 +1861,8 @@ ggml_cgraph * llm_build_context::build_llama() { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb); // For Granite architecture if (hparams.f_logit_scale) { @@ -1664,10 +1971,8 @@ ggml_cgraph * llm_build_context::build_deci() { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1778,10 +2083,7 @@ ggml_cgraph * llm_build_context::build_baichuan() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -1873,10 +2175,7 @@ ggml_cgraph * llm_build_context::build_xverse() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -1986,7 +2285,7 @@ ggml_cgraph * llm_build_context::build_falcon() { // feed forward { - cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result + cur = llm_build_ffn(ctx0, lctx, nullptr, attn_norm, // !! 
use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2105,7 +2404,7 @@ ggml_cgraph * llm_build_context::build_grok() { cb(moe_out, "ffn_moe_out", il); if (model.layers[il].ffn_up) { - ggml_tensor* ffn_out = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor* ffn_out = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2344,10 +2643,7 @@ ggml_cgraph * llm_build_context::build_starcoder() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2424,10 +2720,7 @@ ggml_cgraph * llm_build_context::build_refact() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2604,21 +2897,21 @@ ggml_cgraph * llm_build_context::build_bert() { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2704,10 +2997,7 @@ ggml_cgraph * llm_build_context::build_bloom() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2828,9 +3118,7 @@ ggml_cgraph * llm_build_context::build_mpt() { // feed forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2946,7 +3234,7 @@ ggml_cgraph * llm_build_context::build_stablelm() { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, 
model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3049,10 +3337,7 @@ ggml_cgraph * llm_build_context::build_qwen() { // feed-forward forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3144,10 +3429,7 @@ ggml_cgraph * llm_build_context::build_qwen2() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3247,10 +3529,7 @@ ggml_cgraph * llm_build_context::build_qwen2vl() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3372,7 +3651,7 @@ ggml_cgraph * llm_build_context::build_qwen2moe() { ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -3478,10 +3757,7 @@ ggml_cgraph * llm_build_context::build_qwen3() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3514,9 +3790,6 @@ ggml_cgraph * llm_build_context::build_qwen3() { ggml_cgraph * llm_build_context::build_qwen3moe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3532,46 +3805,18 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - auto rope_cache = cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? 
- ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + //cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); + //cb(cur, "attn_norm", il); - // self_attention - { - auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, - model.layers[il].wqkv, nullptr, - model.layers[il].wqk, nullptr, - model.layers[il].wq, nullptr, model.layers[il].wk, nullptr, model.layers[il].wv, nullptr, - model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0, il); - - if (rope_cache) { - Qcur = ggml_rope_fast(ctx0, Qcur, rope_cache); - Kcur = ggml_rope_fast(ctx0, Kcur, rope_cache); - } else { - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - } - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } + cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il); if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -3579,23 +3824,19 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = - llm_build_moe_ffn(ctx0, lctx, cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLM_EXPERT_GATING_FUNC_SOFTMAX, - cb, il, gf); - cb(cur, "ffn_moe_out", il); + cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, + model.layers[il].ffn_gate_inp, nullptr, + model.layers[il].ffn_up_exps, nullptr, + model.layers[il].ffn_gate_exps, nullptr, + model.layers[il].ffn_down_exps, nullptr, + model.layers[il].ffn_exp_probs_b, + nullptr, nullptr, // we don't have shared expert biases? 
+ nullptr, nullptr, + nullptr, nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, false, 0.0f, + LLM_EXPERT_GATING_FUNC_SOFTMAX, + LLM_FFN_SILU, cb, il, gf); cur = ggml_add(ctx0, cur, ffn_inp); cur = lctx.cvec.apply_to(ctx0, cur, il); @@ -3607,11 +3848,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3712,12 +3949,7 @@ ggml_cgraph * llm_build_context::build_qwen3vl() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3986,7 +4218,7 @@ ggml_cgraph * llm_build_context::build_phi2() { // FF { - ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output, + ffn_output = llm_build_ffn(ctx0, lctx, nullptr, attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4099,14 +4331,11 @@ ggml_cgraph * llm_build_context::build_phi3() { cur = ggml_add(ctx0, cur, residual); residual = cur; - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // FF // special-case: the up and gate tensors are merged into a single tensor // TOOD: support into llm_build_ffn { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4195,7 +4424,7 @@ ggml_cgraph * llm_build_context::build_plamo() { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4292,10 +4521,7 @@ ggml_cgraph * llm_build_context::build_gpt2() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4394,10 +4620,7 @@ ggml_cgraph * llm_build_context::build_codeshell() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4485,10 +4708,7 @@ ggml_cgraph * llm_build_context::build_orion() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, 
LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4578,10 +4798,7 @@ ggml_cgraph * llm_build_context::build_internlm2() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4691,10 +4908,7 @@ ggml_cgraph * llm_build_context::build_minicpm() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4793,12 +5007,9 @@ ggml_cgraph * llm_build_context::build_gemma() { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, sa_out, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4903,12 +5114,9 @@ ggml_cgraph * llm_build_context::build_gemma2() { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, sa_out, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5034,11 +5242,8 @@ ggml_cgraph * llm_build_context::build_gemma3() { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // feed-forward network - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, sa_out, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5132,11 +5337,7 @@ ggml_cgraph * llm_build_context::build_starcoder2() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5392,7 +5593,7 @@ ggml_cgraph * llm_build_context::build_command_r() { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp, 
model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5523,7 +5724,7 @@ ggml_cgraph * llm_build_context::build_olmo() { cur = llm_build_norm(ctx0, ffn_inp, hparams, NULL, NULL, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5637,10 +5838,7 @@ ggml_cgraph * llm_build_context::build_openelm() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5744,7 +5942,7 @@ ggml_cgraph * llm_build_context::build_gptneox() { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5772,7 +5970,7 @@ ggml_cgraph * llm_build_context::build_gptneox() { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5865,10 +6063,7 @@ ggml_cgraph * llm_build_context::build_arctic() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6398,7 +6593,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() { cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6423,7 +6618,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6482,22 +6677,26 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { // output token IDs (for last layer cropping) struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - auto rope_cache = cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? + auto rope_cache = model.split_mode != LLAMA_SPLIT_MODE_GRAPH && cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? 
ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; + float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; for (int il = 0; il < n_transformer_layers; ++il) { struct ggml_tensor * inpSA = inpL; - // Pre-attention norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - // self-attention - { + if (rope_cache == nullptr) { + cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, kq_scale, 0.0f, 0, il); + } else { + // Pre-attention norm + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, model.layers[il].wqkv, model.layers[il].bqkv, model.layers[il].wqk, model.layers[il].bqk, @@ -6539,46 +6738,57 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // Post-attention norm - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "post_attn_norm", il); - if ((uint32_t) il < hparams.n_layer_dense_lead) { // dense FFN - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf); cb(cur, "ffn_out", il); } else { - // MoE FFN - struct ggml_tensor * routed_out = llm_build_moe_ffn(ctx0, lctx, cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, + cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, + model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b, + model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b, + model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b, + model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b, model.layers[il].ffn_exp_probs_b, + model.layers[il].ffn_up_shexp, nullptr, // we don't have shared expert biases? 
+ model.layers[il].ffn_gate_shexp, nullptr, + model.layers[il].ffn_down_shexp, nullptr, n_expert, n_expert_used, - LLM_FFN_SILU, hparams.expert_weights_norm, - true, hparams.expert_weights_scale, - (enum llm_expert_gating_func_type) hparams.expert_gating_func, - cb, il, gf); - cb(routed_out, "routed_out", il); + LLM_FFN_SILU, hparams.expert_weights_norm, true, hparams.expert_weights_scale, + (llm_expert_gating_func_type) hparams.expert_gating_func, + LLM_FFN_SILU, cb, il, gf); - { - struct ggml_tensor * shared_out = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(shared_out, "ffn_shexp_out", il); + //// Post-attention norm + //cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); + //cb(cur, "post_attn_norm", il); + //// MoE FFN + //auto routed_out = llm_build_moe_ffn(ctx0, lctx, cur, + // model.layers[il].ffn_gate_inp, + // model.layers[il].ffn_up_exps, + // model.layers[il].ffn_gate_exps, + // model.layers[il].ffn_down_exps, + // model.layers[il].ffn_exp_probs_b, + // n_expert, n_expert_used, + // LLM_FFN_SILU, hparams.expert_weights_norm, + // true, hparams.expert_weights_scale, + // (enum llm_expert_gating_func_type) hparams.expert_gating_func, + // cb, il, gf); + //cb(routed_out, "routed_out", il); - cur = ggml_add(ctx0, routed_out, shared_out); - cb(cur, "ffn_out", il); - } + //auto shared_out = llm_build_ffn(ctx0, lctx, nullptr, cur, + // model.layers[il].ffn_up_shexp, NULL, NULL, + // model.layers[il].ffn_gate_shexp, NULL, NULL, + // model.layers[il].ffn_down_shexp, NULL, NULL, + // NULL, + // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + //cb(shared_out, "ffn_shexp_out", il); + + //cur = ggml_add(ctx0, routed_out, shared_out); + //cb(cur, "ffn_out", il); } // residual and context vector @@ -6592,12 +6802,8 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { cur = inpL; - // final norm - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - // lm head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6828,10 +7034,7 @@ ggml_cgraph * llm_build_context::build_bitnet_158() { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -6945,7 +7148,7 @@ ggml_cgraph * llm_build_context::build_cohere2() { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + cur = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -7063,11 +7266,8 @@ ggml_cgraph * llm_build_context::build_t5_encoder() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm_enc, NULL, 
LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm_enc, ffn_inp, model.layers[il].ffn_up_enc, NULL, NULL, model.layers[il].ffn_gate_enc, NULL, NULL, model.layers[il].ffn_down_enc, NULL, NULL, @@ -7251,11 +7451,8 @@ ggml_cgraph * llm_build_context::build_t5_decoder() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7349,10 +7546,7 @@ ggml_cgraph * llm_build_context::build_jais() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7454,10 +7648,7 @@ ggml_cgraph * llm_build_context::build_chatglm() { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7580,12 +7771,8 @@ ggml_cgraph * llm_build_context::build_glm4() { // FF { - // Pre-MLP norm - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - // MLP - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7698,7 +7885,7 @@ ggml_cgraph * llm_build_context::build_dots1() { cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7721,7 +7908,7 @@ ggml_cgraph * llm_build_context::build_dots1() { cb(moe_out, "ffn_moe_out", il); { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -7842,10 +8029,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5() { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7967,10 +8151,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() { bool is_moe_layer = static_cast(il) >= hparams.n_layer_dense_lead && (il + 1) % 
hparams.n_moe_layer_step == 0; if (!is_moe_layer) { - cur = llm_build_norm(ctx0, ffn_inp,hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7998,7 +8179,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() { // Shared expert (if present) if (hparams.n_ff_shexp > 0) { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -8115,7 +8296,7 @@ ggml_cgraph * llm_build_context::build_hunyuan_moe() { cb(cur, "ffn_norm", il); // feed-forward network (non-MoE) - ggml_tensor * cur_mlp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * cur_mlp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -8363,7 +8544,7 @@ ggml_cgraph * llm_build_context::build_bailingmoe2() { cb(cur, "ffn_norm", il); if (static_cast(il) < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -8386,7 +8567,7 @@ ggml_cgraph * llm_build_context::build_bailingmoe2() { cb, il, gf); cb(moe_out, "ffn_moe_out", il); - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -8606,10 +8787,7 @@ ggml_cgraph* llm_build_context::build_smollm3() { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9010,3 +9188,210 @@ ggml_cgraph * llm_build_context::llama_build_graph( return result; } + +ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in, + ggml_tensor * KQ_mask, ggml_tensor * sinks, float KQ_scale, float f_attn_scale, int n_swa, int il) { + if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn && + model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) { + if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) { + ggml_split_tensor_t * attn_norm = model.layers[il].attn_norm ? 
(ggml_split_tensor_t *)model.layers[il].attn_norm->extra : nullptr; + auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra; + auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra; + auto wv = (ggml_split_tensor_t *)model.layers[il].wv->extra; + auto wo = (ggml_split_tensor_t *)model.layers[il].wo->extra; + GGML_ASSERT(wq->n_device == wk->n_device && wq->n_device == wv->n_device && wq->n_device == wo->n_device); + auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra; + auto vl = (ggml_split_tensor_t *)kv_self.v_l[il]->extra; + GGML_ASSERT(wq->n_device == kl->n_device && wq->n_device == vl->n_device); + ggml_split_tensor_t *bq = nullptr, *bo = nullptr, *bk = nullptr, *bv = nullptr; + if (model.layers[il].bq && model.layers[il].bq->extra) { + bq = (ggml_split_tensor_t *)model.layers[il].bq->extra; + GGML_ASSERT(bq->n_device == wq->n_device); + } + if (model.layers[il].bo && model.layers[il].bo->extra) { + bo = (ggml_split_tensor_t *)model.layers[il].bo->extra; + GGML_ASSERT(bo->n_device == wq->n_device); + } + if (model.layers[il].bk && model.layers[il].bk->extra) { + bk = (ggml_split_tensor_t *)model.layers[il].bk->extra; + GGML_ASSERT(bk->n_device == wq->n_device); + } + if (model.layers[il].bv && model.layers[il].bv->extra) { + bv = (ggml_split_tensor_t *)model.layers[il].bv->extra; + GGML_ASSERT(bv->n_device == wq->n_device); + } + std::vector attn; attn.reserve(wq->n_device); + for (int id = 0; id < wq->n_device; ++id) { + int il_cb = 1000*(id+1) + il; + auto split_wq = wq->splits[id]; + auto split_wk = wk->splits[id]; + auto split_wv = wv->splits[id]; + auto split_wo = wo->splits[id]; + auto split_kl = kl->splits[id]; + auto split_vl = vl->splits[id]; + GGML_ASSERT((!split_wq && !split_wk && !split_wv && !split_wo && !split_kl && !split_vl) || + (split_wq && split_wk && split_wv && split_wo && split_kl && split_vl)); + if (!split_wq) continue; + auto cur = input; + if (attn_norm) { + auto split_norm = attn_norm->splits[id]; + cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il_cb); + } + else if (cur->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); + } + auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ? + ((ggml_split_tensor_t *)model.layers[il].attn_q_norm->extra)->splits[id] : model.layers[il].attn_q_norm : nullptr; + auto the_k_norm = model.layers[il].attn_k_norm ? model.layers[il].attn_k_norm->extra ? + ((ggml_split_tensor_t *)model.layers[il].attn_k_norm->extra)->splits[id] : model.layers[il].attn_k_norm : nullptr; + auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, nullptr, nullptr, nullptr, nullptr, + split_wq, bq ? bq->splits[id] : nullptr, + split_wk, bk ? bk->splits[id] : nullptr, + split_wv, bv ? 
bv->splits[id] : nullptr, + the_q_norm, the_k_norm, f_attn_scale, il_cb); + auto rope_factors = rope_factors_in; + if (!rope_factors && model.layers[il].rope_freqs && model.layers[il].rope_freqs->extra) { + auto extra = (ggml_split_tensor_t *)model.layers[il].rope_freqs->extra; + rope_factors = extra->splits[id]; + } + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il_cb); + cb(Kcur, "Kcur", il_cb); + ggml_build_forward_expand(gf, Qcur); + ggml_build_forward_expand(gf, Kcur); + ggml_build_forward_expand(gf, Vcur); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_head_kv = split_wk->ne[1] / n_embd_head_k; + + GGML_ASSERT(kv_self.size == cparams.n_ctx); + + auto idx = 2*wq->n_device*il + 2*id; + GGML_ASSERT(idx+1 < (int)lctx.cache_copies.size()); + auto k_row_size = ggml_row_size(split_kl->type, n_embd_head_k); + ggml_tensor * k_cache_view = ggml_view_2d(ctx0, split_kl, n_embd_head_k, n_tokens*n_head_kv, + k_row_size, k_row_size*n_head_kv*kv_head); + + lctx.cache_copies[idx+0].cpy = ggml_cpy(ctx0, Kcur, k_cache_view); + lctx.cache_copies[idx+0].step = k_row_size*n_head_kv; + + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(gf, lctx.cache_copies[idx+0].cpy); + + struct ggml_tensor * v_cache_view = nullptr; + + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx0, split_vl, n_tokens*split_wv->ne[1], + kv_head*ggml_row_size(split_vl->type, split_wv->ne[1])); + lctx.cache_copies[idx+1].step = ggml_row_size(split_vl->type, split_wv->ne[1]); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, split_vl, n_tokens, split_wv->ne[1], + ( n_ctx)*ggml_element_size(split_vl), + (kv_head)*ggml_element_size(split_vl)); + lctx.cache_copies[idx+1].step = ggml_element_size(split_vl); + + Vcur = ggml_transpose(ctx0, Vcur); + } + cb(v_cache_view, "v_cache_view", il_cb); + + lctx.cache_copies[idx+1].cpy = ggml_cpy(ctx0, Vcur, v_cache_view); + ggml_build_forward_expand(gf, lctx.cache_copies[idx+1].cpy); + + auto q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + cb(q, "q", il_cb); + + auto k = ggml_view_3d(ctx0, split_kl, n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(split_kl->type, n_embd_head_k)*n_head_kv, //n_embd_k_gqa), + ggml_row_size(split_kl->type, n_embd_head_k), 0); + cb(k, "k", il_cb); + + auto v = ggml_view_3d(ctx0, split_vl, n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(split_vl->type, split_wv->ne[1]), + ggml_row_size(split_vl->type, n_embd_head_v), 0); + cb(v, "v", il_cb); + +#ifdef GGML_USE_VULKAN + constexpr bool use_f32_precision = true; +#else + constexpr bool use_f32_precision = false; +#endif + cur = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask, KQ_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); + cb(cur, "flash_attn", il_cb); + ggml_flash_attn_ext_add_sinks(cur, sinks); + if (n_swa > 0) { + ((int32_t *)cur->op_params)[4] = n_swa; + } + + // Some models produced NaNs/gibberish when FA is computed with f16 precision on CUDA + if (use_f32_precision || model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || + (model.arch == LLM_ARCH_DEEPSEEK2 && q->ne[1] <= 8) || model.arch == LLM_ARCH_COHERE2 || model.arch == LLM_ARCH_GLM4 || + model.arch == LLM_ARCH_GLM4_MOE) { + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + } + + cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens); + cb(cur, "flash_attn_reshaped", il_cb); + + cur = llm_build_lora_mm(lctx, ctx0, split_wo, cur); + if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) { + // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } + cb(cur, "kqv_wo", il_cb); + if (bo) { + cur = ggml_add(ctx0, cur, bo->splits[id]); + cb(cur, "kqv_wo_biased", il_cb); + } + if (cur->ne[1] >= 32) { + cur = ggml_cast(ctx0, cur, GGML_TYPE_F16); + } + ggml_build_forward_expand(gf, cur); + attn.push_back(cur); + } + if (attn.size() == 1) return attn.front(); + auto cur = ggml_add(ctx0, attn[0], attn[1]); + cb(cur, "combine_attn", il); + cur->op_params[0] = 0xff; + for (int id = 2; id < (int)attn.size(); ++id) { + cur = ggml_add(ctx0, cur, attn[id]); + cb(cur, "combine_attn", il); + } + // TODO: for more than 2 GPUs, do we need to add another forced graph split? + //if (attn.size() > 2) { + // cur->op_params[0] = 0xff; + //} + return cur; + } + } + + auto cur = input; + if (model.layers[il].attn_norm) { + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + } + + auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, + model.layers[il].wqkv, model.layers[il].bqkv, + model.layers[il].wqk, model.layers[il].bqk, + model.layers[il].wq, model.layers[il].bq, model.layers[il].wk, model.layers[il].bk, model.layers[il].wv, model.layers[il].bv, + model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, f_attn_scale, il); + + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors_in, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, rope_factors_in, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa); + + return cur; +} diff --git a/src/llama-build-context.h b/src/llama-build-context.h index a96a49d4..328b51ce 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ -148,7 +148,7 @@ struct llm_build_context { ggml_tensor * wq, ggml_tensor * bq, ggml_tensor * wk, ggml_tensor * bk, ggml_tensor * wv, ggml_tensor * bv, - float attention_scale, int il); + float attention_scale, int il) const; std::tuple llm_build_mul_mat_qkv(ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * wqkv, ggml_tensor * bqkv, @@ -156,7 +156,7 @@ struct llm_build_context { ggml_tensor * wq, ggml_tensor * bq, ggml_tensor * wk, ggml_tensor * bk, ggml_tensor * wv, ggml_tensor * bv, - ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, 
int il); + ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) const; ggml_cgraph * build_llama(); @@ -317,7 +317,7 @@ struct llm_build_context { float kq_scale, const llm_build_cb & cb, int il, ggml_tensor * sinks = nullptr, int n_swa = 0); - static ggml_tensor * llm_build_ffn(ggml_context * ctx, llama_context & lctx, + static ggml_tensor * llm_build_ffn(ggml_context * ctx, llama_context & lctx, ggml_tensor * ffn_norm, ggml_tensor * cur, ggml_tensor * up, ggml_tensor * up_b, @@ -331,7 +331,7 @@ struct llm_build_context { ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il); + const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr); static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx, ggml_tensor * cur, @@ -375,6 +375,27 @@ llm_expert_gating_func_type gating_op, gating_op, cb, il, graph); } + static ggml_tensor * llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx, + ggml_tensor * ffn_norm, + ggml_tensor * input, + ggml_tensor * gate_inp, ggml_tensor * gate_inp_b, + ggml_tensor * up_exps, ggml_tensor * up_exps_b, + ggml_tensor * gate_exps, ggml_tensor * gate_exps_b, + ggml_tensor * down_exps, ggml_tensor * down_exps_b, + ggml_tensor * exp_probs_b, + ggml_tensor * up_shexp, ggml_tensor * up_b_shexp, + ggml_tensor * gate_shexp, ggml_tensor * gate_b_shexp, + ggml_tensor * down_shexp, ggml_tensor * down_b_shexp, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, +llm_expert_gating_func_type gating_op, + llm_ffn_op_type type_op_shexp, + const llm_build_cb & cb, int il, ggml_cgraph * graph); + static ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids); static ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx); @@ -383,4 +404,7 @@ llm_expert_gating_func_type gating_op, static ggml_cgraph * llama_build_graph(llama_context & lctx, const llama_batch & batch, bool worst_case); + ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * rope_factors, + ggml_tensor * KQ_mask, ggml_tensor * sinks, float KQ_scale, float f_attn_scale, int n_swa, int il); + }; diff --git a/src/llama-context.h b/src/llama-context.h index bb21d880..61ad51e5 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -57,6 +57,9 @@ struct llama_kv_cache { std::vector k_l; // per layer std::vector v_l; + std::vector split_k_l; + std::vector split_v_l; + std::vector ctxs; std::vector bufs; diff --git a/src/llama-impl.h b/src/llama-impl.h index 38844506..b6d84369 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -224,3 +224,8 @@ struct gguf_context; std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i); ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer); + +struct llama_split_tensor { + std::vector tensor_splits; + ggml_split_tensor_t ggml; +}; diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 5f671fe7..a0f0f0bf 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #define LLAMA_API_INTERNAL @@ -139,7 +140,7 @@ struct create_tensors_helper : public create_tensors_helper_interface { ggml_context ** actual_ctx = nullptr); void create_default_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool norm_bias); - void create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = 
true); + void create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = true, bool use_ctx_split = false); void create_std_attn(int i, const LLM_TN & tn, llama_layer & layer, int n_embd, int n_embd_gqa, ggml_context * ctx_split); void create_std_ffn(int i, const LLM_TN & tn, llama_layer & layer, int n_ff, int n_embd, ggml_context * ctx_split); @@ -153,12 +154,15 @@ struct create_tensors_helper : public create_tensors_helper_interface { std::map buft_layer_count; std::map ctx_map; + ggml_context * split_ctx = nullptr; size_t ctx_size; ggml_context * ctx_input; ggml_context * ctx_output; ggml_context * ctx_output_split; + std::unordered_set split_tensors; + inline ggml_context * ctx_for_buft(ggml_backend_buffer_type_t buft) { if (auto it = ctx_map.find(buft); it != ctx_map.end()) return it->second; @@ -179,6 +183,14 @@ struct create_tensors_helper : public create_tensors_helper_interface { create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_model & _model) : ml(_ml), model(_model) { +#if 0 + for (int i = 0; i < model.hparams.n_layer; ++i) { + printf("Layer %2d: %s %s\n", i, ggml_backend_buft_name(model.buft_layer[i].buft_matrix), ggml_backend_buft_name(model.buft_layer[i].buft)); + } + printf("Output: %s %s\n", ggml_backend_buft_name(model.buft_output.buft_matrix), ggml_backend_buft_name(model.buft_output.buft)); + printf(" Input: %s %s\n", ggml_backend_buft_name(model.buft_input.buft_matrix), ggml_backend_buft_name(model.buft_input.buft)); +#endif + const int n_layer = model.hparams.n_layer; buft_layer_count[model.buft_input.buft]++; buft_layer_count[model.buft_input.buft_matrix]++; @@ -192,6 +204,11 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output ctx_size += ggml_tensor_overhead()*n_layer*3; // for moe merged tensors + if (model.splits.size() > 1) { + ctx_size += ggml_tensor_overhead()*n_layer*4; // for KV cache + ctx_size *= (model.splits.size() + 1); + } + for (auto & it : buft_layer_count) { struct ggml_init_params params = { /*.mem_size =*/ ctx_size, @@ -205,10 +222,95 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod ctx_map[it.first] = ctx; model.ctxs.push_back(ctx); } + if (model.split_buft) { + if (auto it = ctx_map.find(model.split_buft); it != ctx_map.end()) { + split_ctx = it->second; + } + } +#if 0 + printf("=======================================================================\n"); + auto n_device = model.device_count(); + printf(" Model has %d devices:\n", n_device); + for (int device = 0; device < n_device; ++device) { + auto buft = model.default_buffer_type_offload(device); + if (buft) { + printf(" %d %s\n", device, ggml_backend_buft_name(buft)); + } else { + printf(" Oops: null buft for debvice %d\n", device); + } + } + if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) { + printf("model.splits:"); + for (auto s : model.splits) printf(" %g", s); + printf("\n"); + } +#endif +} + +static std::vector create_split(int nr, int granularity, const std::vector & splits, const std::vector & mem_used) { + GGML_ASSERT(nr % granularity == 0); + GGML_ASSERT(!splits.empty()); + if (granularity < 0) return std::vector(splits.size(), nr); + GGML_ASSERT(mem_used.size() == splits.size()); + size_t tot_memory_used = 1; + for (auto & mem : mem_used) tot_memory_used += mem; + int nchunk = nr / granularity; + std::vector result(splits.size()); + float last_split = 0; + int 
sum = 0; + for (int i = 0; i < (int)splits.size(); ++i) { + float p = splits[i] - last_split; + p += (p - 1.f*mem_used[i]/tot_memory_used); + result[i] = roundf(p*nchunk); + if (result[i] < 0) result[i] = 0; + sum += result[i]; + last_split = splits[i]; + } + while (sum > nchunk) { + last_split = 0; + float best_err = std::numeric_limits::max(); + int ibest = -1; + for (int i = 0; i < (int)splits.size(); ++i) { + if (result[i] > 0) { + float p = splits[i] - last_split; + float n_want = p*nchunk; + float err = std::abs(n_want - result[i] + 1); + //float err = std::abs(n_want - result[i] + 1) + std::abs(p - 1.f*mem_used[i]/tot_memory_used)*nchunk; + if (err < best_err) { + best_err = err; ibest = i; + } + } + last_split = splits[i]; + } + GGML_ASSERT(ibest >= 0 && result[ibest] > 0); + --result[ibest]; + --sum; + } + while (sum < nchunk) { + last_split = 0; + float best_err = std::numeric_limits::max(); + int ibest = -1; + for (int i = 0; i < (int)splits.size(); ++i) { + float p = splits[i] - last_split; + float n_want = p*nchunk; + float err = std::abs(n_want - result[i] - 1); + //float err = std::abs(n_want - result[i] - 1) + std::abs(p - 1.f*mem_used[i]/tot_memory_used)*nchunk; + if (err < best_err) { + best_err = err; ibest = i; + } + last_split = splits[i]; + } + GGML_ASSERT(ibest >= 0); + ++result[ibest]; + ++sum; + } + for (auto & r : result) r *= granularity; + return result; } ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector & ne, int flags, ggml_context ** actual_context) { + //auto requested_ctx = ctx; if (ml.tensor_buft_overrides) { for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { std::regex pattern(overrides->pattern); @@ -220,7 +322,12 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std } } if (actual_context) *actual_context = ctx; - return ml.create_tensor(ctx, name, ne, flags); + auto tensor = ml.create_tensor(ctx, name, ne, flags); + if (tensor && ctx == split_ctx) { + //printf("%s: adding tensor %s to split tensors\n", __func__, tensor->name); + split_tensors.insert(tensor); + } + return tensor; } #define LOADING_PRELUDE \ @@ -251,17 +358,18 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std bool use_mmap_buffer = true; -void create_tensors_helper::create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm) { +void create_tensors_helper::create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm, bool use_ctx_split) { model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + auto out_ctx = use_ctx_split ? 
ctx_output_split : ctx_output; if (has_norm) { - model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); } - model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + model.output = create_tensor(out_ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -280,7 +388,7 @@ void create_tensors_helper::create_std_ffn(int i, const LLM_TN & tn, llama_layer bool create_tensors_helper::create_llama_tensors(const LLM_TN & tn) { LOADING_PRELUDE - create_embd_output(tn, n_embd, n_vocab); + create_embd_output(tn, n_embd, n_vocab, true, true); for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); @@ -288,7 +396,7 @@ bool create_tensors_helper::create_llama_tensors(const LLM_TN & tn) { auto & layer = model.layers[i]; - layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); use_mmap_buffer &= !merge_qkv(tn, i, 1); @@ -297,12 +405,12 @@ bool create_tensors_helper::create_llama_tensors(const LLM_TN & tn) { // optional bias tensors layer.bo = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm = create_tensor(model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.rope_freqs = create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); if (n_expert == 0) { - create_std_ffn(i, tn, layer, n_ff, n_embd, ctx_split); + create_std_ffn(i, tn, layer, n_ff, n_embd, model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? 
ctx_split : ctx_layer); // optional MLP bias layer.ffn_gate_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -1043,11 +1151,11 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) { // output { - model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -1057,18 +1165,19 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) { auto & layer = model.layers[i]; - layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); use_mmap_buffer &= !merge_qkv(tn, i, 0); layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); - layer.attn_k_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}); - layer.attn_q_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}); + layer.attn_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}); + layer.attn_q_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}); - layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + auto ffn_ctx = model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer; + layer.ffn_norm = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + layer.ffn_gate_inp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); if (n_expert == 0) { throw std::runtime_error("n_expert must be > 0 for QWEN3MOE"); @@ -1080,9 +1189,9 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) { // MoE branch const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; - layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); - layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}); - layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + layer.ffn_gate_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + layer.ffn_down_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}); + layer.ffn_up_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); } return use_mmap_buffer; } @@ -1734,7 +1843,7 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) { GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers"); GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers"); - create_embd_output(tn, n_embd, n_vocab); + create_embd_output(tn, n_embd, n_vocab, true, true); for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); @@ -1748,7 +1857,7 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) { auto & layer = model.layers[i]; - layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); + layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); // GLM-style attention with bias terms if (!flags) { @@ -1765,12 +1874,17 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) { layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags); // K/Q norm tensors (optional for GLM-4.5 355B variant) - layer.attn_q_norm = create_tensor(ctx_layer, + layer.attn_q_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags); - layer.attn_k_norm = create_tensor(ctx_layer, + layer.attn_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags); - layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags); + auto ffn_ctx = model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer; + + // Why are we adding an additional tensor type? 
+ // attn_post_norm is the exact same thing as ffn_norm + //layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags); + layer.ffn_norm = create_tensor(ffn_ctx, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags); // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE @@ -1778,35 +1892,35 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) { if (use_moe) { // MoE layers - layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags); + layer.ffn_gate_inp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags); // gate bias - layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags); + layer.ffn_exp_probs_b = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags); // MoE branch const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; - layer.ffn_gate_exps = create_tensor(ctx_split, + layer.ffn_gate_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags); - layer.ffn_down_exps = create_tensor(ctx_split, + layer.ffn_down_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags); - layer.ffn_up_exps = create_tensor(ctx_split, + layer.ffn_up_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags); // Shared expert if (n_expert_shared > 0) { const int64_t n_ff_shexp = n_ff_exp * n_expert_shared; - layer.ffn_gate_shexp = create_tensor(ctx_split, + layer.ffn_gate_shexp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); - layer.ffn_down_shexp = create_tensor(ctx_split, + layer.ffn_down_shexp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags); - layer.ffn_up_shexp = create_tensor(ctx_split, + layer.ffn_up_shexp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); } } else { // Dense layers (first k layers) - GLM uses separate gate/up projections - layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags); - layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags); - layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags); + layer.ffn_gate = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags); + layer.ffn_down = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags); + layer.ffn_up = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags); } // --- NextN / MTP tensors (preserved but unused), on the final layer --- if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { @@ -2629,18 +2743,77 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); if (bias) { auto flags = bias == 1 ? 
llama_model_loader::TENSOR_NOT_REQUIRED : 0; - layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {layer.wq->ne[1]}, flags); - layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {layer.wk->ne[1]}, flags); - layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {layer.wv->ne[1]}, flags); + layer.bq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "bias", i), {layer.wq->ne[1]}, flags); + layer.bk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "bias", i), {layer.wk->ne[1]}, flags); + layer.bv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "bias", i), {layer.wv->ne[1]}, flags); } } return fused_qkv; } +static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor * tensor, llama_split_tensor & split_tensor, + const std::vector & splits, std::vector & mem_used) { + GGML_ASSERT(split_dim <= 1); + GGML_ASSERT(splits.size() > 1); + std::string name{tensor->name}; + split_tensor.tensor_splits.resize(splits.size()); + if (split_dim < 0) { + for (int i = 0; i < int(splits.size()); ++i) { + if (splits[i] > 0) { + split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2]); + auto name_i = name + '.' + std::to_string(i); + ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str()); + } else { + split_tensor.tensor_splits[i] = nullptr; + } + } + } + else if (split_dim == 1) { + for (int i = 0; i < int(splits.size()); ++i) { + if (splits[i] > 0) { + split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, tensor->ne[0], splits[i], tensor->ne[2]); + auto name_i = name + '.' + std::to_string(i); + ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str()); + } else { + split_tensor.tensor_splits[i] = nullptr; + } + } + } else { + for (int i = 0; i < int(splits.size()); ++i) { + if (splits[i] > 0) { + split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, splits[i], tensor->ne[1], tensor->ne[2]); + auto name_i = name + '.' 
+ std::to_string(i); + ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str()); + } else { + split_tensor.tensor_splits[i] = nullptr; + } + } + } + split_tensor.ggml.n_device = splits.size(); + split_tensor.ggml.split_dim = split_dim; + split_tensor.ggml.splits = split_tensor.tensor_splits.data(); + tensor->extra = (void *)&split_tensor.ggml; + GGML_ASSERT(mem_used.size() >= splits.size()); + for (int i = 0; i < split_tensor.ggml.n_device; ++i) { + if (split_tensor.ggml.splits[i]) { + //auto nbytes = ggml_nbytes(split_tensor.ggml.splits[i]); + //printf("mem_used(%s): %8.2f, total: %8.2f\n", split_tensor.ggml.splits[i]->name, nbytes/1024./1024., (mem_used[i] + nbytes)/1024./1024.); + mem_used[i] += ggml_nbytes(split_tensor.ggml.splits[i]); + } + } +} + bool create_tensors_helper::create_tensors() { const auto tn = LLM_TN(model.arch); bool use_mmap_buffer = true; + if (ml.merge_qkv && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) { + LLAMA_LOG_WARN("\n========================================================\n"); + LLAMA_LOG_WARN("merge_qkv is not compatible with split mode 'graph'\n"); + LLAMA_LOG_WARN(" => turning off merge_qkv\n"); + LLAMA_LOG_WARN("========================================================\n\n"); + ml.merge_qkv = false; + } switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: @@ -2761,6 +2934,157 @@ bool create_tensors_helper::create_tensors() { default: throw std::runtime_error("unknown architecture"); } + if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) { + std::vector<size_t> mem_used(model.splits.size(), 0); + const auto & hparams = model.hparams; + int gqa_ratio = hparams.n_head() / hparams.n_head_kv(); + //printf("GQA ratio: %d\n", gqa_ratio); + for (int il = 0; il < int(model.layers.size()); ++il) { + if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) { + LLAMA_LOG_INFO("%s: not splitting layer %d because buffer type is host\n", __func__, il); + continue; + } + auto & layer = model.layers[il]; + auto ctx_split = ctx_for_layer_split(il); + if (layer.attn_norm) { + auto split = create_split(ggml_nrows(layer.attn_norm), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.attn_norm, layer.split_attn_norm, split, mem_used); + } + if (layer.rope_freqs) { + auto split = create_split(ggml_nrows(layer.rope_freqs), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.rope_freqs, layer.split_rope_freqs, split, mem_used); + } + if (layer.wo && layer.wq && layer.wk && layer.wv) { + int attn_granularity = hparams.n_embd_head_k * gqa_ratio; + if (ggml_is_quantized(layer.wo->type)) { + auto tt = ggml_internal_get_type_traits(layer.wo->type); + if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size; + } + GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0); + auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits, mem_used); + prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split, mem_used); + if (layer.bo) { + prepare_split_tensors(-1, ctx_split, layer.bo, layer.split_bo, split, mem_used); + } + if (layer.bq) { + prepare_split_tensors(0, ctx_split, layer.bq, layer.split_bq, split, mem_used); + } + if (layer.attn_q_norm) { + prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, split, mem_used); + } + for (auto & s : split) s /= gqa_ratio; + prepare_split_tensors(1,
ctx_split, layer.wk, layer.split_wk, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split, mem_used); + if (layer.bk) { + prepare_split_tensors(0, ctx_split, layer.bk, layer.split_bk, split, mem_used); + } + if (layer.bv) { + prepare_split_tensors(0, ctx_split, layer.bv, layer.split_bv, split, mem_used); + } + if (layer.attn_k_norm) { + prepare_split_tensors(-1, ctx_split, layer.attn_k_norm, layer.split_k_norm, split, mem_used); + } + } + + if (layer.ffn_norm) { + if (auto it = split_tensors.find(layer.ffn_norm); it != split_tensors.end()) { + auto split = create_split(ggml_nrows(layer.ffn_norm), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.ffn_norm, layer.split_ffn_norm, split, mem_used); + } + } + + if (layer.ffn_down && layer.ffn_up && layer.ffn_gate) { + bool use_split = split_tensors.find(layer.ffn_down) != split_tensors.end() && + split_tensors.find(layer.ffn_gate) != split_tensors.end() && + split_tensors.find(layer.ffn_up) != split_tensors.end(); + if (use_split) { + int ffn_granularity = 16; + if (ggml_is_quantized(layer.ffn_down->type)) { + auto tt = ggml_internal_get_type_traits(layer.ffn_down->type); + if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size; + } + auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, model.splits, mem_used); + prepare_split_tensors(0, ctx_split, layer.ffn_down, layer.split_ffn_down, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_up, layer.split_ffn_up, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split, mem_used); + } + } + + //bool any_ffn_split = false; + if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) { + bool use_split = split_tensors.find(layer.ffn_down_shexp) != split_tensors.end() && + split_tensors.find(layer.ffn_gate_shexp) != split_tensors.end() && + split_tensors.find(layer.ffn_up_shexp) != split_tensors.end(); + if (use_split) { + //any_ffn_split = true; + int ffn_granularity = 16; + if (ggml_is_quantized(layer.ffn_down_shexp->type)) { + auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type); + if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size; + } + auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits, mem_used); + prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used); + } + } + + if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) { + bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() && + split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() && + split_tensors.find(layer.ffn_up_exps) != split_tensors.end(); + + if (use_split) { + //any_ffn_split = true; + int ffn_granularity = 16; + if (ggml_is_quantized(layer.ffn_down_exps->type)) { + auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type); + if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size; + } + auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits, mem_used); + //printf("split(%2d):", il); for (auto & s : split) printf(" %d", s); printf("\n"); + prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used); + prepare_split_tensors(1, 
ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split, mem_used); + } + } + + if (layer.ffn_gate_inp) { + if (auto it = split_tensors.find(layer.ffn_gate_inp); it != split_tensors.end()) { + auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.ffn_gate_inp, layer.split_ffn_gate_inp, shared_split, mem_used); + } + } + if (layer.ffn_exp_probs_b) { + if (auto it = split_tensors.find(layer.ffn_exp_probs_b); it != split_tensors.end()) { + auto shared_split = create_split(ggml_nrows(layer.ffn_exp_probs_b), -1, model.splits, mem_used); + prepare_split_tensors(-1, ctx_split, layer.ffn_exp_probs_b, layer.split_ffn_exp_probs_b, shared_split, mem_used); + } + } + } + + if (model.output) { + if (auto it = split_tensors.find(model.output); it != split_tensors.end()) { + if (ggml_backend_buft_is_host(model.buft_output.buft_matrix)) { + LLAMA_LOG_INFO("%s: not splitting output tensor because buffer is host\n", __func__); + } else { + auto ctx_split = ctx_map[model.buft_output.buft_matrix]; + auto split = create_split(model.output->ne[1], 16, model.splits, mem_used); + prepare_split_tensors(1, ctx_split, model.output, model.split_output, split, mem_used); + if (auto it = split_tensors.find(model.output_norm); it != split_tensors.end() && !ggml_backend_buft_is_host(model.buft_output.buft_matrix)) { + auto ctx_split = ctx_map[model.buft_output.buft_matrix]; + prepare_split_tensors(-1, ctx_split, model.output_norm, model.split_output_norm, split, mem_used); + } + } + } + } + + LLAMA_LOG_INFO("Estimated model buffer size per device:\n"); + for (int i = 0; i < int(mem_used.size()); ++i) { + LLAMA_LOG_INFO(" Device %d: %8.2f MiB\n", i, mem_used[i]/1024./1024.); + } + } return use_mmap_buffer; } diff --git a/src/llama-model.h b/src/llama-model.h index 342d6df6..d6188721 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -183,6 +183,24 @@ struct llama_layer { struct ggml_tensor * bqk = nullptr; struct ggml_tensor * bkv = nullptr; + llama_split_tensor split_attn_norm; + llama_split_tensor split_wq; + llama_split_tensor split_wk; + llama_split_tensor split_wv; + llama_split_tensor split_wo; + llama_split_tensor split_wqkv; + llama_split_tensor split_wqk; + llama_split_tensor split_wkv; + llama_split_tensor split_bq; + llama_split_tensor split_bk; + llama_split_tensor split_bv; + llama_split_tensor split_bo; + llama_split_tensor split_bqkv; + llama_split_tensor split_bqk; + llama_split_tensor split_bkv; + llama_split_tensor split_q_norm; + llama_split_tensor split_k_norm; + // relative position bias struct ggml_tensor * attn_rel_b = nullptr; struct ggml_tensor * attn_rel_b_enc = nullptr; @@ -205,12 +223,22 @@ struct llama_layer { struct ggml_tensor * ffn_down_enc = nullptr; struct ggml_tensor * ffn_up_enc = nullptr; + llama_split_tensor split_ffn_up; + llama_split_tensor split_ffn_gate; + llama_split_tensor split_ffn_down; + llama_split_tensor split_ffn_norm; + // ff MoE struct ggml_tensor * ffn_gate_inp = nullptr; struct ggml_tensor * ffn_gate_exps = nullptr; struct ggml_tensor * ffn_down_exps = nullptr; struct ggml_tensor * ffn_up_exps = nullptr; + llama_split_tensor split_ffn_gate_inp; + llama_split_tensor split_ffn_up_exps; + llama_split_tensor split_ffn_gate_exps; + llama_split_tensor split_ffn_down_exps; + // ff MoE bias struct ggml_tensor * ffn_gate_inp_b = nullptr; struct ggml_tensor * ffn_gate_exps_b =
nullptr; @@ -226,6 +254,15 @@ struct llama_layer { struct ggml_tensor * ffn_down_shexp = nullptr; struct ggml_tensor * ffn_up_shexp = nullptr; + llama_split_tensor split_ffn_up_shexp; + llama_split_tensor split_ffn_gate_shexp; + llama_split_tensor split_ffn_down_shexp; + + llama_split_tensor split_ffn_gate_inp_b; + llama_split_tensor split_ffn_gate_exps_b; + llama_split_tensor split_ffn_down_exps_b; + llama_split_tensor split_ffn_up_exps_b; + // ff bias struct ggml_tensor * ffn_gate_b = nullptr; struct ggml_tensor * ffn_down_b = nullptr; // b2 @@ -233,6 +270,12 @@ struct llama_layer { struct ggml_tensor * ffn_act = nullptr; struct ggml_tensor * ffn_exp_probs_b = nullptr; + llama_split_tensor split_ffn_gate_b; + llama_split_tensor split_ffn_down_b; + llama_split_tensor split_ffn_up_b; + llama_split_tensor split_ffn_act; + llama_split_tensor split_ffn_exp_probs_b; + // mamba proj struct ggml_tensor * ssm_in = nullptr; struct ggml_tensor * ssm_x = nullptr; @@ -253,6 +296,8 @@ struct llama_layer { struct ggml_tensor * rope_short = nullptr; struct ggml_tensor * rope_freqs = nullptr; + llama_split_tensor split_rope_freqs; + // bitnet scale struct ggml_tensor * wq_scale = nullptr; struct ggml_tensor * wk_scale = nullptr; @@ -298,6 +343,9 @@ struct llama_model { struct ggml_tensor * output_b; struct ggml_tensor * output_norm_enc; + llama_split_tensor split_output; + llama_split_tensor split_output_norm; + std::vector layers; llama_split_mode split_mode; @@ -358,6 +406,12 @@ struct llama_model { } void set_tensor_overrides(const llama_model_params& params); + + int device_count() const; + ggml_backend_buffer_type_t default_buffer_type_offload(int device) const; + + std::vector splits; + ggml_backend_buffer_type_t split_buft = nullptr; }; struct llama_lora_weight { diff --git a/src/llama.cpp b/src/llama.cpp index 587ff62f..6ea2b4cf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -108,6 +108,7 @@ #include #include #include +#include #include #include #include @@ -460,18 +461,18 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ GGML_UNUSED(gpu); } -static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) { +static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu) { ggml_backend_buffer_type_t buft = nullptr; #ifdef GGML_USE_CUDA if (ggml_backend_cuda_get_device_count() > 1) { - buft = ggml_backend_cuda_split_buffer_type(tensor_split); + buft = ggml_backend_cuda_split_buffer_type(model.splits.data()); } #endif #ifdef GGML_USE_SYCL if (ggml_backend_sycl_get_device_count() > 1) { - buft = ggml_backend_sycl_split_buffer_type(tensor_split); + buft = ggml_backend_sycl_split_buffer_type(model.splits.data()); } #endif @@ -480,7 +481,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo } return buft; - GGML_UNUSED(tensor_split); +} + +int llama_model::device_count() const { + return llama_get_device_count(*this); +} + +ggml_backend_buffer_type_t llama_model::default_buffer_type_offload(int device) const { + return llama_default_buffer_type_offload(*this, device); } static size_t llama_get_device_memory(const llama_model & model, int device) { @@ -548,23 +556,49 @@ bool llama_context::can_reuse_graph(const llama_batch & u_batch) { } bool llama_context::update_cache_copies() { - int n_layer = cache_copies.size()/2; + int n_layer = model.hparams.n_layer - model.hparams.nextn_predict_layers; 
//cache_copies.size()/2; if ((int)kv_self.k_l.size() != n_layer) return false; if (!(kv_self.v_l.empty() || (int)kv_self.v_l.size() == n_layer)) return false; - for (int il = 0; il < n_layer; ++il) { - auto& c = cache_copies[2*il+0]; - if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) return false; - c.cpy->view_offs = kv_self.head*c.step; - c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs; - c.cpy->data = c.cpy->src[1]->data; - } - if (kv_self.v_l.empty()) return true; - for (int il = 0; il < n_layer; ++il) { - auto& c = cache_copies[2*il+1]; - if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) return false; - c.cpy->view_offs = kv_self.head*c.step; - c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs; - c.cpy->data = c.cpy->src[1]->data; + if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) { + for (int il = 0; il < n_layer; ++il) { + auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra; + auto vl = !kv_self.v_l.empty() && kv_self.v_l[il] ? (ggml_split_tensor_t *)kv_self.v_l[il]->extra : nullptr; + GGML_ASSERT(kl && (!kv_self.v_l[il] || vl)); + if (vl) { + GGML_ASSERT(kl->n_device == vl->n_device); + } + for (int id = 0; id < kl->n_device; ++id) { + auto& c = cache_copies[2*model.splits.size()*il + 2*id + 0]; + if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kl->splits[id]) return false; + c.cpy->view_offs = kv_self.head*c.step; + c.cpy->src[1]->data = (char *)kl->splits[id]->data + c.cpy->view_offs; + c.cpy->data = c.cpy->src[1]->data; + } + if (!vl) continue; + for (int id = 0; id < vl->n_device; ++id) { + auto& c = cache_copies[2*model.splits.size()*il + 2*id + 1]; + if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != vl->splits[id]) return false; + c.cpy->view_offs = kv_self.head*c.step; + c.cpy->src[1]->data = (char *)vl->splits[id]->data + c.cpy->view_offs; + c.cpy->data = c.cpy->src[1]->data; + } + } + } else { + for (int il = 0; il < n_layer; ++il) { + auto& c = cache_copies[2*il+0]; + if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) return false; + c.cpy->view_offs = kv_self.head*c.step; + c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs; + c.cpy->data = c.cpy->src[1]->data; + } + if (kv_self.v_l.empty()) return true; + for (int il = 0; il < n_layer; ++il) { + auto& c = cache_copies[2*il+1]; + if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) return false; + c.cpy->view_offs = kv_self.head*c.step; + c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs; + c.cpy->data = c.cpy->src[1]->data; + } } return true; } @@ -572,7 +606,11 @@ bool llama_context::update_cache_copies() { llama_context::llama_context(const llama_model & model) : model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) { const auto & hparams = model.hparams; - cache_copies.resize(2*hparams.n_layer); + if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) { + cache_copies.resize(2*model.splits.size()*hparams.n_layer); + } else { + cache_copies.resize(2*hparams.n_layer); + } } llama_context::~llama_context() { @@ -626,42 +664,35 @@ static bool llama_kv_cache_init( } } + bool split_cache = false; + if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.arch != 
LLM_ARCH_DEEPSEEK2 && offload) { + cache.split_k_l.reserve(n_layer); + cache.split_v_l.reserve(n_layer); + split_cache = true; + } + // count used buffer types std::map buft_layer_count; if (offload) { for (int64_t i = 0; i < n_layer; ++i) { - buft_layer_count[model.buft_layer[i].buft]++; + if (split_cache) { + buft_layer_count[model.buft_layer[i].buft_matrix]++; + } else { + buft_layer_count[model.buft_layer[i].buft]++; + } } } else { buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; } - //if (cparams.fused_moe_up_gate) { - // int nbad = 0; - // for (int i = 0; i < (int) n_layer; i++) { - // auto& layer = model.layers[i]; - // if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) { - // ++nbad; - // } - // } - // if (nbad > 0) { - // if (nbad == (int)n_layer) { - // LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n"); - // const_cast(cparams).fused_moe_up_gate = false; - // } - // else { - // LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n", - // nbad, (int)n_layer); - // } - // } - //} - // create a context for each buffer type std::map ctx_map; for (auto & it : buft_layer_count) { int n_layers = it.second; + size_t ctx_mem_size = 5u*n_layers*ggml_tensor_overhead(); + if (split_cache) ctx_mem_size += 2*model.splits.size()*n_layers*ggml_tensor_overhead(); struct ggml_init_params params = { - /*.mem_size =*/ 5u*n_layers*ggml_tensor_overhead(), + /*.mem_size =*/ ctx_mem_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -698,24 +729,25 @@ static bool llama_kv_cache_init( } } - cache.k_l.reserve(n_layer); bool needs_v_cache = true; + cache.k_l.reserve(n_layer); if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) { needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn; } if (needs_v_cache) cache.v_l.reserve(n_layer); + std::vector mem_split(model.splits.size(), 0); + int n_mla = 0; for (int i = 0; i < (int) n_layer; i++) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); const uint32_t n_head_kv = hparams.n_head_kv(i); const uint32_t n_embd_head_k= hparams.n_embd_head_k; - - struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); + struct ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? 
@@ -698,24 +729,25 @@ static bool llama_kv_cache_init(
         }
     }
 
-    cache.k_l.reserve(n_layer);
     bool needs_v_cache = true;
+    cache.k_l.reserve(n_layer);
     if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
         needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn;
     }
     if (needs_v_cache) cache.v_l.reserve(n_layer);
+    std::vector<size_t> mem_split(model.splits.size(), 0);
+
     int n_mla = 0;
     for (int i = 0; i < (int) n_layer; i++) {
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
         const uint32_t n_head_kv    = hparams.n_head_kv(i);
         const uint32_t n_embd_head_k= hparams.n_embd_head_k;
-
-        struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+        struct ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
 
         ggml_tensor * k;
         ggml_tensor * v;
-        if (cparams.mla_attn) {
+        if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
            // DeepSeek MLA
            const uint32_t n_embd_head_qk_rope = hparams.n_rot;
            const uint32_t kv_lora_rank = hparams.n_lora_kv;
@@ -740,10 +772,53 @@ static bool llama_kv_cache_init(
         else {
             k = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, n_head_kv*kv_size);
             v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
-            ggml_format_name(k, "cache_k_l%d", i);
-            ggml_format_name(v, "cache_v_l%d", i);
+            auto k_name = std::string{"cache_k_l"} + std::to_string(i);
+            auto v_name = std::string{"cache_v_l"} + std::to_string(i);
+            ggml_set_name(k, k_name.c_str());
+            ggml_set_name(v, v_name.c_str());
+            //ggml_format_name(k, "cache_k_l%d", i);
+            //ggml_format_name(v, "cache_v_l%d", i);
             cache.k_l.push_back(k);
             cache.v_l.push_back(v);
+            if (split_cache) {
+                auto K = model.layers[i].wk;
+                auto V = model.layers[i].wv;
+                if (K && V && K->extra && V->extra) {
+                    auto extra_K = (const ggml_split_tensor_t *)K->extra;
+                    auto extra_V = (const ggml_split_tensor_t *)V->extra;
+                    auto & split_k_l = cache.split_k_l.emplace_back();
+                    auto & split_v_l = cache.split_v_l.emplace_back();
+                    split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
+                    split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
+                    for (int is = 0; is < extra_K->n_device; ++is) {
+                        auto split = extra_K->splits[is];
+                        if (!split) continue;
+                        split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, split->ne[1]/n_embd_head_k * kv_size);
+                        auto split_name = k_name + '.' + std::to_string(is);
+                        ggml_set_name(split_k_l.tensor_splits[is], split_name.c_str());
+                        mem_split[is] += ggml_nbytes(split_k_l.tensor_splits[is]);
+                    }
+                    split_k_l.ggml.n_device = extra_K->n_device;
+                    split_k_l.ggml.split_dim = 0;
+                    split_k_l.ggml.splits = split_k_l.tensor_splits.data();
+                    for (int is = 0; is < extra_V->n_device; ++is) {
+                        auto split = extra_V->splits[is];
+                        if (!split) continue;
+                        split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size);
+                        auto split_name = v_name + '.' + std::to_string(is);
+                        ggml_set_name(split_v_l.tensor_splits[is], split_name.c_str());
+                        mem_split[is] += ggml_nbytes(split_v_l.tensor_splits[is]);
+                    }
+                    split_v_l.ggml.n_device = extra_V->n_device;
+                    split_v_l.ggml.split_dim = 0;
+                    split_v_l.ggml.splits = split_v_l.tensor_splits.data();
+                    k->extra = (void *)&split_k_l.ggml;
+                    v->extra = (void *)&split_v_l.ggml;
+                }
+                //} else {
+                //    printf("Oops: don't have yet K and V for layer %d\n", i);
+                //}
+            }
         }
     }
     if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
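A side sketch, not part of the patch: the loop above sizes each device's K cache split from that device's share of the wk weight (split->ne[1] rows, i.e. split->ne[1]/n_embd_head_k KV heads) and each V split from its share of wv. A hedged standalone illustration of the resulting element counts, with hypothetical names:

// Sketch only: element counts of the per-device K/V cache splits created above.
#include <cstdint>

struct kv_split_sizes {
    int64_t k_elements;   // 2D tensor: n_embd_head_k x (heads_on_device * kv_size)
    int64_t v_elements;   // 1D tensor: wv_rows_on_device * kv_size
};

static kv_split_sizes kv_split_elements(int64_t wk_rows_on_device, int64_t wv_rows_on_device,
                                        int64_t n_embd_head_k, int64_t kv_size) {
    const int64_t heads_on_device = wk_rows_on_device / n_embd_head_k;  // split->ne[1]/n_embd_head_k above
    return { n_embd_head_k * heads_on_device * kv_size, wv_rows_on_device * kv_size };
}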
@@ -756,15 +831,46 @@
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-        if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
-            return false;
+        int ntensor = 0;
+        for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            ++ntensor;
+        }
+        if (ntensor > 0) {
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            if (!buf) {
+                LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+                return false;
+            }
+            ggml_backend_buffer_clear(buf, 0);
+            LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+            cache.bufs.push_back(buf);
         }
-        ggml_backend_buffer_clear(buf, 0);
-        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-        cache.bufs.push_back(buf);
     }
+    if (split_cache) {
+        LLAMA_LOG_INFO("%s: KV cache size per device:\n", __func__);
+        for (int i = 0; i < int(mem_split.size()); ++i) printf("  Device %d: %g MiB\n", i, mem_split[i]/1024./1024.);
+    }
+
+#if 0
+    for (int il = 0; il < n_layer; ++il) {
+        if (cache.k_l[il]->extra) {
+            printf("Layer %2d, K-buffer: %p:", il, (void *)cache.k_l[il]->buffer);
+            auto split_kl = (ggml_split_tensor_t *)cache.k_l[il]->extra;
+            for (int id = 0; id < split_kl->n_device; ++id) {
+                if (split_kl->splits[id]) printf(" %p,%p", (void *)split_kl->splits[id]->data, (void *)split_kl->splits[id]->buffer);
+            }
+            printf("\n");
+        }
+        if (cache.v_l[il]->extra) {
+            printf("Layer %2d, V-buffer: %p:", il, (void *)cache.v_l[il]->buffer);
+            auto split_vl = (ggml_split_tensor_t *)cache.v_l[il]->extra;
+            for (int id = 0; id < split_vl->n_device; ++id) {
+                if (split_vl->splits[id]) printf(" %p,%p", (void *)split_vl->splits[id]->data, (void *)split_vl->splits[id]->buffer);
+            }
+            printf("\n");
+        }
+    }
+#endif
 
     return true;
 }
@@ -1617,6 +1723,16 @@ static void ggml_backend_add_from_device(llama_context* ctx, ggml_backend_t back
     }
 }
 
+static bool is_model_split_supported(const llama_model & model) {
+    static std::unordered_set<llm_arch> k_supported = {
+        LLM_ARCH_LLAMA,
+        LLM_ARCH_QWEN3MOE,
+        LLM_ARCH_GLM4_MOE,
+    };
+    auto it = k_supported.find(model.arch);
+    return it != k_supported.end();
+}
+
 // Returns false if cancelled by progress_callback
 static bool llm_load_tensors(
         llama_model_loader & ml,
@@ -1634,6 +1750,16 @@ static bool llm_load_tensors(
     auto & hparams = model.hparams;
 
+    if (split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) {
+        if (!is_model_split_supported(model)) {
+            LLAMA_LOG_WARN("\n=======================================================\n");
+            LLAMA_LOG_WARN("Split mode 'graph' is not supported for this model\n");
+            LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
+            LLAMA_LOG_WARN("=======================================================\n\n");
+            split_mode = LLAMA_SPLIT_MODE_LAYER;
+        }
+    }
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
@@ -1652,10 +1778,7 @@ static bool llm_load_tensors(
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }
 
-    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
-        // calculate the split points
-        // int device_count = llama_get_device_count(model);
-        int device_count = model.devices.size();
+    if (int device_count = model.devices.size(); device_count > 1) {
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
         std::vector<float> splits(device_count);
         if (all_zero) {
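A side sketch, not part of the patch: in the hunk that follows, layers are mapped to devices with std::upper_bound over model.splits, which is assumed here to hold normalized cumulative (prefix-sum) fractions of the per-device proportions. A minimal standalone illustration of that lookup, with a hypothetical helper name:

// Sketch only: layer -> device assignment as used below, assuming `splits` is a
// normalized running sum of per-device fractions.
#include <algorithm>
#include <vector>

static int layer_device(const std::vector<float> & splits, int i, int i_gpu_start, int act_gpu_layers) {
    const float f = float(i - i_gpu_start)/act_gpu_layers;
    return int(std::upper_bound(splits.begin(), splits.end(), f) - splits.begin());
}

// Example: splits = {0.75f, 1.0f} sends roughly the first 75% of the offloaded layers
// to device 0 and the remainder to device 1.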
@@ -1676,46 +1799,47 @@ static bool llm_load_tensors(
         for (int i = 0; i < device_count; ++i) {
             splits[i] /= split_sum;
         }
+        model.splits = std::move(splits);
+    } else {
+        model.splits = { 1.0f };
+    }
+
+    int device_count = model.splits.size();
+    // assign the repeating layers to the devices according to the splits
+    int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
+    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
-        // assign the repeating layers to the devices according to the splits
-        int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int i = i_gpu_start; i < n_layer; ++i) {
-            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-#ifndef NDEBUG
-            ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
-            const char* name = ggml_backend_buft_name(buft);
-            LLAMA_LOG_DEBUG("load_tensors: layers %3d assigned to backend %s\n", i,
-                name);
-#endif
+            int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
             model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
-            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-#ifndef NDEBUG
-            ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
-            const char* name = ggml_backend_buft_name(buft);
-            LLAMA_LOG_DEBUG("load_tensors: output layers assigned to backend %s\n",
-                name);
-#endif
+            int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - model.splits.begin();
             model.buft_output = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
-        if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], tensor_split);
+        if ((split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) {
+            split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu]);
+            model.split_buft = split_buft;
         } else {
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
             split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
         }
+        auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
         // assign the repeating layers
         for (int i = i_gpu_start; i < n_layer; ++i) {
-            model.buft_layer[i] = {
-                split_buft,
-                llama_default_buffer_type_offload(model, model.devices[main_gpu])
-            };
+            if (split_mode == LLAMA_SPLIT_MODE_ATTN) {
+                int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count,
+                        float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
+                model.buft_layer[i] = { split_buft, llama_default_buffer_type_offload(model, model.devices[layer_gpu]) };
+                printf("Layer %d: assigning buft_layer to GPU %d\n", i, layer_gpu);
+            } else {
+                model.buft_layer[i] = { split_buft, buft_layer };
+            }
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
@@ -1807,24 +1931,33 @@ static bool llm_load_tensors(
             }
 #endif
             else {
-                ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-                if (buf == nullptr) {
-                    throw std::runtime_error("unable to allocate backend buffer");
+                int ntensor = 0;
+                for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+                    ++ntensor;
                 }
-                model.bufs.push_back(buf);
-                if (use_mlock && ggml_backend_buffer_is_host(buf)) {
-                    model.mlock_bufs.emplace_back(new llama_mlock);
-                    auto & mlock_buf = model.mlock_bufs.back();
-                    mlock_buf->init   (ggml_backend_buffer_get_base(buf));
-                    mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
-                }
-                for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
-                    bufs.emplace(idx, buf);
+                if (ntensor > 0) {
+                    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+                    if (buf == nullptr) {
+                        LLAMA_LOG_ERROR("Failed to allocate buffer type %s\n", ggml_backend_buft_name(buft));
+                        throw std::runtime_error("unable to allocate backend buffer");
+                    }
+                    model.bufs.push_back(buf);
+                    if (use_mlock && ggml_backend_buffer_is_host(buf)) {
+                        model.mlock_bufs.emplace_back(new llama_mlock);
+                        auto & mlock_buf = model.mlock_bufs.back();
+                        mlock_buf->init   (ggml_backend_buffer_get_base(buf));
+                        mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
+                    }
+                    for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                        bufs.emplace(idx, buf);
+                    }
                 }
             }
             if (bufs.empty()) {
-                throw std::runtime_error("failed to allocate buffer");
+                LLAMA_LOG_WARN("No tensors in buffer type %s\n", ggml_backend_buft_name(buft));
+                continue;
+                //throw std::runtime_error("failed to allocate buffer (1)");
             }
             for (auto & buf : bufs) {
@@ -4326,8 +4459,8 @@ struct llama_context * llama_new_context_with_model(
         ggml_backend_add_from_device(ctx, ctx->backend_metal);
     }
 #elif defined(GGML_USE_CUDA)
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+    if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+        // with split_mode LLAMA_SPLIT_MODE_NONE, only the main GPU backend is used
         ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu, cparams.cuda_params);
         if (backend == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -4337,7 +4470,7 @@ struct llama_context * llama_new_context_with_model(
         ggml_backend_add_from_device(ctx, backend);
     } else {
-        // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+        // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_GRAPH require a backend for each GPU
         for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
             ggml_backend_t backend = ggml_backend_cuda_init(device, cparams.cuda_params);
             if (backend == nullptr) {
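A side sketch, not part of the patch: under the CUDA hunks above, LLAMA_SPLIT_MODE_NONE initializes only the main GPU backend, while the other split modes (layer, graph, attn) initialize one backend per visible device. A minimal standalone illustration of that selection, with hypothetical names:

// Sketch only: which device indices get a CUDA backend per split mode.
#include <vector>

enum class split_mode { none, layer, graph, attn };

static std::vector<int> devices_to_init(split_mode mode, int main_gpu, int device_count) {
    if (mode == split_mode::none) {
        return { main_gpu };                   // only the main GPU backend is used
    }
    std::vector<int> devs;                     // layer/graph/attn: one backend per GPU
    for (int d = 0; d < device_count; ++d) devs.push_back(d);
    return devs;
}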
@@ -4346,12 +4479,11 @@ struct llama_context * llama_new_context_with_model(
                 return nullptr;
             }
             ggml_backend_add_from_device(ctx, backend);
-        }
     }
 #elif defined(GGML_USE_VULKAN)
-    if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-        LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
+    if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH || model->split_mode == LLAMA_SPLIT_MODE_ATTN) {
+        LLAMA_LOG_ERROR("%s: split mode 'graph' or 'attn' not supported. Failed to initialize Vulkan backend\n", __func__);
         llama_free(ctx);
         return nullptr;
     }
@@ -4375,8 +4507,8 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 #elif defined(GGML_USE_SYCL)
-    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
+    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
         ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
         if (backend == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
@@ -4407,9 +4539,9 @@ struct llama_context * llama_new_context_with_model(
         ggml_backend_add_from_device(ctx, backend);
     }
 #elif defined(GGML_USE_CANN)
-    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
     // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
         ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
         if (backend == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);