diff --git a/common/common.cpp b/common/common.cpp index b6ee885c..5714fcaa 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1277,10 +1277,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - fprintf(stderr, "\n\n=====================================================================================\n"); - fprintf(stderr, " Split mode row is no longer supported\n"); - fprintf(stderr, "=====================================================================================\n\n\n"); - GGML_ABORT("fatal error"); + //fprintf(stderr, "\n\n=====================================================================================\n"); + //fprintf(stderr, " Split mode row is no longer supported\n"); + //fprintf(stderr, "=====================================================================================\n\n\n"); + //GGML_ABORT("fatal error"); params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 7b630489..d2a0cf85 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -3021,6 +3021,13 @@ extern "C" { GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + typedef struct { + int n_device; + int split_dim; + struct ggml_tensor * tensor; + struct ggml_tensor ** splits; + } ggml_split_tensor_t; + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 10e9534f..dd9498fd 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -785,90 +785,145 @@ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buf GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported - - ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; - ctx->tensor_extras.push_back(extra); - - for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses +GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (!tensor->extra) return; + printf("%s(%s, %p)\n", __func__, tensor->name, tensor->extra); + auto extra = (ggml_split_tensor_t *)tensor->extra; + GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count()); + for (int i = 0; i < extra->n_device; ++i) { + if (!extra->splits[i]) continue; + auto split = extra->splits[i]; + auto ne0 = split->ne[0]; + auto size = ggml_nbytes(split); + auto padded_size = size; if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + int nblock = (ne0 + MATRIX_ROW_PADDING - 1)/MATRIX_ROW_PADDING; + auto padded_row_size = ggml_row_size(split->type, 
nblock*MATRIX_ROW_PADDING); + auto row_size = ggml_row_size(split->type, ne0); + padded_size += padded_row_size - row_size; } - - // FIXME: do not crash if cudaMalloc fails - // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first - ggml_cuda_set_device(id); + ggml_cuda_set_device(i); char * buf; - CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); - - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); - } - - extra->data_device[id] = buf; - - for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, padded_size, i)); + if (padded_size > size) { + CUDA_CHECK(cudaMemset(buf + size, 0, padded_size - size)); } + printf(" allocated %zu bytes for tensor %s of type %s, dim = %ld x %ld x %ld. padding: %zu\n", padded_size, split->name, ggml_type_name(split->type), + split->ne[0], split->ne[1], split->ne[2], padded_size - size); + split->data = buf; } - tensor->extra = extra; + return; + + //if (tensor->view_src != nullptr) fprintf(stderr, "%s: tensor %s is a view into %s\n", __func__, tensor->name, tensor->view_src->name); + //GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + + //ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + //ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + //const int64_t ne0 = tensor->ne[0]; + + //ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + //ctx->tensor_extras.push_back(extra); + + //for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + // int64_t row_low, row_high; + // get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + // int64_t nrows_split = row_high - row_low; + // if (nrows_split == 0) { + // continue; + // } + + // size_t size = ggml_nbytes_split(tensor, nrows_split); + // const size_t original_size = size; + + // // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + // if (ne0 % MATRIX_ROW_PADDING != 0) { + // size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + // } + + // // FIXME: do not crash if cudaMalloc fails + // // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + // ggml_cuda_set_device(id); + // char * buf; + // CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); + + // // set padding to 0 to avoid possible NaN values + // if (size > original_size) { + // CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + // } + + // extra->data_device[id] = buf; + + // for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { + // CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + // } + //} + //tensor->extra = extra; } GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + if (!tensor->extra) return; + printf("%s(%s)\n", __func__, tensor->name); + // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context 
*)buffer->buft->context;
+    auto extra = (ggml_split_tensor_t *)tensor->extra;
+    GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
 
-    const int64_t ne0 = tensor->ne[0];
-    const size_t nb1 = tensor->nb[1];
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
-
-    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        const char * buf_host = (const char *)data + offset_split;
-        CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    if (extra->split_dim != 0) {
+        fprintf(stderr, "Split tensor copy not yet implemented for split_dim != 0\n");
+        return;
     }
 
-    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+    size_t cur_offset = 0;
+    for (int i = 0; i < extra->n_device; ++i) {
+        auto split = extra->splits[i];
+        if (!split) continue;
+        auto size = ggml_nbytes(split);
+        const char * buf_host = (const char *)data + cur_offset;
+        CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+        cur_offset += size;
+    }
+
+    for (int i = 0; i < extra->n_device; ++i) {
+        if (!extra->splits[i]) continue;
         CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
     }
+
+    //ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+    //const int64_t ne0 = tensor->ne[0];
+    //const size_t nb1 = tensor->nb[1];
+    //ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    //for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+    //    int64_t row_low, row_high;
+    //    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    //    int64_t nrows_split = row_high - row_low;
+    //    if (nrows_split == 0) {
+    //        continue;
+    //    }
+
+    //    const size_t offset_split = row_low*nb1;
+    //    size_t size = ggml_nbytes_split(tensor, nrows_split);
+    //    const size_t original_size = size;
+
+    //    // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    //    if (ne0 % MATRIX_ROW_PADDING != 0) {
+    //        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    //    }
+
+    //    const char * buf_host = (const char *)data + offset_split;
+    //    CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    //}
+
+    //for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+    //    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+    //}
 }
 
 GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
diff --git a/src/llama-context.h b/src/llama-context.h
index bb21d880..61ad51e5 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -57,6 +57,9 @@ struct llama_kv_cache {
     std::vector<struct ggml_tensor *> k_l; // per layer
     std::vector<struct ggml_tensor *> v_l;
 
+    std::vector<llama_split_tensor> split_k_l;
+    std::vector<llama_split_tensor> split_v_l;
+
     std::vector<struct ggml_context *> ctxs;
     std::vector<ggml_backend_buffer_t> bufs;
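Note on the padding logic in `ggml_backend_cuda_split_buffer_init_tensor` above: the rewritten code rounds the row length up to a whole number of `MATRIX_ROW_PADDING` (512) element blocks and pads by the byte difference, instead of the old `MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING` form; for row lengths that are a multiple of the quant block size the two are equivalent. A standalone sketch of the computation (the helper, constants, and shapes below are illustrative Q8_0-like stand-ins, not patch code):

```cpp
// Standalone sketch of the padded-size computation performed by
// ggml_backend_cuda_split_buffer_init_tensor() above. row_size() is a
// simplified stand-in for ggml_row_size(); the Q8_0-like block geometry
// (32 elements / 34 bytes) and the tensor shape are illustrative.
#include <cstdint>
#include <cstdio>

constexpr int64_t kRowPadding = 512; // MATRIX_ROW_PADDING in ggml-cuda
constexpr int64_t kBlckSize   = 32;  // elements per quant block
constexpr size_t  kBlckBytes  = 34;  // bytes per quant block

// bytes needed for ne elements of a blocked type (ne is a multiple of kBlckSize)
static size_t row_size(int64_t ne) { return kBlckBytes * (size_t)(ne / kBlckSize); }

int main() {
    const int64_t ne0 = 1088, nrows = 4;               // 1088 is not a multiple of 512
    const size_t size = row_size(ne0) * (size_t)nrows; // ggml_nbytes() of the split
    size_t padded_size = size;
    if (ne0 % kRowPadding != 0) {
        // round the last row up to a whole number of 512-element blocks,
        // exactly as the patch does with nblock*MATRIX_ROW_PADDING
        const int64_t nblock = (ne0 + kRowPadding - 1) / kRowPadding;
        padded_size += row_size(nblock * kRowPadding) - row_size(ne0);
    }
    // init_tensor cudaMemset()s this tail to zero so the quantized matmul
    // kernels can safely read past ne0 without picking up NaNs
    printf("size = %zu, padded = %zu, zeroed tail = %zu bytes\n",
           size, padded_size, padded_size - size);    // 4624 / 5100 / 476
    return 0;
}
```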
diff --git a/src/llama-impl.h b/src/llama-impl.h
index 38844506..b6d84369 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -224,3 +224,8 @@ struct gguf_context;
 std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
 
 ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
+
+struct llama_split_tensor {
+    std::vector<struct ggml_tensor *> tensor_splits;
+    ggml_split_tensor_t ggml;
+};
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 5f671fe7..b120b36e 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -192,6 +192,11 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
     ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
     ctx_size += ggml_tensor_overhead()*n_layer*3; // for moe merged tensors
 
+    if (model.splits.size() > 1) {
+        ctx_size += ggml_tensor_overhead()*n_layer*4; // for KV cache
+        ctx_size *= model.splits.size();
+    }
+
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {
             /*.mem_size =*/ ctx_size,
@@ -205,10 +210,78 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
         ctx_map[it.first] = ctx;
         model.ctxs.push_back(ctx);
     }
+    printf("=======================================================================\n");
+    auto n_device = model.device_count();
+    printf(" Model has %d devices:\n", n_device);
+    for (int device = 0; device < n_device; ++device) {
+        auto buft = model.default_buffer_type_offload(device);
+        if (buft) {
+            printf(" %d %s\n", device, ggml_backend_buft_name(buft));
+        } else {
+            printf(" Oops: null buft for device %d\n", device);
+        }
+    }
+    if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
+        printf("model.splits:");
+        for (auto s : model.splits) printf(" %g", s);
+        printf("\n");
+    }
+}
+
+static std::vector<int> create_split(int nr, int granularity, const std::vector<float> & splits) {
+    GGML_ASSERT(nr % granularity == 0);
+    GGML_ASSERT(!splits.empty());
+    int nchunk = nr / granularity;
+    std::vector<int> result(splits.size());
+    float last_split = 0;
+    int sum = 0;
+    for (int i = 0; i < (int)splits.size(); ++i) {
+        float p = splits[i] - last_split;
+        result[i] = roundf(p*nchunk);
+        sum += result[i];
+        last_split = splits[i];
+    }
+    while (sum > nchunk) {
+        last_split = 0;
+        float best_err = std::numeric_limits<float>::max();
+        int ibest = -1;
+        for (int i = 0; i < (int)splits.size(); ++i) {
+            if (result[i] > 0) {
+                float n_want = (splits[i] - last_split)*nchunk;
+                float err = std::abs(n_want - result[i] + 1);
+                if (err < best_err) {
+                    best_err = err; ibest = i;
+                }
+            }
+            last_split = splits[i];
+        }
+        GGML_ASSERT(ibest >= 0 && result[ibest] > 0);
+        --result[ibest];
+        --sum;
+    }
+    while (sum < nchunk) {
+        last_split = 0;
+        float best_err = std::numeric_limits<float>::max();
+        int ibest = -1;
+        for (int i = 0; i < (int)splits.size(); ++i) {
+            float n_want = (splits[i] - last_split)*nchunk;
+            float err = std::abs(n_want - result[i] - 1);
+            if (err < best_err) {
+                best_err = err; ibest = i;
+            }
+            last_split = splits[i];
+        }
+        GGML_ASSERT(ibest >= 0);
+        ++result[ibest];
+        ++sum;
+    }
+    for (auto & r : result) r *= granularity;
+    return result;
 }
 
 ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags, ggml_context ** actual_context) {
+    //auto requested_ctx = ctx;
     if (ml.tensor_buft_overrides) {
         for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
             std::regex pattern(overrides->pattern);
@@ -220,7 +293,40 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
         }
     }
     if (actual_context) *actual_context = ctx;
-    return ml.create_tensor(ctx, name, ne, flags);
+    auto tensor = ml.create_tensor(ctx, name, ne, flags);
+    //if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_ROW) {
+    //    int i_layer = -1;
+    //    if (auto pos = name.find("blk."); pos == 0) {
+    //        GGML_ASSERT(sscanf(name.c_str(), "blk.%d.", &i_layer) == 1);
+    //    }
+    //    if (i_layer >= 0) {
+    //        auto & layer = model.layers[i_layer];
+    //        auto & hparams = model.hparams;
+    //        if (auto pos = name.find("attn_q.weight"); pos != std::string::npos) {
+    //            auto split = create_split(tensor->ne[1], hparams.n_embd_head_k, model.splits);
+    //            printf("%s(%s):", __func__, name.c_str());
+    //            for (auto s : split) printf(" %d", s);
+    //            printf("\n");
+    //            layer.split_wq.tensor_splits.resize(split.size());
+    //            size_t offset = 0;
+    //            for (int i = 0; i < (int)split.size(); ++i) {
+    //                if (split[i] > 0) {
+    //                    layer.split_wq.tensor_splits[i] = ggml_view_2d(ctx, tensor, tensor->ne[0], split[i], tensor->nb[1], offset);
+    //                    auto split_name = name + '.' + std::to_string(i);
+    //                    ggml_set_name(layer.split_wq.tensor_splits[i], split_name.c_str());
+    //                    offset += tensor->nb[1]*split[i];
+    //                } else {
+    //                    layer.split_wq.tensor_splits[i] = nullptr;
+    //                }
+    //            }
+    //            layer.split_wq.ggml.n_device = split.size();
+    //            layer.split_wq.ggml.split_dim = 1;
+    //            layer.split_wq.ggml.splits = layer.split_wq.tensor_splits.data();
+    //        }
+    //    }
+    //}
+    return tensor;
+    //return ml.create_tensor(ctx, name, ne, flags);
 }
 
 #define LOADING_PRELUDE \
@@ -2638,6 +2744,40 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i
     return fused_qkv;
 }
 
+static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor * tensor, llama_split_tensor & split_tensor, const std::vector<int> & splits) {
+    GGML_ASSERT(split_dim == 0 || split_dim == 1);
+    GGML_ASSERT(splits.size() > 1);
+    std::string name{tensor->name};
+    split_tensor.tensor_splits.resize(splits.size());
+    if (split_dim == 1) {
+        size_t offset = 0;
+        for (int i = 0; i < int(splits.size()); ++i) {
+            if (splits[i] > 0) {
+                split_tensor.tensor_splits[i] = ggml_view_3d(ctx, tensor, tensor->ne[0], splits[i], tensor->ne[2], tensor->nb[1], tensor->nb[2], offset);
+                auto name_i = name + '.' + std::to_string(i);
+                ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
+                offset += tensor->nb[1]*splits[i];
+            } else {
+                split_tensor.tensor_splits[i] = nullptr;
+            }
+        }
+    } else {
+        for (int i = 0; i < int(splits.size()); ++i) {
+            if (splits[i] > 0) {
+                split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, splits[i], tensor->ne[1], tensor->ne[2]);
+                auto name_i = name + '.' 
+ std::to_string(i); + ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str()); + } else { + split_tensor.tensor_splits[i] = nullptr; + } + } + } + split_tensor.ggml.n_device = splits.size(); + split_tensor.ggml.split_dim = split_dim; + split_tensor.ggml.splits = split_tensor.tensor_splits.data(); + tensor->extra = (void *)&split_tensor.ggml; +} + bool create_tensors_helper::create_tensors() { const auto tn = LLM_TN(model.arch); bool use_mmap_buffer = true; @@ -2761,6 +2901,71 @@ bool create_tensors_helper::create_tensors() { default: throw std::runtime_error("unknown architecture"); } + if (model.split_mode == LLAMA_SPLIT_MODE_ROW) { + const auto & hparams = model.hparams; + int gqa_ratio = hparams.n_head() / hparams.n_head_kv(); + printf("GQA ratio: %d\n", gqa_ratio); + for (int il = 0; il < int(model.layers.size()); ++il) { + auto & layer = model.layers[il]; + auto ctx_split = ctx_for_layer_split(il); + if (layer.wo && layer.wq && layer.wk && layer.wv) { + int attn_granularity = hparams.n_embd_head_k; + if (ggml_is_quantized(layer.wo->type)) { + auto tt = ggml_internal_get_type_traits(layer.wo->type); + if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size; + } + GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0); + auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits); + prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split); + prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split); + for (auto & s : split) s /= gqa_ratio; + prepare_split_tensors(1, ctx_split, layer.wk, layer.split_wk, split); + prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split); + } + + if (layer.ffn_down && layer.ffn_up && layer.ffn_gate) { + int ffn_granularity = 16; + if (ggml_is_quantized(layer.ffn_down->type)) { + auto tt = ggml_internal_get_type_traits(layer.ffn_down->type); + if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size; + } + auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, model.splits); + prepare_split_tensors(0, ctx_split, layer.ffn_down, layer.split_ffn_down, split); + prepare_split_tensors(1, ctx_split, layer.ffn_up, layer.split_ffn_up, split); + prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split); + } + + if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) { + int ffn_granularity = 16; + if (ggml_is_quantized(layer.ffn_down_shexp->type)) { + auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type); + if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size; + } + auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits); + prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split); + prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split); + prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split); + } + + if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) { + int ffn_granularity = 16; + if (ggml_is_quantized(layer.ffn_down_exps->type)) { + auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type); + if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size; + } + auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits); + prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split); + prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split); + 
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split);
+            }
+        }
+
+        if (model.output) {
+            auto ctx_split = ctx_map[model.buft_output.buft_matrix];
+            auto split = create_split(model.output->ne[1], 16, model.splits);
+            prepare_split_tensors(1, ctx_split, model.output, model.split_output, split);
+        }
+    }
 
     return use_mmap_buffer;
 }
diff --git a/src/llama-model.h b/src/llama-model.h
index 342d6df6..7d0b1a0a 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -183,6 +183,21 @@ struct llama_layer {
     struct ggml_tensor * bqk = nullptr;
     struct ggml_tensor * bkv = nullptr;
 
+    llama_split_tensor split_wq;
+    llama_split_tensor split_wk;
+    llama_split_tensor split_wv;
+    llama_split_tensor split_wo;
+    llama_split_tensor split_wqkv;
+    llama_split_tensor split_wqk;
+    llama_split_tensor split_wkv;
+    llama_split_tensor split_bq;
+    llama_split_tensor split_bk;
+    llama_split_tensor split_bv;
+    llama_split_tensor split_bo;
+    llama_split_tensor split_bqkv;
+    llama_split_tensor split_bqk;
+    llama_split_tensor split_bkv;
+
     // relative position bias
     struct ggml_tensor * attn_rel_b = nullptr;
     struct ggml_tensor * attn_rel_b_enc = nullptr;
@@ -205,12 +220,20 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_enc = nullptr;
     struct ggml_tensor * ffn_up_enc = nullptr;
 
+    llama_split_tensor split_ffn_up;
+    llama_split_tensor split_ffn_gate;
+    llama_split_tensor split_ffn_down;
+
     // ff MoE
     struct ggml_tensor * ffn_gate_inp = nullptr;
     struct ggml_tensor * ffn_gate_exps = nullptr;
     struct ggml_tensor * ffn_down_exps = nullptr;
     struct ggml_tensor * ffn_up_exps = nullptr;
 
+    llama_split_tensor split_ffn_up_exps;
+    llama_split_tensor split_ffn_gate_exps;
+    llama_split_tensor split_ffn_down_exps;
+
     // ff MoE bias
     struct ggml_tensor * ffn_gate_inp_b = nullptr;
     struct ggml_tensor * ffn_gate_exps_b = nullptr;
@@ -226,6 +249,10 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_shexp = nullptr;
     struct ggml_tensor * ffn_up_shexp = nullptr;
 
+    llama_split_tensor split_ffn_up_shexp;
+    llama_split_tensor split_ffn_gate_shexp;
+    llama_split_tensor split_ffn_down_shexp;
+
     // ff bias
     struct ggml_tensor * ffn_gate_b = nullptr;
     struct ggml_tensor * ffn_down_b = nullptr; // b2
@@ -298,6 +325,8 @@ struct llama_model {
     struct ggml_tensor * output_b;
     struct ggml_tensor * output_norm_enc;
 
+    llama_split_tensor split_output;
+
     std::vector<llama_layer> layers;
 
     llama_split_mode split_mode;
@@ -358,6 +387,11 @@ struct llama_model {
     }
 
     void set_tensor_overrides(const llama_model_params& params);
+
+    int device_count() const;
+    ggml_backend_buffer_type_t default_buffer_type_offload(int device) const;
+
+    std::vector<float> splits;
 };
 
 struct llama_lora_weight {
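The dim-1 branch of `prepare_split_tensors` above carves each weight into per-device `ggml_view_3d` slices whose byte offsets advance by `nb[1]` per row, so the slices tile the parent tensor in order. A small sketch of that offset arithmetic (shapes and `row_bytes` are illustrative assumptions, not patch values):

```cpp
// Sketch of the view arithmetic in prepare_split_tensors() for split_dim == 1:
// each device's slice is a contiguous range of rows of the parent tensor, so
// its byte offset is nb[1] times the number of rows that came before it.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const size_t row_bytes = 4096 * 2;                 // e.g. f16 rows of a 4096-column weight (stand-in for nb[1])
    const std::vector<int> split = {1024, 2048, 1024}; // rows per device, as returned by create_split()
    size_t offset = 0;
    for (size_t i = 0; i < split.size(); ++i) {
        // corresponds to ggml_view_3d(ctx, tensor, ne0, split[i], ne2, nb1, nb2, offset)
        printf("device %zu: rows [%zu, %zu) at byte offset %zu\n",
               i, offset / row_bytes, offset / row_bytes + split[i], offset);
        offset += row_bytes * split[i];
    }
    return 0;
}
```

Because the slices tile the parent tensor in order, the host-to-device copy in `ggml_backend_cuda_split_buffer_set_tensor` can walk a single running `cur_offset` through the source buffer.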
diff --git a/src/llama.cpp b/src/llama.cpp
index 587ff62f..cce9729d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -483,6 +483,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }
 
+int llama_model::device_count() const {
+    return llama_get_device_count(*this);
+}
+
+ggml_backend_buffer_type_t llama_model::default_buffer_type_offload(int device) const {
+    return llama_default_buffer_type_offload(*this, device);
+}
+
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
     int dev_count = (int)llama_get_device_count(model);
@@ -626,42 +634,35 @@ static bool llama_kv_cache_init(
         }
     }
 
+    bool split_cache = false;
+    if (model.split_mode == LLAMA_SPLIT_MODE_ROW && model.arch != LLM_ARCH_DEEPSEEK2 && offload) {
+        cache.split_k_l.reserve(n_layer);
+        cache.split_v_l.reserve(n_layer);
+        split_cache = true;
+    }
+
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
     if (offload) {
         for (int64_t i = 0; i < n_layer; ++i) {
-            buft_layer_count[model.buft_layer[i].buft]++;
+            if (split_cache) {
+                buft_layer_count[model.buft_layer[i].buft_matrix]++;
+            } else {
+                buft_layer_count[model.buft_layer[i].buft]++;
+            }
         }
     } else {
         buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
     }
 
-    //if (cparams.fused_moe_up_gate) {
-    //    int nbad = 0;
-    //    for (int i = 0; i < (int) n_layer; i++) {
-    //        auto& layer = model.layers[i];
-    //        if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
-    //            ++nbad;
-    //        }
-    //    }
-    //    if (nbad > 0) {
-    //        if (nbad == (int)n_layer) {
-    //            LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
-    //            const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
-    //        }
-    //        else {
-    //            LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
-    //                nbad, (int)n_layer);
-    //        }
-    //    }
-    //}
-
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
         int n_layers = it.second;
+        size_t ctx_mem_size = 5u*n_layers*ggml_tensor_overhead();
+        if (split_cache) ctx_mem_size += 2*model.splits.size()*n_layers*ggml_tensor_overhead();
         struct ggml_init_params params = {
-            /*.mem_size =*/ 5u*n_layers*ggml_tensor_overhead(),
+            /*.mem_size =*/ ctx_mem_size,
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true,
         };
@@ -698,8 +699,8 @@ static bool llama_kv_cache_init(
         }
     }
 
-    cache.k_l.reserve(n_layer);
     bool needs_v_cache = true;
+    cache.k_l.reserve(n_layer);
     if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
         needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn;
     }
@@ -711,11 +712,10 @@ static bool llama_kv_cache_init(
         const uint32_t n_head_kv = hparams.n_head_kv(i);
         const uint32_t n_embd_head_k= hparams.n_embd_head_k;
-
-        struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+        struct ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
 
         ggml_tensor * k;
         ggml_tensor * v;
-        if (cparams.mla_attn) {
+        if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
             // DeepSeek MLA
             const uint32_t n_embd_head_qk_rope = hparams.n_rot;
             const uint32_t kv_lora_rank = hparams.n_lora_kv;
@@ -744,6 +744,36 @@ static bool llama_kv_cache_init(
             ggml_format_name(v, "cache_v_l%d", i);
             cache.k_l.push_back(k);
             cache.v_l.push_back(v);
+            if (split_cache) {
+                auto K = model.layers[i].wk;
+                auto V = model.layers[i].wv;
+                if (K && V && K->extra && V->extra) {
+                    auto extra_K = (const ggml_split_tensor_t *)K->extra;
+                    auto extra_V = (const ggml_split_tensor_t *)V->extra;
+                    auto & split_k_l = cache.split_k_l.emplace_back();
+                    auto & split_v_l = cache.split_v_l.emplace_back();
+                    split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
+                    split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
+                    for (int is = 0; is < extra_K->n_device; ++is) {
+                        auto split = extra_K->splits[is];
+                        if (!split) continue;
+                        split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, split->ne[1]/n_embd_head_k * kv_size);
+                    }
+                    split_k_l.ggml.n_device = extra_K->n_device;
+                    split_k_l.ggml.split_dim = 0;
+                    split_k_l.ggml.splits = split_k_l.tensor_splits.data();
+                    for (int is = 0; is < extra_V->n_device; ++is) {
+                        auto split = extra_V->splits[is];
+                        if (!split) continue;
+                        split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size);
+                    }
+                    split_v_l.ggml.n_device = extra_V->n_device;
+                    split_v_l.ggml.split_dim = 0;
+                    split_v_l.ggml.splits = split_v_l.tensor_splits.data();
+                } else {
+                    printf("Oops: don't yet have K and V for layer %d\n", i);
+                }
+            }
         }
     }
     if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
@@ -1652,10 +1682,7 @@ static bool llm_load_tensors(
             model.buft_layer[i] = llama_default_buffer_type_cpu(true);
         }
 
-        if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
-            // calculate the split points
-            // int device_count = llama_get_device_count(model);
-            int device_count = model.devices.size();
+        if (int device_count = model.devices.size(); device_count > 1) {
             bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
             std::vector<float> splits(device_count);
             if (all_zero) {
@@ -1676,36 +1703,31 @@ static bool llm_load_tensors(
             for (int i = 0; i < device_count; ++i) {
                 splits[i] /= split_sum;
             }
+            model.splits = std::move(splits);
+        } else {
+            model.splits = { 1.0f };
+        }
+
+        if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
+
+            int device_count = model.splits.size();
             // assign the repeating layers to the devices according to the splits
             int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
             for (int i = i_gpu_start; i < n_layer; ++i) {
-                int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-#ifndef NDEBUG
-                ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
-                const char* name = ggml_backend_buft_name(buft);
-                LLAMA_LOG_DEBUG("load_tensors: layers %3d assigned to backend %s\n", i,
-                    name);
-#endif
+                int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
                 model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
             }
             // assign the output layer
             if (n_gpu_layers > n_layer) {
-                int layer_gpu = 
std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin(); -#ifndef NDEBUG - ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]); - const char* name = ggml_backend_buft_name(buft); - LLAMA_LOG_DEBUG("load_tensors: output layers assigned to backend %s\n", - name); -#endif + int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - model.splits.begin(); model.buft_output = llama_default_buffer_type_offload(model, model.devices[layer_gpu]); } else { model.buft_output = llama_default_buffer_type_cpu(true); } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_MODE_ROW) { - split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], tensor_split); + if (split_mode == LLAMA_SPLIT_MODE_ROW && model.splits.size() > 1) { + split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], model.splits.data()); } else { // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
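For reference, a condensed, self-contained sketch of the apportionment that `create_split()` in llama-load-tensors.cpp above performs (the numbers in `main` are illustrative; the patch calls it with a head size or quant block size as the granularity and the normalized cumulative `model.splits` as the fractions):

```cpp
// Condensed re-statement of create_split(): a proportional first guess per
// device, then the rounding drift is repaired one chunk at a time toward the
// entry whose count would deviate least from its ideal share, and finally the
// chunk counts are scaled back up by the granularity so every device receives
// whole heads / quant blocks.
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

static std::vector<int> create_split(int nr, int granularity, const std::vector<float> & splits) {
    const int nchunk = nr / granularity;
    std::vector<int> result(splits.size());
    float last = 0.0f;
    int sum = 0;
    for (size_t i = 0; i < splits.size(); ++i) {            // proportional first guess
        result[i] = (int)roundf((splits[i] - last) * nchunk);
        sum += result[i];
        last = splits[i];
    }
    while (sum != nchunk) {                                 // repair rounding drift
        const int dir = sum > nchunk ? -1 : +1;
        float best_err = std::numeric_limits<float>::max();
        int ibest = -1;
        last = 0.0f;
        for (size_t i = 0; i < splits.size(); ++i) {
            const float n_want = (splits[i] - last) * nchunk;
            const float err = std::abs(n_want - (float)(result[i] + dir));
            // never shrink a device below zero chunks
            if ((dir > 0 || result[i] > 0) && err < best_err) { best_err = err; ibest = (int)i; }
            last = splits[i];
        }
        result[ibest] += dir;
        sum += dir;
    }
    for (auto & r : result) r *= granularity;
    return result;
}

int main() {
    // e.g. 4096 attention rows, 128-dim heads, three devices at cumulative
    // fractions 0.25 / 0.75 / 1.0 (i.e. 25% / 50% / 25% of the work)
    const auto rows = create_split(4096, 128, {0.25f, 0.75f, 1.0f});
    int total = 0;
    for (size_t i = 0; i < rows.size(); ++i) {
        printf("device %zu: %d rows\n", i, rows[i]);        // 1024 / 2048 / 1024
        total += rows[i];
    }
    printf("total: %d rows\n", total);                      // always exactly nr
    return 0;
}
```

In `create_tensors()` the same split is then divided by the GQA ratio before being applied to `wk`/`wv`, so the K/V head counts stay aligned with the Q heads assigned to each device.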