This commit is contained in:
Kawrakow
2025-11-25 14:51:33 +00:00
parent 5ea430aaa4
commit 32c6df015b
8 changed files with 449 additions and 118 deletions

View File

@@ -1277,10 +1277,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
}
else if (arg_next == "row") {
fprintf(stderr, "\n\n=====================================================================================\n");
fprintf(stderr, " Split mode row is no longer supported\n");
fprintf(stderr, "=====================================================================================\n\n\n");
GGML_ABORT("fatal error");
//fprintf(stderr, "\n\n=====================================================================================\n");
//fprintf(stderr, " Split mode row is no longer supported\n");
//fprintf(stderr, "=====================================================================================\n\n\n");
//GGML_ABORT("fatal error");
params.split_mode = LLAMA_SPLIT_MODE_ROW;
}
else {

View File

@@ -3021,6 +3021,13 @@ extern "C" {
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
// Metadata describing how a single tensor is split across multiple devices.
// Attached to ggml_tensor::extra by the loader and consumed by the backend
// buffer callbacks (init_tensor / set_tensor) to allocate and upload the splits.
typedef struct {
    int n_device;                 // number of entries in `splits` (one slot per device)
    int split_dim;                // dimension along which the tensor is partitioned (0 or 1)
    struct ggml_tensor * tensor;  // NOTE(review): never assigned in the code visible here — presumably the original (unsplit) tensor; confirm before relying on it
    struct ggml_tensor ** splits; // per-device split tensors; an entry may be NULL when that device receives no share
} ggml_split_tensor_t;
#ifdef __cplusplus
}
#endif

View File

@@ -785,90 +785,145 @@ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buf
GGML_UNUSED(buffer);
}
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
const int64_t ne0 = tensor->ne[0];
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
ctx->tensor_extras.push_back(extra);
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
int64_t row_low, row_high;
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
int64_t nrows_split = row_high - row_low;
if (nrows_split == 0) {
continue;
}
size_t size = ggml_nbytes_split(tensor, nrows_split);
const size_t original_size = size;
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
// Allocate device memory for each per-device split of a split tensor.
//
// Tensors without split metadata (tensor->extra == NULL) are left untouched.
// For every non-NULL split, a buffer is allocated on the corresponding device,
// padded so that the last row is a multiple of MATRIX_ROW_PADDING elements
// (avoids out-of-bounds reads in the mat-mul kernels), and the padding is
// zeroed to avoid NaNs. The split tensor's data pointer is set to the new
// device buffer.
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    if (!tensor->extra) return;
    auto extra = (ggml_split_tensor_t *)tensor->extra;
    GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
    for (int i = 0; i < extra->n_device; ++i) {
        if (!extra->splits[i]) continue; // device i was assigned no share of this tensor
        auto split = extra->splits[i];
        auto ne0 = split->ne[0];
        auto size = ggml_nbytes(split);
        auto padded_size = size;
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            // use int64 arithmetic throughout: ne0 is int64_t, so the block count must not narrow to int
            auto nblock = (ne0 + MATRIX_ROW_PADDING - 1)/MATRIX_ROW_PADDING;
            auto padded_row_size = ggml_row_size(split->type, nblock*MATRIX_ROW_PADDING);
            auto row_size = ggml_row_size(split->type, ne0);
            padded_size += padded_row_size - row_size;
        }
        // FIXME: do not crash if cudaMalloc fails
        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
        ggml_cuda_set_device(i);
        char * buf;
        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, padded_size, i));
        // set padding to 0 to avoid possible NaN values
        if (padded_size > size) {
            CUDA_CHECK(cudaMemset(buf + size, 0, padded_size - size));
        }
        split->data = buf;
    }
}
// Upload host data into the per-device splits of a split tensor.
//
// The splits are assumed to partition the host buffer into consecutive
// contiguous chunks in device order; each chunk is copied asynchronously to
// its device, then all copies are awaited before returning.
//
// Tensors without split metadata (tensor->extra == NULL) are ignored.
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    if (!tensor->extra) return;
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));
    auto extra = (ggml_split_tensor_t *)tensor->extra;
    GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
    if (extra->split_dim != 0) {
        // NOTE(review): the guard rejects split_dim != 0, but the original message claimed
        // "dim 0" was unsupported — confirm which dimension is actually implemented.
        fprintf(stderr, "Split tensor copy not yet implemented for split dim != 0\n");
        return;
    }
    size_t cur_offset = 0;
    for (int i = 0; i < extra->n_device; ++i) {
        auto split = extra->splits[i];
        if (!split) continue;
        // do not shadow the `size` parameter — use a distinct name for the chunk size
        auto split_size = ggml_nbytes(split);
        const char * buf_host = (const char *)data + cur_offset;
        // NOTE(review): no cudaSetDevice() before the copy — relies on the async copy
        // reaching split->data on another device from the current context; verify.
        CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, split_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
        cur_offset += split_size;
    }
    // wait for all uploads to complete before the host buffer may be reused
    for (int i = 0; i < extra->n_device; ++i) {
        if (!extra->splits[i]) continue;
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
    }

    GGML_UNUSED(buffer);
}
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {

View File

@@ -57,6 +57,9 @@ struct llama_kv_cache {
std::vector<struct ggml_tensor *> k_l; // per layer
std::vector<struct ggml_tensor *> v_l;
std::vector<llama_split_tensor> split_k_l;
std::vector<llama_split_tensor> split_v_l;
std::vector<struct ggml_context *> ctxs;
std::vector<ggml_backend_buffer_t> bufs;

View File

@@ -224,3 +224,8 @@ struct gguf_context;
std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
// Owner of the per-device split tensors for one model tensor.
// `ggml.splits` points into `tensor_splits`, and `ggml` itself is handed to
// the backend via ggml_tensor::extra — so this object must remain at a stable
// address (do not store it in a container that may reallocate) for as long as
// the tensor is in use.
struct llama_split_tensor {
    std::vector<ggml_tensor *> tensor_splits; // per-device splits; NULL entries where a device has no share
    ggml_split_tensor_t ggml;                 // backend-facing metadata; splits points at tensor_splits.data()
};

View File

@@ -192,6 +192,11 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
ctx_size += ggml_tensor_overhead()*n_layer*3; // for moe merged tensors
if (model.splits.size() > 1) {
ctx_size += ggml_tensor_overhead()*n_layer*4; // for KV cache
ctx_size *= model.splits.size();
}
for (auto & it : buft_layer_count) {
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
@@ -205,10 +210,78 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
ctx_map[it.first] = ctx;
model.ctxs.push_back(ctx);
}
printf("=======================================================================\n");
auto n_device = model.device_count();
printf(" Model has %d devices:\n", n_device);
for (int device = 0; device < n_device; ++device) {
auto buft = model.default_buffer_type_offload(device);
if (buft) {
printf(" %d %s\n", device, ggml_backend_buft_name(buft));
} else {
printf(" Oops: null buft for debvice %d\n", device);
}
}
if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
printf("model.splits:");
for (auto s : model.splits) printf(" %g", s);
printf("\n");
}
}
// Partition `nr` rows among devices according to the cumulative split
// fractions in `splits` (splits[i] is the cumulative fraction assigned to
// devices 0..i). The result is expressed in multiples of `granularity`:
// rows are grouped into nr/granularity chunks, each device's share is rounded
// to the nearest chunk count, and any rounding surplus/deficit is repaired by
// taking a chunk from (or giving one to) the device where the adjustment
// least perturbs its ideal share. Entries of the returned vector sum to `nr`.
static std::vector<int> create_split(int nr, int granularity, const std::vector<float> & splits) {
    GGML_ASSERT(nr % granularity == 0);
    GGML_ASSERT(!splits.empty());
    const int nchunk = nr / granularity;
    const int ndev   = (int)splits.size();
    std::vector<int> result(ndev);
    // first pass: round each device's ideal chunk count independently
    {
        float prev = 0.0f;
        for (int i = 0; i < ndev; ++i) {
            result[i] = roundf((splits[i] - prev)*nchunk);
            prev = splits[i];
        }
    }
    int sum = 0;
    for (int i = 0; i < ndev; ++i) sum += result[i];
    // too many chunks handed out: repeatedly take one from the device whose
    // post-decrement count is closest to its ideal share (first minimum wins)
    while (sum > nchunk) {
        int   ibest    = -1;
        float best_err = std::numeric_limits<float>::max();
        float prev     = 0.0f;
        for (int i = 0; i < ndev; ++i) {
            const float n_want = (splits[i] - prev)*nchunk;
            prev = splits[i];
            if (result[i] > 0) {
                const float err = std::abs(n_want - result[i] + 1);
                if (err < best_err) {
                    best_err = err;
                    ibest    = i;
                }
            }
        }
        GGML_ASSERT(ibest >= 0 && result[ibest] > 0);
        --result[ibest];
        --sum;
    }
    // too few chunks handed out: give one to the device whose post-increment
    // count is closest to its ideal share (first minimum wins)
    while (sum < nchunk) {
        int   ibest    = -1;
        float best_err = std::numeric_limits<float>::max();
        float prev     = 0.0f;
        for (int i = 0; i < ndev; ++i) {
            const float n_want = (splits[i] - prev)*nchunk;
            prev = splits[i];
            const float err = std::abs(n_want - result[i] - 1);
            if (err < best_err) {
                best_err = err;
                ibest    = i;
            }
        }
        GGML_ASSERT(ibest >= 0);
        ++result[ibest];
        ++sum;
    }
    // convert chunk counts back to row counts
    for (auto & r : result) r *= granularity;
    return result;
}
ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
int flags, ggml_context ** actual_context) {
//auto requested_ctx = ctx;
if (ml.tensor_buft_overrides) {
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
std::regex pattern(overrides->pattern);
@@ -220,7 +293,40 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
}
}
if (actual_context) *actual_context = ctx;
return ml.create_tensor(ctx, name, ne, flags);
auto tensor = ml.create_tensor(ctx, name, ne, flags);
//if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_ROW) {
// int i_layer = -1;
// if (auto pos = name.find("blk."); pos == 0) {
// GGML_ASSERT(sscanf(name.c_str(), "blk.%d.", &i_layer) == 1);
// }
// if (i_layer >= 0) {
// auto & layer = model.layers[i_layer];
// auto & hparams = model.hparams;
// if (auto pos = name.find("attn_q.weight"); pos != std::string::npos) {
// auto split = create_split(tensor->ne[1], hparams.n_embd_head_k, model.splits);
// printf("%s(%s):", __func__, name.c_str());
// for (auto s : split) printf(" %d", s);
// printf("\n");
// layer.split_wq.tensor_splits.resize(split.size());
// size_t offset = 0;
// for (int i = 0; i < (int)split.size(); ++i) {
// if (split[i] > 0) {
// layer.split_wq.tensor_splits[i] = ggml_view_2d(ctx, tensor, tensor->ne[0], split[i], tensor->nb[1], offset);
// auto split_name = name + '.' + std::to_string(i);
// ggml_set_name(layer.split_wq.tensor_splits[i], split_name.c_str());
// offset += tensor->nb[1]*split[i];
// } else {
// layer.split_wq.tensor_splits[i] = nullptr;
// }
// }
// layer.split_wq.ggml.n_device = split.size();
// layer.split_wq.ggml.split_dim = 1;
// layer.split_wq.ggml.splits = layer.split_wq.tensor_splits.data();
// }
// }
//}
return tensor;
//return ml.create_tensor(ctx, name, ne, flags);
}
#define LOADING_PRELUDE \
@@ -2638,6 +2744,40 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i
return fused_qkv;
}
// Populate `split_tensor` with per-device slices of `tensor` (sized per the
// row counts in `splits`) and attach the resulting ggml_split_tensor_t
// metadata to tensor->extra so the backend buffer can allocate/upload them.
//
// split_dim == 1: each slice is a 3d view into `tensor` covering splits[i]
//                 rows at a running byte offset (rows are contiguous chunks).
// split_dim == 0: each slice is a freshly created tensor of splits[i] columns;
//                 it does NOT alias `tensor`'s data (presumably filled later
//                 by the backend — TODO confirm the load path for this case).
// A zero entry in `splits` yields a NULL slice for that device.
static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor * tensor, llama_split_tensor & split_tensor, const std::vector<int> & splits) {
    GGML_ASSERT(split_dim == 0 || split_dim == 1);
    GGML_ASSERT(splits.size() > 1);
    std::string name{tensor->name};
    split_tensor.tensor_splits.resize(splits.size());
    if (split_dim == 1) {
        size_t offset = 0; // running byte offset of the next row chunk within `tensor`
        for (int i = 0; i < int(splits.size()); ++i) {
            if (splits[i] > 0) {
                split_tensor.tensor_splits[i] = ggml_view_3d(ctx, tensor, tensor->ne[0], splits[i], tensor->ne[2], tensor->nb[1], tensor->nb[2], offset);
                // name each slice "<tensor name>.<device index>"
                auto name_i = name + '.' + std::to_string(i);
                ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
                offset += tensor->nb[1]*splits[i];
            } else {
                split_tensor.tensor_splits[i] = nullptr;
            }
        }
    } else {
        for (int i = 0; i < int(splits.size()); ++i) {
            if (splits[i] > 0) {
                split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, splits[i], tensor->ne[1], tensor->ne[2]);
                auto name_i = name + '.' + std::to_string(i);
                ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
            } else {
                split_tensor.tensor_splits[i] = nullptr;
            }
        }
    }
    split_tensor.ggml.n_device = splits.size();
    split_tensor.ggml.split_dim = split_dim;
    // NOTE(review): `ggml.splits` aliases tensor_splits.data() and tensor->extra
    // aliases &split_tensor.ggml — `split_tensor` must stay at a stable address
    // (no container reallocation) for the lifetime of `tensor`.
    split_tensor.ggml.splits = split_tensor.tensor_splits.data();
    tensor->extra = (void *)&split_tensor.ggml;
}
bool create_tensors_helper::create_tensors() {
const auto tn = LLM_TN(model.arch);
bool use_mmap_buffer = true;
@@ -2761,6 +2901,71 @@ bool create_tensors_helper::create_tensors() {
default:
throw std::runtime_error("unknown architecture");
}
if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
const auto & hparams = model.hparams;
int gqa_ratio = hparams.n_head() / hparams.n_head_kv();
printf("GQA ratio: %d\n", gqa_ratio);
for (int il = 0; il < int(model.layers.size()); ++il) {
auto & layer = model.layers[il];
auto ctx_split = ctx_for_layer_split(il);
if (layer.wo && layer.wq && layer.wk && layer.wv) {
int attn_granularity = hparams.n_embd_head_k;
if (ggml_is_quantized(layer.wo->type)) {
auto tt = ggml_internal_get_type_traits(layer.wo->type);
if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size;
}
GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0);
auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split);
prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split);
for (auto & s : split) s /= gqa_ratio;
prepare_split_tensors(1, ctx_split, layer.wk, layer.split_wk, split);
prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split);
}
if (layer.ffn_down && layer.ffn_up && layer.ffn_gate) {
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.ffn_down, layer.split_ffn_down, split);
prepare_split_tensors(1, ctx_split, layer.ffn_up, layer.split_ffn_up, split);
prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split);
}
if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split);
prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split);
}
if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down_exps->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split);
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split);
}
}
if (model.output) {
auto ctx_split = ctx_map[model.buft_output.buft_matrix];
auto split = create_split(model.output->ne[1], 16, model.splits);
prepare_split_tensors(1, ctx_split, model.output, model.split_output, split);
}
}
return use_mmap_buffer;
}

View File

@@ -183,6 +183,21 @@ struct llama_layer {
struct ggml_tensor * bqk = nullptr;
struct ggml_tensor * bkv = nullptr;
llama_split_tensor split_wq;
llama_split_tensor split_wk;
llama_split_tensor split_wv;
llama_split_tensor split_wo;
llama_split_tensor split_wqkv;
llama_split_tensor split_wqk;
llama_split_tensor split_wkv;
llama_split_tensor split_bq;
llama_split_tensor split_bk;
llama_split_tensor split_bv;
llama_split_tensor split_bo;
llama_split_tensor split_bqkv;
llama_split_tensor split_bqk;
llama_split_tensor split_bkv;
// relative position bias
struct ggml_tensor * attn_rel_b = nullptr;
struct ggml_tensor * attn_rel_b_enc = nullptr;
@@ -205,12 +220,20 @@ struct llama_layer {
struct ggml_tensor * ffn_down_enc = nullptr;
struct ggml_tensor * ffn_up_enc = nullptr;
llama_split_tensor split_ffn_up;
llama_split_tensor split_ffn_gate;
llama_split_tensor split_ffn_down;
// ff MoE
struct ggml_tensor * ffn_gate_inp = nullptr;
struct ggml_tensor * ffn_gate_exps = nullptr;
struct ggml_tensor * ffn_down_exps = nullptr;
struct ggml_tensor * ffn_up_exps = nullptr;
llama_split_tensor split_ffn_up_exps;
llama_split_tensor split_ffn_gate_exps;
llama_split_tensor split_ffn_down_exps;
// ff MoE bias
struct ggml_tensor * ffn_gate_inp_b = nullptr;
struct ggml_tensor * ffn_gate_exps_b = nullptr;
@@ -226,6 +249,10 @@ struct llama_layer {
struct ggml_tensor * ffn_down_shexp = nullptr;
struct ggml_tensor * ffn_up_shexp = nullptr;
llama_split_tensor split_ffn_up_shexp;
llama_split_tensor split_ffn_gate_shexp;
llama_split_tensor split_ffn_down_shexp;
// ff bias
struct ggml_tensor * ffn_gate_b = nullptr;
struct ggml_tensor * ffn_down_b = nullptr; // b2
@@ -298,6 +325,8 @@ struct llama_model {
struct ggml_tensor * output_b;
struct ggml_tensor * output_norm_enc;
llama_split_tensor split_output;
std::vector<llama_layer> layers;
llama_split_mode split_mode;
@@ -358,6 +387,11 @@ struct llama_model {
}
void set_tensor_overrides(const llama_model_params& params);
int device_count() const;
ggml_backend_buffer_type_t default_buffer_type_offload(int device) const;
std::vector<float> splits;
};
struct llama_lora_weight {

View File

@@ -483,6 +483,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
GGML_UNUSED(tensor_split);
}
// Number of compute devices available to this model.
// Thin member wrapper around the file-local llama_get_device_count() so that
// code outside this translation unit can query the count through the model.
int llama_model::device_count() const {
    return llama_get_device_count(*this);
}
// Default buffer type used when offloading tensors to `device`.
// Thin member wrapper around the file-local llama_default_buffer_type_offload()
// so that code outside this translation unit can resolve it through the model.
ggml_backend_buffer_type_t llama_model::default_buffer_type_offload(int device) const {
    return llama_default_buffer_type_offload(*this, device);
}
static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
int dev_count = (int)llama_get_device_count(model);
@@ -626,42 +634,35 @@ static bool llama_kv_cache_init(
}
}
bool split_cache = false;
if (model.split_mode == LLAMA_SPLIT_MODE_ROW && model.arch != LLM_ARCH_DEEPSEEK2 && offload) {
cache.split_k_l.reserve(n_layer);
cache.split_v_l.reserve(n_layer);
split_cache = true;
}
// count used buffer types
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
if (offload) {
for (int64_t i = 0; i < n_layer; ++i) {
buft_layer_count[model.buft_layer[i].buft]++;
if (split_cache) {
buft_layer_count[model.buft_layer[i].buft_matrix]++;
} else {
buft_layer_count[model.buft_layer[i].buft]++;
}
}
} else {
buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
}
//if (cparams.fused_moe_up_gate) {
// int nbad = 0;
// for (int i = 0; i < (int) n_layer; i++) {
// auto& layer = model.layers[i];
// if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
// ++nbad;
// }
// }
// if (nbad > 0) {
// if (nbad == (int)n_layer) {
// LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
// const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
// }
// else {
// LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
// nbad, (int)n_layer);
// }
// }
//}
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
for (auto & it : buft_layer_count) {
int n_layers = it.second;
size_t ctx_mem_size = 5u*n_layers*ggml_tensor_overhead();
if (split_cache) ctx_mem_size += 2*model.splits.size()*n_layers*ggml_tensor_overhead();
struct ggml_init_params params = {
/*.mem_size =*/ 5u*n_layers*ggml_tensor_overhead(),
/*.mem_size =*/ ctx_mem_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
@@ -698,8 +699,8 @@ static bool llama_kv_cache_init(
}
}
cache.k_l.reserve(n_layer);
bool needs_v_cache = true;
cache.k_l.reserve(n_layer);
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn;
}
@@ -711,11 +712,10 @@ static bool llama_kv_cache_init(
const uint32_t n_head_kv = hparams.n_head_kv(i);
const uint32_t n_embd_head_k= hparams.n_embd_head_k;
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
struct ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
ggml_tensor * k;
ggml_tensor * v;
if (cparams.mla_attn) {
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
// DeepSeek MLA
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
@@ -744,6 +744,36 @@ static bool llama_kv_cache_init(
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
if (split_cache) {
auto K = model.layers[i].wk;
auto V = model.layers[i].wv;
if (K && V && K->extra && V->extra) {
auto extra_K = (const ggml_split_tensor_t *)K->extra;
auto extra_V = (const ggml_split_tensor_t *)V->extra;
auto & split_k_l = cache.split_k_l.emplace_back();
auto & split_v_l = cache.split_v_l.emplace_back();
split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
for (int is = 0; is < extra_K->n_device; ++is) {
auto split = extra_K->splits[is];
if (!split) continue;
split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, split->ne[1]/n_embd_head_k * kv_size);
}
split_k_l.ggml.n_device = extra_K->n_device;
split_k_l.ggml.split_dim = 0;
split_k_l.ggml.splits = split_k_l.tensor_splits.data();
for (int is = 0; is < extra_V->n_device; ++is) {
auto split = extra_V->splits[is];
if (!split) continue;
split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size);
}
split_v_l.ggml.n_device = extra_V->n_device;
split_v_l.ggml.split_dim = 0;
split_v_l.ggml.splits = split_v_l.tensor_splits.data();
} else {
printf("Oops: don't have yet K and V for layer %d\n", i);
}
}
}
}
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
@@ -1652,10 +1682,7 @@ static bool llm_load_tensors(
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
}
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
// calculate the split points
// int device_count = llama_get_device_count(model);
int device_count = model.devices.size();
if (int device_count = model.devices.size(); device_count > 1) {
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
std::vector<float> splits(device_count);
if (all_zero) {
@@ -1676,36 +1703,31 @@ static bool llm_load_tensors(
for (int i = 0; i < device_count; ++i) {
splits[i] /= split_sum;
}
model.splits = std::move(splits);
} else {
model.splits = { 1.0f };
}
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
int device_count = model.splits.size();
// assign the repeating layers to the devices according to the splits
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
for (int i = i_gpu_start; i < n_layer; ++i) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
#ifndef NDEBUG
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
const char* name = ggml_backend_buft_name(buft);
LLAMA_LOG_DEBUG("load_tensors: layers %3d assigned to backend %s\n", i,
name);
#endif
int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
}
// assign the output layer
if (n_gpu_layers > n_layer) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
#ifndef NDEBUG
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
const char* name = ggml_backend_buft_name(buft);
LLAMA_LOG_DEBUG("load_tensors: output layers assigned to backend %s\n",
name);
#endif
int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - model.splits.begin();
model.buft_output = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
}
} else {
ggml_backend_buffer_type_t split_buft;
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], tensor_split);
if (split_mode == LLAMA_SPLIT_MODE_ROW && model.splits.size() > 1) {
split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], model.splits.data());
} else {
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);