mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-01 17:40:25 +00:00
WIP
This commit is contained in:
@@ -1277,10 +1277,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
}
|
||||
else if (arg_next == "row") {
|
||||
fprintf(stderr, "\n\n=====================================================================================\n");
|
||||
fprintf(stderr, " Split mode row is no longer supported\n");
|
||||
fprintf(stderr, "=====================================================================================\n\n\n");
|
||||
GGML_ABORT("fatal error");
|
||||
//fprintf(stderr, "\n\n=====================================================================================\n");
|
||||
//fprintf(stderr, " Split mode row is no longer supported\n");
|
||||
//fprintf(stderr, "=====================================================================================\n\n\n");
|
||||
//GGML_ABORT("fatal error");
|
||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
||||
}
|
||||
else {
|
||||
|
||||
@@ -3021,6 +3021,13 @@ extern "C" {
|
||||
|
||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
|
||||
typedef struct {
|
||||
int n_device;
|
||||
int split_dim;
|
||||
struct ggml_tensor * tensor;
|
||||
struct ggml_tensor ** splits;
|
||||
} ggml_split_tensor_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -785,90 +785,145 @@ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buf
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
|
||||
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
||||
ctx->tensor_extras.push_back(extra);
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
||||
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
if (nrows_split == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
||||
const size_t original_size = size;
|
||||
|
||||
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
if (!tensor->extra) return;
|
||||
printf("%s(%s, %p)\n", __func__, tensor->name, tensor->extra);
|
||||
auto extra = (ggml_split_tensor_t *)tensor->extra;
|
||||
GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
|
||||
for (int i = 0; i < extra->n_device; ++i) {
|
||||
if (!extra->splits[i]) continue;
|
||||
auto split = extra->splits[i];
|
||||
auto ne0 = split->ne[0];
|
||||
auto size = ggml_nbytes(split);
|
||||
auto padded_size = size;
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
int nblock = (ne0 + MATRIX_ROW_PADDING - 1)/MATRIX_ROW_PADDING;
|
||||
auto padded_row_size = ggml_row_size(split->type, nblock*MATRIX_ROW_PADDING);
|
||||
auto row_size = ggml_row_size(split->type, ne0);
|
||||
padded_size += padded_row_size - row_size;
|
||||
}
|
||||
|
||||
// FIXME: do not crash if cudaMalloc fails
|
||||
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||
ggml_cuda_set_device(id);
|
||||
ggml_cuda_set_device(i);
|
||||
char * buf;
|
||||
CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
|
||||
|
||||
// set padding to 0 to avoid possible NaN values
|
||||
if (size > original_size) {
|
||||
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
||||
}
|
||||
|
||||
extra->data_device[id] = buf;
|
||||
|
||||
for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
||||
CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, padded_size, i));
|
||||
if (padded_size > size) {
|
||||
CUDA_CHECK(cudaMemset(buf + size, 0, padded_size - size));
|
||||
}
|
||||
printf(" allocated %zu bytes for tensor %s of type %s, dim = %ld x %ld x %ld. padding: %zu\n", padded_size, split->name, ggml_type_name(split->type),
|
||||
split->ne[0], split->ne[1], split->ne[2], padded_size - size);
|
||||
split->data = buf;
|
||||
}
|
||||
tensor->extra = extra;
|
||||
return;
|
||||
|
||||
//if (tensor->view_src != nullptr) fprintf(stderr, "%s: tensor %s is a view into %s\n", __func__, tensor->name, tensor->view_src->name);
|
||||
//GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
||||
//ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
//ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
//const int64_t ne0 = tensor->ne[0];
|
||||
|
||||
//ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
||||
//ctx->tensor_extras.push_back(extra);
|
||||
|
||||
//for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
// int64_t row_low, row_high;
|
||||
// get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
||||
|
||||
// int64_t nrows_split = row_high - row_low;
|
||||
// if (nrows_split == 0) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
// size_t size = ggml_nbytes_split(tensor, nrows_split);
|
||||
// const size_t original_size = size;
|
||||
|
||||
// // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
||||
// if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
// size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
// }
|
||||
|
||||
// // FIXME: do not crash if cudaMalloc fails
|
||||
// // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||
// ggml_cuda_set_device(id);
|
||||
// char * buf;
|
||||
// CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
|
||||
|
||||
// // set padding to 0 to avoid possible NaN values
|
||||
// if (size > original_size) {
|
||||
// CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
||||
// }
|
||||
|
||||
// extra->data_device[id] = buf;
|
||||
|
||||
// for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
|
||||
// CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
||||
// }
|
||||
//}
|
||||
//tensor->extra = extra;
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
if (!tensor->extra) return;
|
||||
printf("%s(%s)\n", __func__, tensor->name);
|
||||
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
auto extra = (ggml_split_tensor_t *)tensor->extra;
|
||||
GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
|
||||
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
const size_t nb1 = tensor->nb[1];
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
||||
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
if (nrows_split == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t offset_split = row_low*nb1;
|
||||
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
||||
const size_t original_size = size;
|
||||
|
||||
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
|
||||
const char * buf_host = (const char *)data + offset_split;
|
||||
CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
||||
if (extra->split_dim != 0) {
|
||||
fprintf(stderr, "Split tensor copy not yet immplemented for dim 0\n");
|
||||
return;
|
||||
}
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
size_t cur_offset = 0;
|
||||
for (int i = 0; i < extra->n_device; ++i) {
|
||||
auto split = extra->splits[i];
|
||||
if (!split) continue;
|
||||
auto size = ggml_nbytes(split);
|
||||
const char * buf_host = (const char *)data + cur_offset;
|
||||
CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
||||
cur_offset += size;
|
||||
}
|
||||
|
||||
for (int i = 0; i < extra->n_device; ++i) {
|
||||
if (!extra->splits[i]) continue;
|
||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
|
||||
//ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
//const int64_t ne0 = tensor->ne[0];
|
||||
//const size_t nb1 = tensor->nb[1];
|
||||
//ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
||||
|
||||
//for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
// int64_t row_low, row_high;
|
||||
// get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
||||
|
||||
// int64_t nrows_split = row_high - row_low;
|
||||
// if (nrows_split == 0) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
// const size_t offset_split = row_low*nb1;
|
||||
// size_t size = ggml_nbytes_split(tensor, nrows_split);
|
||||
// const size_t original_size = size;
|
||||
|
||||
// // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
||||
// if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
// size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
// }
|
||||
|
||||
// const char * buf_host = (const char *)data + offset_split;
|
||||
// CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
||||
//}
|
||||
|
||||
//for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
// CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
//}
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
|
||||
@@ -57,6 +57,9 @@ struct llama_kv_cache {
|
||||
std::vector<struct ggml_tensor *> k_l; // per layer
|
||||
std::vector<struct ggml_tensor *> v_l;
|
||||
|
||||
std::vector<llama_split_tensor> split_k_l;
|
||||
std::vector<llama_split_tensor> split_v_l;
|
||||
|
||||
std::vector<struct ggml_context *> ctxs;
|
||||
std::vector<ggml_backend_buffer_t> bufs;
|
||||
|
||||
|
||||
@@ -224,3 +224,8 @@ struct gguf_context;
|
||||
std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
|
||||
|
||||
ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
|
||||
|
||||
struct llama_split_tensor {
|
||||
std::vector<ggml_tensor *> tensor_splits;
|
||||
ggml_split_tensor_t ggml;
|
||||
};
|
||||
|
||||
@@ -192,6 +192,11 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
|
||||
ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
||||
ctx_size += ggml_tensor_overhead()*n_layer*3; // for moe merged tensors
|
||||
|
||||
if (model.splits.size() > 1) {
|
||||
ctx_size += ggml_tensor_overhead()*n_layer*4; // for KV cache
|
||||
ctx_size *= model.splits.size();
|
||||
}
|
||||
|
||||
for (auto & it : buft_layer_count) {
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
@@ -205,10 +210,78 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
|
||||
ctx_map[it.first] = ctx;
|
||||
model.ctxs.push_back(ctx);
|
||||
}
|
||||
printf("=======================================================================\n");
|
||||
auto n_device = model.device_count();
|
||||
printf(" Model has %d devices:\n", n_device);
|
||||
for (int device = 0; device < n_device; ++device) {
|
||||
auto buft = model.default_buffer_type_offload(device);
|
||||
if (buft) {
|
||||
printf(" %d %s\n", device, ggml_backend_buft_name(buft));
|
||||
} else {
|
||||
printf(" Oops: null buft for debvice %d\n", device);
|
||||
}
|
||||
}
|
||||
if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
printf("model.splits:");
|
||||
for (auto s : model.splits) printf(" %g", s);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
static std::vector<int> create_split(int nr, int granularity, const std::vector<float> & splits) {
|
||||
GGML_ASSERT(nr % granularity == 0);
|
||||
GGML_ASSERT(!splits.empty());
|
||||
int nchunk = nr / granularity;
|
||||
std::vector<int> result(splits.size());
|
||||
float last_split = 0;
|
||||
int sum = 0;
|
||||
for (int i = 0; i < (int)splits.size(); ++i) {
|
||||
float p = splits[i] - last_split;
|
||||
result[i] = roundf(p*nchunk);
|
||||
sum += result[i];
|
||||
last_split = splits[i];
|
||||
}
|
||||
while (sum > nchunk) {
|
||||
last_split = 0;
|
||||
float best_err = std::numeric_limits<float>::max();
|
||||
int ibest = -1;
|
||||
for (int i = 0; i < (int)splits.size(); ++i) {
|
||||
if (result[i] > 0) {
|
||||
float n_want = (splits[i] - last_split)*nchunk;
|
||||
float err = std::abs(n_want - result[i] + 1);
|
||||
if (err < best_err) {
|
||||
best_err = err; ibest = i;
|
||||
}
|
||||
}
|
||||
last_split = splits[i];
|
||||
}
|
||||
GGML_ASSERT(ibest >= 0 && result[ibest] > 0);
|
||||
--result[ibest];
|
||||
--sum;
|
||||
}
|
||||
while (sum < nchunk) {
|
||||
last_split = 0;
|
||||
float best_err = std::numeric_limits<float>::max();
|
||||
int ibest = -1;
|
||||
for (int i = 0; i < (int)splits.size(); ++i) {
|
||||
float n_want = (splits[i] - last_split)*nchunk;
|
||||
float err = std::abs(n_want - result[i] - 1);
|
||||
if (err < best_err) {
|
||||
best_err = err; ibest = i;
|
||||
}
|
||||
last_split = splits[i];
|
||||
}
|
||||
GGML_ASSERT(ibest >= 0);
|
||||
++result[ibest];
|
||||
++sum;
|
||||
}
|
||||
for (auto & r : result) r *= granularity;
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
|
||||
int flags, ggml_context ** actual_context) {
|
||||
//auto requested_ctx = ctx;
|
||||
if (ml.tensor_buft_overrides) {
|
||||
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
||||
std::regex pattern(overrides->pattern);
|
||||
@@ -220,7 +293,40 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
|
||||
}
|
||||
}
|
||||
if (actual_context) *actual_context = ctx;
|
||||
return ml.create_tensor(ctx, name, ne, flags);
|
||||
auto tensor = ml.create_tensor(ctx, name, ne, flags);
|
||||
//if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
// int i_layer = -1;
|
||||
// if (auto pos = name.find("blk."); pos == 0) {
|
||||
// GGML_ASSERT(sscanf(name.c_str(), "blk.%d.", &i_layer) == 1);
|
||||
// }
|
||||
// if (i_layer >= 0) {
|
||||
// auto & layer = model.layers[i_layer];
|
||||
// auto & hparams = model.hparams;
|
||||
// if (auto pos = name.find("attn_q.weight"); pos != std::string::npos) {
|
||||
// auto split = create_split(tensor->ne[1], hparams.n_embd_head_k, model.splits);
|
||||
// printf("%s(%s):", __func__, name.c_str());
|
||||
// for (auto s : split) printf(" %d", s);
|
||||
// printf("\n");
|
||||
// layer.split_wq.tensor_splits.resize(split.size());
|
||||
// size_t offset = 0;
|
||||
// for (int i = 0; i < (int)split.size(); ++i) {
|
||||
// if (split[i] > 0) {
|
||||
// layer.split_wq.tensor_splits[i] = ggml_view_2d(ctx, tensor, tensor->ne[0], split[i], tensor->nb[1], offset);
|
||||
// auto split_name = name + '.' + std::to_string(i);
|
||||
// ggml_set_name(layer.split_wq.tensor_splits[i], split_name.c_str());
|
||||
// offset += tensor->nb[1]*split[i];
|
||||
// } else {
|
||||
// layer.split_wq.tensor_splits[i] = nullptr;
|
||||
// }
|
||||
// }
|
||||
// layer.split_wq.ggml.n_device = split.size();
|
||||
// layer.split_wq.ggml.split_dim = 1;
|
||||
// layer.split_wq.ggml.splits = layer.split_wq.tensor_splits.data();
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
return tensor;
|
||||
//return ml.create_tensor(ctx, name, ne, flags);
|
||||
}
|
||||
|
||||
#define LOADING_PRELUDE \
|
||||
@@ -2638,6 +2744,40 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i
|
||||
return fused_qkv;
|
||||
}
|
||||
|
||||
static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor * tensor, llama_split_tensor & split_tensor, const std::vector<int> & splits) {
|
||||
GGML_ASSERT(split_dim == 0 || split_dim == 1);
|
||||
GGML_ASSERT(splits.size() > 1);
|
||||
std::string name{tensor->name};
|
||||
split_tensor.tensor_splits.resize(splits.size());
|
||||
if (split_dim == 1) {
|
||||
size_t offset = 0;
|
||||
for (int i = 0; i < int(splits.size()); ++i) {
|
||||
if (splits[i] > 0) {
|
||||
split_tensor.tensor_splits[i] = ggml_view_3d(ctx, tensor, tensor->ne[0], splits[i], tensor->ne[2], tensor->nb[1], tensor->nb[2], offset);
|
||||
auto name_i = name + '.' + std::to_string(i);
|
||||
ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
|
||||
offset += tensor->nb[1]*splits[i];
|
||||
} else {
|
||||
split_tensor.tensor_splits[i] = nullptr;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < int(splits.size()); ++i) {
|
||||
if (splits[i] > 0) {
|
||||
split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, splits[i], tensor->ne[1], tensor->ne[2]);
|
||||
auto name_i = name + '.' + std::to_string(i);
|
||||
ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
|
||||
} else {
|
||||
split_tensor.tensor_splits[i] = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
split_tensor.ggml.n_device = splits.size();
|
||||
split_tensor.ggml.split_dim = split_dim;
|
||||
split_tensor.ggml.splits = split_tensor.tensor_splits.data();
|
||||
tensor->extra = (void *)&split_tensor.ggml;
|
||||
}
|
||||
|
||||
bool create_tensors_helper::create_tensors() {
|
||||
const auto tn = LLM_TN(model.arch);
|
||||
bool use_mmap_buffer = true;
|
||||
@@ -2761,6 +2901,71 @@ bool create_tensors_helper::create_tensors() {
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
const auto & hparams = model.hparams;
|
||||
int gqa_ratio = hparams.n_head() / hparams.n_head_kv();
|
||||
printf("GQA ratio: %d\n", gqa_ratio);
|
||||
for (int il = 0; il < int(model.layers.size()); ++il) {
|
||||
auto & layer = model.layers[il];
|
||||
auto ctx_split = ctx_for_layer_split(il);
|
||||
if (layer.wo && layer.wq && layer.wk && layer.wv) {
|
||||
int attn_granularity = hparams.n_embd_head_k;
|
||||
if (ggml_is_quantized(layer.wo->type)) {
|
||||
auto tt = ggml_internal_get_type_traits(layer.wo->type);
|
||||
if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size;
|
||||
}
|
||||
GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0);
|
||||
auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits);
|
||||
prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split);
|
||||
prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split);
|
||||
for (auto & s : split) s /= gqa_ratio;
|
||||
prepare_split_tensors(1, ctx_split, layer.wk, layer.split_wk, split);
|
||||
prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split);
|
||||
}
|
||||
|
||||
if (layer.ffn_down && layer.ffn_up && layer.ffn_gate) {
|
||||
int ffn_granularity = 16;
|
||||
if (ggml_is_quantized(layer.ffn_down->type)) {
|
||||
auto tt = ggml_internal_get_type_traits(layer.ffn_down->type);
|
||||
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
|
||||
}
|
||||
auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, model.splits);
|
||||
prepare_split_tensors(0, ctx_split, layer.ffn_down, layer.split_ffn_down, split);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_up, layer.split_ffn_up, split);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split);
|
||||
}
|
||||
|
||||
if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
|
||||
int ffn_granularity = 16;
|
||||
if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
|
||||
auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
|
||||
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
|
||||
}
|
||||
auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits);
|
||||
prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split);
|
||||
}
|
||||
|
||||
if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
|
||||
int ffn_granularity = 16;
|
||||
if (ggml_is_quantized(layer.ffn_down_exps->type)) {
|
||||
auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
|
||||
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
|
||||
}
|
||||
auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits);
|
||||
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split);
|
||||
}
|
||||
}
|
||||
|
||||
if (model.output) {
|
||||
auto ctx_split = ctx_map[model.buft_output.buft_matrix];
|
||||
auto split = create_split(model.output->ne[1], 16, model.splits);
|
||||
prepare_split_tensors(1, ctx_split, model.output, model.split_output, split);
|
||||
}
|
||||
}
|
||||
return use_mmap_buffer;
|
||||
}
|
||||
|
||||
|
||||
@@ -183,6 +183,21 @@ struct llama_layer {
|
||||
struct ggml_tensor * bqk = nullptr;
|
||||
struct ggml_tensor * bkv = nullptr;
|
||||
|
||||
llama_split_tensor split_wq;
|
||||
llama_split_tensor split_wk;
|
||||
llama_split_tensor split_wv;
|
||||
llama_split_tensor split_wo;
|
||||
llama_split_tensor split_wqkv;
|
||||
llama_split_tensor split_wqk;
|
||||
llama_split_tensor split_wkv;
|
||||
llama_split_tensor split_bq;
|
||||
llama_split_tensor split_bk;
|
||||
llama_split_tensor split_bv;
|
||||
llama_split_tensor split_bo;
|
||||
llama_split_tensor split_bqkv;
|
||||
llama_split_tensor split_bqk;
|
||||
llama_split_tensor split_bkv;
|
||||
|
||||
// relative position bias
|
||||
struct ggml_tensor * attn_rel_b = nullptr;
|
||||
struct ggml_tensor * attn_rel_b_enc = nullptr;
|
||||
@@ -205,12 +220,20 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_down_enc = nullptr;
|
||||
struct ggml_tensor * ffn_up_enc = nullptr;
|
||||
|
||||
llama_split_tensor split_ffn_up;
|
||||
llama_split_tensor split_ffn_gate;
|
||||
llama_split_tensor split_ffn_down;
|
||||
|
||||
// ff MoE
|
||||
struct ggml_tensor * ffn_gate_inp = nullptr;
|
||||
struct ggml_tensor * ffn_gate_exps = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps = nullptr;
|
||||
|
||||
llama_split_tensor split_ffn_up_exps;
|
||||
llama_split_tensor split_ffn_gate_exps;
|
||||
llama_split_tensor split_ffn_down_exps;
|
||||
|
||||
// ff MoE bias
|
||||
struct ggml_tensor * ffn_gate_inp_b = nullptr;
|
||||
struct ggml_tensor * ffn_gate_exps_b = nullptr;
|
||||
@@ -226,6 +249,10 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_down_shexp = nullptr;
|
||||
struct ggml_tensor * ffn_up_shexp = nullptr;
|
||||
|
||||
llama_split_tensor split_ffn_up_shexp;
|
||||
llama_split_tensor split_ffn_gate_shexp;
|
||||
llama_split_tensor split_ffn_down_shexp;
|
||||
|
||||
// ff bias
|
||||
struct ggml_tensor * ffn_gate_b = nullptr;
|
||||
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
||||
@@ -298,6 +325,8 @@ struct llama_model {
|
||||
struct ggml_tensor * output_b;
|
||||
struct ggml_tensor * output_norm_enc;
|
||||
|
||||
llama_split_tensor split_output;
|
||||
|
||||
std::vector<llama_layer> layers;
|
||||
|
||||
llama_split_mode split_mode;
|
||||
@@ -358,6 +387,11 @@ struct llama_model {
|
||||
}
|
||||
|
||||
void set_tensor_overrides(const llama_model_params& params);
|
||||
|
||||
int device_count() const;
|
||||
ggml_backend_buffer_type_t default_buffer_type_offload(int device) const;
|
||||
|
||||
std::vector<float> splits;
|
||||
};
|
||||
|
||||
struct llama_lora_weight {
|
||||
|
||||
114
src/llama.cpp
114
src/llama.cpp
@@ -483,6 +483,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
||||
GGML_UNUSED(tensor_split);
|
||||
}
|
||||
|
||||
int llama_model::device_count() const {
|
||||
return llama_get_device_count(*this);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t llama_model::default_buffer_type_offload(int device) const {
|
||||
return llama_default_buffer_type_offload(*this, device);
|
||||
}
|
||||
|
||||
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||
#if defined(GGML_USE_RPC)
|
||||
int dev_count = (int)llama_get_device_count(model);
|
||||
@@ -626,42 +634,35 @@ static bool llama_kv_cache_init(
|
||||
}
|
||||
}
|
||||
|
||||
bool split_cache = false;
|
||||
if (model.split_mode == LLAMA_SPLIT_MODE_ROW && model.arch != LLM_ARCH_DEEPSEEK2 && offload) {
|
||||
cache.split_k_l.reserve(n_layer);
|
||||
cache.split_v_l.reserve(n_layer);
|
||||
split_cache = true;
|
||||
}
|
||||
|
||||
// count used buffer types
|
||||
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
||||
if (offload) {
|
||||
for (int64_t i = 0; i < n_layer; ++i) {
|
||||
buft_layer_count[model.buft_layer[i].buft]++;
|
||||
if (split_cache) {
|
||||
buft_layer_count[model.buft_layer[i].buft_matrix]++;
|
||||
} else {
|
||||
buft_layer_count[model.buft_layer[i].buft]++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
|
||||
}
|
||||
|
||||
//if (cparams.fused_moe_up_gate) {
|
||||
// int nbad = 0;
|
||||
// for (int i = 0; i < (int) n_layer; i++) {
|
||||
// auto& layer = model.layers[i];
|
||||
// if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
|
||||
// ++nbad;
|
||||
// }
|
||||
// }
|
||||
// if (nbad > 0) {
|
||||
// if (nbad == (int)n_layer) {
|
||||
// LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
|
||||
// const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
|
||||
// }
|
||||
// else {
|
||||
// LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
|
||||
// nbad, (int)n_layer);
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
// create a context for each buffer type
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
for (auto & it : buft_layer_count) {
|
||||
int n_layers = it.second;
|
||||
size_t ctx_mem_size = 5u*n_layers*ggml_tensor_overhead();
|
||||
if (split_cache) ctx_mem_size += 2*model.splits.size()*n_layers*ggml_tensor_overhead();
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ 5u*n_layers*ggml_tensor_overhead(),
|
||||
/*.mem_size =*/ ctx_mem_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
@@ -698,8 +699,8 @@ static bool llama_kv_cache_init(
|
||||
}
|
||||
}
|
||||
|
||||
cache.k_l.reserve(n_layer);
|
||||
bool needs_v_cache = true;
|
||||
cache.k_l.reserve(n_layer);
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
|
||||
needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn;
|
||||
}
|
||||
@@ -711,11 +712,10 @@ static bool llama_kv_cache_init(
|
||||
const uint32_t n_head_kv = hparams.n_head_kv(i);
|
||||
const uint32_t n_embd_head_k= hparams.n_embd_head_k;
|
||||
|
||||
|
||||
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||
struct ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||
ggml_tensor * k;
|
||||
ggml_tensor * v;
|
||||
if (cparams.mla_attn) {
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
|
||||
// DeepSeek MLA
|
||||
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
||||
@@ -744,6 +744,36 @@ static bool llama_kv_cache_init(
|
||||
ggml_format_name(v, "cache_v_l%d", i);
|
||||
cache.k_l.push_back(k);
|
||||
cache.v_l.push_back(v);
|
||||
if (split_cache) {
|
||||
auto K = model.layers[i].wk;
|
||||
auto V = model.layers[i].wv;
|
||||
if (K && V && K->extra && V->extra) {
|
||||
auto extra_K = (const ggml_split_tensor_t *)K->extra;
|
||||
auto extra_V = (const ggml_split_tensor_t *)V->extra;
|
||||
auto & split_k_l = cache.split_k_l.emplace_back();
|
||||
auto & split_v_l = cache.split_v_l.emplace_back();
|
||||
split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
|
||||
split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
|
||||
for (int is = 0; is < extra_K->n_device; ++is) {
|
||||
auto split = extra_K->splits[is];
|
||||
if (!split) continue;
|
||||
split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, split->ne[1]/n_embd_head_k * kv_size);
|
||||
}
|
||||
split_k_l.ggml.n_device = extra_K->n_device;
|
||||
split_k_l.ggml.split_dim = 0;
|
||||
split_k_l.ggml.splits = split_k_l.tensor_splits.data();
|
||||
for (int is = 0; is < extra_V->n_device; ++is) {
|
||||
auto split = extra_V->splits[is];
|
||||
if (!split) continue;
|
||||
split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size);
|
||||
}
|
||||
split_v_l.ggml.n_device = extra_V->n_device;
|
||||
split_v_l.ggml.split_dim = 0;
|
||||
split_v_l.ggml.splits = split_v_l.tensor_splits.data();
|
||||
} else {
|
||||
printf("Oops: don't have yet K and V for layer %d\n", i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
|
||||
@@ -1652,10 +1682,7 @@ static bool llm_load_tensors(
|
||||
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
|
||||
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
||||
// calculate the split points
|
||||
// int device_count = llama_get_device_count(model);
|
||||
int device_count = model.devices.size();
|
||||
if (int device_count = model.devices.size(); device_count > 1) {
|
||||
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
||||
std::vector<float> splits(device_count);
|
||||
if (all_zero) {
|
||||
@@ -1676,36 +1703,31 @@ static bool llm_load_tensors(
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
splits[i] /= split_sum;
|
||||
}
|
||||
model.splits = std::move(splits);
|
||||
} else {
|
||||
model.splits = { 1.0f };
|
||||
}
|
||||
|
||||
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
||||
|
||||
int device_count = model.splits.size();
|
||||
// assign the repeating layers to the devices according to the splits
|
||||
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
||||
for (int i = i_gpu_start; i < n_layer; ++i) {
|
||||
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
|
||||
#ifndef NDEBUG
|
||||
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
|
||||
const char* name = ggml_backend_buft_name(buft);
|
||||
LLAMA_LOG_DEBUG("load_tensors: layers %3d assigned to backend %s\n", i,
|
||||
name);
|
||||
#endif
|
||||
int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
|
||||
model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
|
||||
}
|
||||
// assign the output layer
|
||||
if (n_gpu_layers > n_layer) {
|
||||
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
|
||||
#ifndef NDEBUG
|
||||
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
|
||||
const char* name = ggml_backend_buft_name(buft);
|
||||
LLAMA_LOG_DEBUG("load_tensors: output layers assigned to backend %s\n",
|
||||
name);
|
||||
#endif
|
||||
int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - model.splits.begin();
|
||||
model.buft_output = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
|
||||
} else {
|
||||
model.buft_output = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
} else {
|
||||
ggml_backend_buffer_type_t split_buft;
|
||||
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], tensor_split);
|
||||
if (split_mode == LLAMA_SPLIT_MODE_ROW && model.splits.size() > 1) {
|
||||
split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], model.splits.data());
|
||||
} else {
|
||||
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
|
||||
split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
|
||||
|
||||
Reference in New Issue
Block a user