mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-23 22:54:10 +00:00
Make it work with partial offload
Tensor overrides are not supported yet; this only handles the case ngl < num_layers.
This commit is contained in:
@@ -179,6 +179,14 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
|
||||
create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_model & _model) : ml(_ml), model(_model) {
|
||||
|
||||
#if 0
|
||||
for (int i = 0; i < model.hparams.n_layer; ++i) {
|
||||
printf("Layer %2d: %s %s\n", i, ggml_backend_buft_name(model.buft_layer[i].buft_matrix), ggml_backend_buft_name(model.buft_layer[i].buft));
|
||||
}
|
||||
printf("Output: %s %s\n", ggml_backend_buft_name(model.buft_output.buft_matrix), ggml_backend_buft_name(model.buft_output.buft));
|
||||
printf(" Input: %s %s\n", ggml_backend_buft_name(model.buft_input.buft_matrix), ggml_backend_buft_name(model.buft_input.buft));
|
||||
#endif
|
||||
|
||||
const int n_layer = model.hparams.n_layer;
|
||||
buft_layer_count[model.buft_input.buft]++;
|
||||
buft_layer_count[model.buft_input.buft_matrix]++;
|
||||
@@ -2927,6 +2935,10 @@ bool create_tensors_helper::create_tensors() {
|
||||
int gqa_ratio = hparams.n_head() / hparams.n_head_kv();
|
||||
//printf("GQA ratio: %d\n", gqa_ratio);
|
||||
for (int il = 0; il < int(model.layers.size()); ++il) {
|
||||
if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) {
|
||||
LLAMA_LOG_INFO("%s: not splitting layer %d because buffer type is host\n", __func__, il);
|
||||
continue;
|
||||
}
|
||||
auto & layer = model.layers[il];
|
||||
auto ctx_split = ctx_for_layer_split(il);
|
||||
if (layer.attn_norm) {
|
||||
@@ -2994,9 +3006,13 @@ bool create_tensors_helper::create_tensors() {
|
||||
}
|
||||
|
||||
if (model.output) {
|
||||
auto ctx_split = ctx_map[model.buft_output.buft_matrix];
|
||||
auto split = create_split(model.output->ne[1], 16, model.splits);
|
||||
prepare_split_tensors(1, ctx_split, model.output, model.split_output, split, mem_used);
|
||||
if (ggml_backend_buft_is_host(model.buft_output.buft_matrix)) {
|
||||
LLAMA_LOG_INFO("%s: not splitting output tensor becausee buffer is host\n", __func__);
|
||||
} else {
|
||||
auto ctx_split = ctx_map[model.buft_output.buft_matrix];
|
||||
auto split = create_split(model.output->ne[1], 16, model.splits);
|
||||
prepare_split_tensors(1, ctx_split, model.output, model.split_output, split, mem_used);
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_INFO("Estimated model buffer size per device:\n");
|
||||
for (int i = 0; i < int(mem_used.size()); ++i) {
|
||||
|
||||
@@ -784,9 +784,10 @@ static bool llama_kv_cache_init(
|
||||
split_v_l.ggml.splits = split_v_l.tensor_splits.data();
|
||||
k->extra = (void *)&split_k_l.ggml;
|
||||
v->extra = (void *)&split_v_l.ggml;
|
||||
} else {
|
||||
printf("Oops: don't have yet K and V for layer %d\n", i);
|
||||
}
|
||||
//} else {
|
||||
// printf("Oops: don't have yet K and V for layer %d\n", i);
|
||||
//}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -800,14 +801,20 @@ static bool llama_kv_cache_init(
|
||||
for (auto it : ctx_map) {
|
||||
ggml_backend_buffer_type_t buft = it.first;
|
||||
ggml_context * ctx = it.second;
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (!buf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
|
||||
return false;
|
||||
int ntensor = 0;
|
||||
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
++ntensor;
|
||||
}
|
||||
if (ntensor > 0) {
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (!buf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
||||
cache.bufs.push_back(buf);
|
||||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
||||
cache.bufs.push_back(buf);
|
||||
}
|
||||
if (split_cache) {
|
||||
LLAMA_LOG_INFO("%s: KV cache size per device:\n", __func__);
|
||||
@@ -1868,24 +1875,33 @@ static bool llm_load_tensors(
|
||||
}
|
||||
#endif
|
||||
else {
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error("unable to allocate backend buffer");
|
||||
int ntensor = 0;
|
||||
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
++ntensor;
|
||||
}
|
||||
model.bufs.push_back(buf);
|
||||
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
|
||||
model.mlock_bufs.emplace_back(new llama_mlock);
|
||||
auto & mlock_buf = model.mlock_bufs.back();
|
||||
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
||||
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
||||
}
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
bufs.emplace(idx, buf);
|
||||
if (ntensor > 0) {
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (buf == nullptr) {
|
||||
LLAMA_LOG_ERROR("Failed to allocate buffer type %s\n", ggml_backend_buft_name(buft));
|
||||
throw std::runtime_error("unable to allocate backend buffer");
|
||||
}
|
||||
model.bufs.push_back(buf);
|
||||
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
|
||||
model.mlock_bufs.emplace_back(new llama_mlock);
|
||||
auto & mlock_buf = model.mlock_bufs.back();
|
||||
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
||||
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
||||
}
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
bufs.emplace(idx, buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (bufs.empty()) {
|
||||
throw std::runtime_error("failed to allocate buffer");
|
||||
LLAMA_LOG_WARN("No tensors in buffer type %s\n", ggml_backend_buft_name(buft));
|
||||
continue;
|
||||
//throw std::runtime_error("failed to allocate buffer (1)");
|
||||
}
|
||||
|
||||
for (auto & buf : bufs) {
|
||||
|
||||
Reference in New Issue
Block a user