diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 90bd99f3..a2a9c327 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -179,6 +179,14 @@ struct create_tensors_helper : public create_tensors_helper_interface {
 };
 
 create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_model & _model) : ml(_ml), model(_model) {
+#if 0
+    for (int i = 0; i < model.hparams.n_layer; ++i) {
+        printf("Layer %2d: %s %s\n", i, ggml_backend_buft_name(model.buft_layer[i].buft_matrix), ggml_backend_buft_name(model.buft_layer[i].buft));
+    }
+    printf("Output: %s %s\n", ggml_backend_buft_name(model.buft_output.buft_matrix), ggml_backend_buft_name(model.buft_output.buft));
+    printf(" Input: %s %s\n", ggml_backend_buft_name(model.buft_input.buft_matrix), ggml_backend_buft_name(model.buft_input.buft));
+#endif
+
     const int n_layer = model.hparams.n_layer;
     buft_layer_count[model.buft_input.buft]++;
     buft_layer_count[model.buft_input.buft_matrix]++;
@@ -2927,6 +2935,10 @@ bool create_tensors_helper::create_tensors() {
     int gqa_ratio = hparams.n_head() / hparams.n_head_kv();
     //printf("GQA ratio: %d\n", gqa_ratio);
     for (int il = 0; il < int(model.layers.size()); ++il) {
+        if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) {
+            LLAMA_LOG_INFO("%s: not splitting layer %d because buffer type is host\n", __func__, il);
+            continue;
+        }
         auto & layer = model.layers[il];
         auto ctx_split = ctx_for_layer_split(il);
         if (layer.attn_norm) {
@@ -2994,9 +3006,13 @@ bool create_tensors_helper::create_tensors() {
     }
 
     if (model.output) {
-        auto ctx_split = ctx_map[model.buft_output.buft_matrix];
-        auto split = create_split(model.output->ne[1], 16, model.splits);
-        prepare_split_tensors(1, ctx_split, model.output, model.split_output, split, mem_used);
+        if (ggml_backend_buft_is_host(model.buft_output.buft_matrix)) {
+            LLAMA_LOG_INFO("%s: not splitting output tensor because buffer is host\n", __func__);
+        } else {
+            auto ctx_split = ctx_map[model.buft_output.buft_matrix];
+            auto split = create_split(model.output->ne[1], 16, model.splits);
+            prepare_split_tensors(1, ctx_split, model.output, model.split_output, split, mem_used);
+        }
     }
     LLAMA_LOG_INFO("Estimated model buffer size per device:\n");
     for (int i = 0; i < int(mem_used.size()); ++i) {
diff --git a/src/llama.cpp b/src/llama.cpp
index c6b5ea83..8856807d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -784,9 +784,10 @@ static bool llama_kv_cache_init(
                     split_v_l.ggml.splits = split_v_l.tensor_splits.data();
                     k->extra = (void *)&split_k_l.ggml;
                     v->extra = (void *)&split_v_l.ggml;
-                } else {
-                    printf("Oops: don't have yet K and V for layer %d\n", i);
                 }
+                //} else {
+                //    printf("Oops: don't have yet K and V for layer %d\n", i);
+                //}
             }
         }
     }
@@ -800,14 +801,20 @@ static bool llama_kv_cache_init(
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-        if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
-            return false;
+        int ntensor = 0;
+        for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            ++ntensor;
+        }
+        if (ntensor > 0) {
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            if (!buf) {
+                LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+                return false;
+            }
+            ggml_backend_buffer_clear(buf, 0);
+            LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+            cache.bufs.push_back(buf);
         }
-        ggml_backend_buffer_clear(buf, 0);
-        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-        cache.bufs.push_back(buf);
     }
     if (split_cache) {
         LLAMA_LOG_INFO("%s: KV cache size per device:\n", __func__);
@@ -1868,24 +1875,33 @@ static bool llm_load_tensors(
         }
 #endif
         else {
-            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-            if (buf == nullptr) {
-                throw std::runtime_error("unable to allocate backend buffer");
+            int ntensor = 0;
+            for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+                ++ntensor;
             }
-            model.bufs.push_back(buf);
-            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
-                model.mlock_bufs.emplace_back(new llama_mlock);
-                auto & mlock_buf = model.mlock_bufs.back();
-                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
-                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
-            }
-            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
-                bufs.emplace(idx, buf);
+            if (ntensor > 0) {
+                ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+                if (buf == nullptr) {
+                    LLAMA_LOG_ERROR("Failed to allocate buffer type %s\n", ggml_backend_buft_name(buft));
+                    throw std::runtime_error("unable to allocate backend buffer");
+                }
+                model.bufs.push_back(buf);
+                if (use_mlock && ggml_backend_buffer_is_host(buf)) {
+                    model.mlock_bufs.emplace_back(new llama_mlock);
+                    auto & mlock_buf = model.mlock_bufs.back();
+                    mlock_buf->init   (ggml_backend_buffer_get_base(buf));
+                    mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
+                }
+                for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                    bufs.emplace(idx, buf);
+                }
             }
         }
         if (bufs.empty()) {
-            throw std::runtime_error("failed to allocate buffer");
+            LLAMA_LOG_WARN("No tensors in buffer type %s\n", ggml_backend_buft_name(buft));
+            continue;
+            //throw std::runtime_error("failed to allocate buffer (1)");
        }
        for (auto & buf : bufs) {
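
Note on the `ggml_backend_buft_is_host()` guards added in `create_tensors()`: row-splitting a tensor across devices only makes sense for device buffer types; when a layer's (or the output tensor's) matrix buffer type is host, the whole tensor already lives in system memory and there is nothing to distribute. A minimal sketch of that decision, using only the real ggml-backend API (`should_split_across_devices` is a helper name invented here, not part of the patch):

```cpp
#include "ggml-backend.h"

// A matrix tensor is a candidate for splitting across devices only when its
// buffer type is a device type; host (CPU) buffer types keep the full tensor
// in system memory, so splitting is skipped for them.
static bool should_split_across_devices(ggml_backend_buffer_type_t buft_matrix) {
    return !ggml_backend_buft_is_host(buft_matrix);
}
```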
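Note on the repeated tensor-counting loops: both `llama_kv_cache_init` and `llm_load_tensors` now walk a context's tensor list and skip buffer allocation when the context holds no tensors, which can happen once the splitting guards above leave a buffer type empty. A self-contained sketch of the pattern (the `count_ctx_tensors`/`alloc_ctx_if_nonempty` helper names are invented for illustration; the patch inlines the counting loop at both call sites):

```cpp
#include <stddef.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Count the tensors registered in a ggml context by walking its tensor list.
static int count_ctx_tensors(struct ggml_context * ctx) {
    int n = 0;
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        ++n;
    }
    return n;
}

// Allocate backend buffers for a context only if it actually holds tensors;
// an empty context is skipped instead of being treated as an allocation failure.
static ggml_backend_buffer_t alloc_ctx_if_nonempty(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    if (count_ctx_tensors(ctx) == 0) {
        return NULL; // nothing to allocate for this buffer type; caller skips it
    }
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
}
```

With a helper like this, a NULL return would be ambiguous between "empty context" and a real allocation failure; checking the count before calling the allocator, as the patch does inline, keeps allocation failures fatal while empty contexts are merely skipped (hence the `LLAMA_LOG_WARN` plus `continue` replacing the unconditional throw on `bufs.empty()`).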