mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-27 16:44:21 +00:00
NUMA-aware KV cache buffer type (experimental)
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
This commit is contained in:
@@ -3249,6 +3249,15 @@ static bool llama_kv_cache_init(
|
||||
|
||||
bool warn = true;
|
||||
int n_mla = 0;
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
|
||||
bool is_numa = is_numa_fn();
|
||||
if (!offload && is_numa) {
|
||||
LLAMA_LOG_INFO("%s: NUMA usage detected, using NUMA-aware buffer for KV cache\n", __func__);
|
||||
}
|
||||
|
||||
|
||||
|
||||
for (int i = 0; i < (int) n_layer; i++) {
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
||||
@@ -3257,6 +3266,7 @@ static bool llama_kv_cache_init(
|
||||
const uint32_t n_embd_head_k= hparams.n_embd_head_k;
|
||||
|
||||
|
||||
<<<<<<< HEAD
|
||||
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||
ggml_tensor * k;
|
||||
ggml_tensor * v;
|
||||
|
||||
Reference in New Issue
Block a user