From 262cb8cc6d9ae0a40c0e8582212989d64f41d2a0 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Sun, 12 Oct 2025 15:02:39 +0300
Subject: [PATCH] WIP

---
 src/llama.cpp | 76 ++++++++++++++++----------------------------------
 1 file changed, 24 insertions(+), 52 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 5d39c9af..53ebb075 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2160,35 +2160,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         for (auto& w : workers) w = std::thread(compute, it++);
         compute(it);
         for (auto& w : workers) w.join();
-        if (data) {
-            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                for (int j = 0; j < n_kv; ++j) {
-                    data[i*n_kv + j] = -INFINITY;
-                }
+        int64_t n_tokens_padded = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
+        if (n_tokens_padded > n_tokens) {
+            if (data) {
+                std::fill(data + int64_t(n_tokens)*n_kv, data + n_tokens_padded*n_kv, -INFINITY);
             }
-        }
-        if (data_f16) {
-            ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
-            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                for (int j = 0; j < n_kv; ++j) {
-                    data_f16[i*n_kv + j] = h_inf;
-                }
+            if (data_f16) {
+                ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
+                std::fill(data_f16 + int64_t(n_tokens)*n_kv, data_f16 + n_tokens_padded*n_kv, h_inf);
             }
-        }
-
-        if (data_swa) {
-            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                for (int j = 0; j < n_kv; ++j) {
-                    data_swa[i*n_kv + j] = -INFINITY;
-                }
+            if (data_swa) {
+                std::fill(data_swa + int64_t(n_tokens)*n_kv, data_swa + n_tokens_padded*n_kv, -INFINITY);
             }
-        }
-        if (data_swa_f16) {
-            ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
-            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                for (int j = 0; j < n_kv; ++j) {
-                    data_swa_f16[i*n_kv + j] = h_inf;
-                }
+            if (data_swa_f16) {
+                ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
+                std::fill(data_swa_f16 + int64_t(n_tokens)*n_kv, data_swa_f16 + n_tokens_padded*n_kv, h_inf);
             }
         }
     }
@@ -2248,35 +2234,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                 }
             }
 
-            if (data) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
+            int64_t n_tokens_padded = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
+            if (n_tokens_padded > n_tokens) {
+                if (data) {
+                    std::fill(data + int64_t(n_tokens)*n_kv, data + n_tokens_padded*n_kv, -INFINITY);
                 }
-            }
-            if (data_f16) {
-                ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data_f16[h*(n_kv*n_tokens) + i*n_kv + j] = h_inf;
-                    }
+                if (data_f16) {
+                    ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
+                    std::fill(data_f16 + int64_t(n_tokens)*n_kv, data_f16 + n_tokens_padded*n_kv, h_inf);
                 }
-            }
-
-            if (data_swa) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
+                if (data_swa) {
+                    std::fill(data_swa + int64_t(n_tokens)*n_kv, data_swa + n_tokens_padded*n_kv, -INFINITY);
                 }
-            }
-            if (data_swa_f16) {
-                ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data_swa_f16[h*(n_kv*n_tokens) + i*n_kv + j] = h_inf;
-                    }
+                if (data_swa_f16) {
+                    ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
+                    std::fill(data_swa_f16 + int64_t(n_tokens)*n_kv, data_swa_f16 + n_tokens_padded*n_kv, h_inf);
                 }
             }
         }