diff --git a/common/common.cpp b/common/common.cpp
index 9f1ce736..ad1bdce2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1394,6 +1394,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.merge_qkv = true;
         return true;
     }
+    if (arg == "-khad" || arg == "--k-cache-hadamard") {
+        params.k_cache_hadamard = true;
+        return true;
+    }
     if (arg == "--numa") {
         CHECK_ARG
         std::string value(argv[i]);
@@ -2074,6 +2078,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-gr, --graph-reuse", "enable graph reuse (default: %s)", params.graph_reuse ? "enabled" : "disabled" });
     options.push_back({ "*", "-ser, --smart-expert-reduction", "experts reduction (default: %d,%g)", params.min_experts, params.thresh_experts});
     options.push_back({ "*", "-mqkv, --merge-qkv,", "merge Q,K,V (default: %d)", params.merge_qkv});
+    options.push_back({ "*", "-khad, --k-cache-hadamard,", "use Hadamard transform for the K-cache (default: %d)", params.k_cache_hadamard});
     options.push_back({ "*", "-vq, --validate-quants", "validate quantized data while loading the model (default: %d)", params.validate_quants});
     options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
                        "in conversation mode, this will be used as system prompt\n"
@@ -3063,9 +3068,11 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.fused_mmad = params.fused_mmad;
     cparams.rope_cache = params.rope_cache;
     cparams.graph_reuse = params.graph_reuse;
+    cparams.k_cache_hadamard = params.k_cache_hadamard;
     cparams.min_experts = params.min_experts;
     cparams.thresh_experts = params.thresh_experts;
     cparams.only_active_experts = params.only_active_exps;
+    cparams.k_cache_hadamard = params.k_cache_hadamard;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -4209,6 +4216,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "fused_mmad: %s # default: true\n", params.fused_mmad ? "true" : "false");
     fprintf(stream, "rope_cache: %s # default: false\n", params.rope_cache ? "true" : "false");
     fprintf(stream, "graph_reuse: %s # default: false\n", params.graph_reuse ? "true" : "false");
+    fprintf(stream, "k_cache_hadamard: %s # default: false\n", params.k_cache_hadamard ? "true" : "false");
     fprintf(stream, "ser: %d,%g # defaulr: -1,0\n", params.min_experts, params.thresh_experts);
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
diff --git a/common/common.h b/common/common.h
index 48eb1966..e23e18b3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -276,6 +276,7 @@ struct gpt_params {
     bool validate_quants = false; // if true, check for NaNs while loading the model
     bool only_active_exps = true; // if true, offload only active experts (relevant only for hybrid CPU/GPU)
     bool merge_qkv = false; // if true, merge separate Q, K, V tensors into a single, contiguous tensor
+    bool k_cache_hadamard = false; // if true, use Hadamard transform for the K-cache (only makes sense with quantized cache)
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index d2a0cf85..c1b4c8ef 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -621,6 +621,7 @@ extern "C" {
         GGML_OP_FUSED_UP_GATE,
         GGML_OP_MOE_FUSED_UP_GATE,
         GGML_OP_MUL_MULTI_ADD,
+        GGML_OP_HADAMARD,
 
         GGML_OP_SCALE,
         GGML_OP_SET,
@@ -1092,6 +1093,11 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_hadamard(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n);
+
     // dst = a
     // view(dst, nb1, nb2, nb3, offset) += b
     // return dst
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index cb9c7562..b82bab36 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4223,6 +4223,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "FUSED_UP_GATE",
     "MOE_FUSED_UP_GATE",
     "MUL_MULTI_ADD",
+    "HADAMARD",
 
     "SCALE",
     "SET",
@@ -4292,7 +4293,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
+static_assert(GGML_OP_COUNT == 92, "GGML_OP_COUNT != 92");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -4330,6 +4331,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "X*Y1&X*Y2",
     "X*Y1&X*Y2",
     "x1*y1+x2*y2+...",
+    "hadamard(x)",
 
     "x*v",
     "y-\\>view(x)",
@@ -4399,7 +4401,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x),"
 };
 
-static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
+static_assert(GGML_OP_COUNT == 92, "GGML_OP_COUNT != 92");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -6147,6 +6149,38 @@ struct ggml_tensor * ggml_mul_multi_add(
     return result;
 }
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#include
+#include
+#include
+#include
+#include
+inline int popcount(uint32_t x) { return __popcnt(x); }
+#else
+inline int popcount(uint32_t x) { return __builtin_popcount(x); }
+#endif
+
+struct ggml_tensor * ggml_hadamard(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n) {
+
+    GGML_ASSERT(a->type == GGML_TYPE_F32); // will not bother implementing for other data types
+    GGML_ASSERT(n > 1); // no point in Hadamard transforms with less than 2 elements
+    GGML_ASSERT(a->ne[0] % n == 0);
+    GGML_ASSERT(popcount(n) == 1); // must be a power of 2
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
+
+    result->op = GGML_OP_HADAMARD;
+    result->src[0] = a;
+
+    result->op_params[0] = n;
+
+    return result;
+}
+
 // ggml_add_cast
 
 static struct ggml_tensor * ggml_add_cast_impl(
@@ -22660,6 +22694,10 @@ static int ggml_compute_forward(struct ggml_compute_params * params, struct ggml
             {
                 iqk_mul_multi_add(tensor, params->ith, params->nth);
             } break;
+        case GGML_OP_HADAMARD:
+            {
+                iqk_hadamard(tensor, params->ith, params->nth);
+            } break;
         case GGML_OP_ACC:
             {
                 ggml_compute_forward_acc(params, tensor);
             } break;
@@ -23510,6 +23548,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ABORT("fatal error"); // TODO: implement
             }
+        case GGML_OP_HADAMARD:
+            {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
         case GGML_OP_CONCAT:
             {
                 GGML_ABORT("fatal error"); // TODO: implement
             }
@@ -24625,6 +24667,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_ACC:
         case GGML_OP_MULTI_ADD:
         case GGML_OP_MUL_MULTI_ADD:
+        case GGML_OP_HADAMARD:
             {
                 n_tasks = n_threads;
             } break;
diff --git a/ggml/src/iqk/iqk_common.h b/ggml/src/iqk/iqk_common.h
index 474db15e..2f677853 100644
--- a/ggml/src/iqk/iqk_common.h
+++ b/ggml/src/iqk/iqk_common.h
@@ -922,3 +922,22 @@ static IQK_ALWAYS_INLINE void prepare_iq4_nl_quants_r8(const int8x16_t& values,
 #endif
 
 #endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#include
+#include
+#include
+#include
+#include
+inline int popcount(uint8_t x) { return __popcnt(x); }
+inline int popcount(uint16_t x) { return __popcnt(x); }
+inline int popcount(uint32_t x) { return __popcnt(x); }
+inline int popcount(uint64_t x) { return _mm_popcnt_u64(x); }
+#else
+constexpr int popcount(uint8_t x) { return __builtin_popcount(x); }
+constexpr int popcount(uint16_t x) { return __builtin_popcount(x); }
+constexpr int popcount(uint32_t x) { return __builtin_popcount(x); }
+constexpr int popcount(uint64_t x) { return __builtin_popcountll(x); }
+#endif
+
diff --git a/ggml/src/iqk/iqk_cpu_ops.cpp b/ggml/src/iqk/iqk_cpu_ops.cpp
index ab8590d7..9a20e0c8 100644
--- a/ggml/src/iqk/iqk_cpu_ops.cpp
+++ b/ggml/src/iqk/iqk_cpu_ops.cpp
@@ -8,6 +8,7 @@
 
 #include "iqk_cpu_ops.h"
 #include "iqk_utils.h"
+#include "iqk_common.h"
 #include "ggml.h"
 
 #include
@@ -454,3 +455,51 @@ void iqk_mul_multi_add(struct ggml_tensor * dst, int ith, int nth) {
         }
     }
 }
+
+namespace {
+template <typename T>
+void fast_ht(int n, T * values) {
+    constexpr float ksqrt2 = 0.707106781f;
+    float scale = 1;
+    for (int h = 1; h < n; h <<= 1) {
+        for (int i = 0; i < n; i += 2*h) {
+            for (int j = i; j < i + h; ++j) {
+                T x = values[j], y = values[j + h];
+                values[j+0] = x + y;
+                values[j+h] = x - y;
+            }
+        }
+        scale *= ksqrt2;
+    }
+    for (int i = 0; i < n; ++i) values[i] *= scale;
+}
+}
+
+void iqk_hadamard(struct ggml_tensor * dst, int ith, int nth) {
+    auto src = dst->src[0];
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_are_same_shape(src, dst));
+    int nh = dst->op_params[0];
+    GGML_ASSERT(nh > 1 && popcount(uint32_t(nh)) == 1);
+    GGML_ASSERT(dst->ne[0] % nh == 0);
+
+    int nc = dst->ne[0]/nh;
+    int nr = ggml_nrows(dst) * nc;
+
+    int npt = (nr + nth - 1)/nth;
+    int first = npt*ith;
+    int last = std::min(first + npt, nr);
+
+    for (int ir = first; ir < last; ++ir) {
+        int i3 = ir / (dst->ne[1] * dst->ne[2] * nc);
+        int i2 = (ir - i3*dst->ne[1] * dst->ne[2] * nc)/(dst->ne[1] * nc);
+        int i1 = (ir - i3*dst->ne[1] * dst->ne[2] * nc - i2*dst->ne[1]*nc)/nc;
+        int ic = (ir - i3*dst->ne[1] * dst->ne[2] * nc - i2*dst->ne[1]*nc - i1*nc);
+
+        auto x = (const float *)((const char *)src->data + i3*src->nb[3] + i2*src->nb[2] + i1*src->nb[1]) + ic*nh;
+        auto y = (      float *)((      char *)dst->data + i3*dst->nb[3] + i2*dst->nb[2] + i1*dst->nb[1]) + ic*nh;
+        std::memcpy(y, x, nh*sizeof(float));
+        fast_ht(nh, y);
+    }
+}
diff --git a/ggml/src/iqk/iqk_cpu_ops.h b/ggml/src/iqk/iqk_cpu_ops.h
index 833eb9a5..8656d7f1 100644
--- a/ggml/src/iqk/iqk_cpu_ops.h
+++ b/ggml/src/iqk/iqk_cpu_ops.h
@@ -28,6 +28,8 @@ void iqk_openai_experts(struct ggml_tensor * topk, struct ggml_tensor * softmax,
 
 void iqk_mul_multi_add(struct ggml_tensor * dst, int ith, int nth);
 
+void iqk_hadamard(struct ggml_tensor * dst, int ith, int nth);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index d310d134..038cb4fe 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -32,24 +32,6 @@
 #include
 #include
 
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#include
-#include
-#include
-#include
-#include
-inline int popcount(uint8_t x) { return __popcnt(x); }
-inline int popcount(uint16_t x) { return __popcnt(x); }
-inline int popcount(uint32_t x) { return __popcnt(x); }
-inline int popcount(uint64_t x) { return _mm_popcnt_u64(x); }
-#else
-constexpr int popcount(uint8_t x) { return __builtin_popcount(x); }
-constexpr int popcount(uint16_t x) { return __builtin_popcount(x); }
-constexpr int popcount(uint32_t x) { return __builtin_popcount(x); }
-constexpr int popcount(uint64_t x) { return __builtin_popcountll(x); }
-#endif
-
 namespace {
 
 inline int nearest_int(float fval) {
diff --git a/include/llama.h b/include/llama.h
index 3c9b331c..b425bc87 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -440,6 +440,7 @@ extern "C" {
         int min_experts;
         float thresh_experts;
         bool only_active_experts;
+        bool k_cache_hadamard; // if true, apply Hadamard transform to the K-cache
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index f6f70394..7809d855 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -52,6 +52,7 @@ llm_build_context::llm_build_context(
     fused_up_gate (cparams.fused_up_gate),
     fused_mmad (cparams.fused_mmad),
     rope_cache (cparams.rope_cache),
+    k_cache_hadamard (cparams.k_cache_hadamard),
     min_experts (cparams.min_experts),
     thresh_experts (cparams.thresh_experts),
     pooling_type (cparams.pooling_type),
@@ -1466,6 +1467,13 @@ ggml_tensor * llm_build_context::llm_build_kv(
     const llama_hparams & hparams = lctx.model.hparams;
     const llama_cparams & cparams = lctx.cparams;
 
+    if (cparams.k_cache_hadamard) {
+        q_cur = ggml_hadamard(ctx, q_cur, hparams.n_embd_head_k);
+        k_cur = ggml_hadamard(ctx, k_cur, hparams.n_embd_head_k);
+        cb(q_cur, "Qcur_hadamard", il);
+        cb(k_cur, "Kcur_hadamard", il);
+    }
+
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
     ggml_build_forward_expand(graph, q_cur);
@@ -9375,6 +9383,12 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
         Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
         cb(Qcur, "Qcur_temp_scaled", il_cb);
     }
+    if (cparams.k_cache_hadamard) {
+        Qcur = ggml_hadamard(ctx0, Qcur, hparams.n_embd_head_k);
+        Kcur = ggml_hadamard(ctx0, Kcur, hparams.n_embd_head_k);
+        cb(Qcur, "Qcur_hadamard", il_cb);
+        cb(Kcur, "Kcur_hadamard", il_cb);
+    }
     ggml_build_forward_expand(gf, Qcur);
     ggml_build_forward_expand(gf, Kcur);
     ggml_build_forward_expand(gf, Vcur);
diff --git a/src/llama-build-context.h b/src/llama-build-context.h
index cbf12817..c5a27654 100644
--- a/src/llama-build-context.h
+++ b/src/llama-build-context.h
@@ -82,6 +82,7 @@ struct llm_build_context {
     const bool fused_up_gate;
     const bool fused_mmad;
     const bool rope_cache;
+    const bool k_cache_hadamard;
 
     const int min_experts;
     const float thresh_experts;
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 0d118369..d911378d 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -39,6 +39,7 @@ struct llama_cparams {
     bool fused_mmad;
     bool rope_cache;
    bool graph_reuse;
+    bool k_cache_hadamard;
 
     int min_experts;
     float thresh_experts;
diff --git a/src/llama.cpp b/src/llama.cpp
index 2124b180..21870df5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4048,6 +4048,7 @@ struct llama_context_params llama_context_default_params() {
        /*.min_experts =*/ -1,
        /*.thtesh_experts =*/ 0.0f,
        /*.only_active_experts =*/ false,
+       /*.k_cache_hadamard =*/ false,
        /*.abort_callback =*/ nullptr,
        /*.abort_callback_data =*/ nullptr,
        /*.offload_policy =*/ nullptr,
@@ -4297,6 +4298,11 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
+    if (params.k_cache_hadamard && !ggml_is_quantized(params.type_k)) {
+        LLAMA_LOG_WARN("%s: there is no point in the Hadamard transform with a non-quantized K-cache. Turning it off\n", __func__);
+        params.k_cache_hadamard = false;
+    }
+
     llama_context * ctx = new llama_context(*model);
 
     // add devices to ctx->cparams from model
@@ -4330,6 +4336,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.fused_mmad = params.fused_mmad;
     cparams.rope_cache = params.rope_cache;
     cparams.graph_reuse = params.graph_reuse;
+    cparams.k_cache_hadamard = params.k_cache_hadamard;
     cparams.min_experts = params.min_experts;
     cparams.thresh_experts = params.thresh_experts;
     cparams.cuda_params = params.cuda_params;
@@ -4417,6 +4424,7 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: fused_mmad = %d\n", __func__, cparams.fused_mmad);
     LLAMA_LOG_INFO("%s: rope_cache = %d\n", __func__, cparams.rope_cache);
     LLAMA_LOG_INFO("%s: graph_reuse = %d\n", __func__, cparams.graph_reuse);
+    LLAMA_LOG_INFO("%s: k_cache_hadam = %d\n", __func__, cparams.k_cache_hadamard);
     LLAMA_LOG_INFO("%s: ser = %d, %g\n", __func__, cparams.min_experts, cparams.thresh_experts);
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
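Why the patch applies the transform to both Q and K: every butterfly stage in fast_ht() is scaled by 1/sqrt(2), so the per-head Hadamard transform is orthonormal and leaves the per-head dot products Q.K (and therefore the attention scores) unchanged, while the idea is that it spreads K outliers across the head dimension so a quantized K-cache loses less precision. The standalone sketch below (not part of the patch) demonstrates the invariance using the same butterfly kernel as iqk_cpu_ops.cpp; the head size of 128 and the random test vectors are purely illustrative choices.

// Standalone sketch: the normalized Walsh-Hadamard transform preserves dot
// products, so transforming both Q and K per head leaves Q.K unchanged.
#include <cstdio>
#include <random>
#include <vector>

// In-place Walsh-Hadamard transform of n values (n must be a power of 2),
// normalized by 1/sqrt(n) so the transform is orthonormal.
static void fast_ht(int n, float * values) {
    constexpr float ksqrt2 = 0.707106781f;
    float scale = 1;
    for (int h = 1; h < n; h <<= 1) {
        for (int i = 0; i < n; i += 2*h) {
            for (int j = i; j < i + h; ++j) {
                float x = values[j], y = values[j + h];
                values[j+0] = x + y;
                values[j+h] = x - y;
            }
        }
        scale *= ksqrt2;
    }
    for (int i = 0; i < n; ++i) values[i] *= scale;
}

int main() {
    const int n = 128; // example head size (n_embd_head_k); must be a power of 2
    std::mt19937 rng(1234);
    std::normal_distribution<float> dist(0.f, 1.f);

    std::vector<float> q(n), k(n);
    for (int i = 0; i < n; ++i) { q[i] = dist(rng); k[i] = dist(rng); }

    double dot_before = 0;
    for (int i = 0; i < n; ++i) dot_before += (double)q[i]*k[i];

    // apply the same orthonormal transform to both vectors
    fast_ht(n, q.data());
    fast_ht(n, k.data());

    double dot_after = 0;
    for (int i = 0; i < n; ++i) dot_after += (double)q[i]*k[i];

    // the two dot products agree up to floating point rounding
    printf("q.k before: %g  after: %g\n", dot_before, dot_after);
    return 0;
}

Since the transform only pays off when K is actually quantized, the new -khad / --k-cache-hadamard flag is ignored (with a warning from llama_new_context_with_model) unless cache_type_k is set to a quantized type.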