From 026adac30ddefee19f6dbecd9635f2612e4ca2a7 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 15 Oct 2024 15:07:30 +0300 Subject: [PATCH] iq4_kss: CUDA works TG-128 performance is very decent with 131 t/s for LLaMA-3.1-8B. In comparison, we have 123 t/s for q4_0 and 128 t/s for iq4_ks. I.e., the reduced model size more than offsets the additional bit fiddling required for iq4_kss. --- ggml/src/ggml-cuda/iqk_mmvq.cu | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu index cc7dc409..d012610c 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq.cu +++ b/ggml/src/ggml-cuda/iqk_mmvq.cu @@ -249,22 +249,22 @@ __device__ __forceinline__ float vec_dot_iq4_kss_q8_1( const block_iq4_kss * bq4 = (const block_iq4_kss *)((const char *)vbq + sizeof(float)) + kbx; const uint8_t * all_values = (const uint8_t *)iq4k_values; - // TODO - return 0.f; - - //// iqs is 0...28 - //const int ib32 = iqs/4; // Why iqs/4 ? - //const int32_t * q8 = (const int *)bq8_1[ib32].qs; - //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; - //const float dl = scale * ((bq4->scales[ib32] & 254) - 127); - //int v1, v2; - //int sumi = 0; - //for (int j = 0; j < 4; ++j) { - // get_int_from_table_16_shift(q4[j], bq4->scales[ib32] & 1, all_values, v1, v2); - // sumi = ggml_cuda_dp4a(v1, q8[j+0], sumi); - // sumi = ggml_cuda_dp4a(v2, q8[j+4], sumi); - //} - //return dl * __low2float(bq8_1[ib32].ds) * sumi; + // iqs is 0...28 + const int ib32 = iqs/4; // Why iqs/4 ? + const int32_t * q8 = (const int *)bq8_1[ib32].qs; + const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; + const uint8_t ls = (q4[0] >> 30) | ((q4[1] >> 28) & 0x0c) | ((q4[2] >> 26) & 0x30) | ((q4[3] >> 24) & 0xc0); + const float dl = scale * ((ls & 254) - 127); + int v1, v2; + int sumi = 0; + for (int j = 0; j < 4; ++j) { + uint32_t aux32 = (q4[j] & 0x00007fff) | ((q4[j] << 1) & 0x7fff0000); + aux32 ^= (aux32 << 1); + get_int_from_table_16_shift(aux32, ls & 1, all_values, v1, v2); + sumi = ggml_cuda_dp4a(v1, q8[j+0], sumi); + sumi = ggml_cuda_dp4a(v2, q8[j+4], sumi); + } + return dl * __low2float(bq8_1[ib32].ds) * sumi; } #define VDR_IQ5_K_Q8_1_MMVQ 4