From 5e969613e49fe18bf8ab97a0fd1ced3c3f8f03dc Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sat, 26 Oct 2024 17:07:54 +0300 Subject: [PATCH] iq2_bn(CUDA): quants are not 4-byte aligned --- ggml/src/ggml-cuda/iqk_mmvq.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu index 26c5bfee..36dbb52a 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq.cu +++ b/ggml/src/ggml-cuda/iqk_mmvq.cu @@ -694,13 +694,13 @@ __device__ __forceinline__ float vec_dot_iq2_bn_q8_1( // iqs is 0 or 1 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - auto qs = (const uint16_t *)bq2->qs + 4*iqs; + auto qs = (const int *)bq2->qs + 2*iqs; auto q8l = (const int *)bq8_1[0].qs + 2*iqs; auto q8h = (const int *)bq8_1[1].qs + 2*iqs; int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0; for (int j = 0; j < 2; ++j) { - int vl = qs[2*j+0] | (uint32_t(qs[2*j+1]) << 16); - int vh = vl >> 4; + int vl = qs[j]; + int vh = qs[j] >> 4; sumi1 = __dp4a(vl & 0x03030303, q8l[j+0], sumi1); sumi2 = __dp4a(vl & 0x0c0c0c0c, q8l[j+4], sumi2); sumi3 = __dp4a(vh & 0x03030303, q8h[j+0], sumi3); @@ -708,6 +708,7 @@ __device__ __forceinline__ float vec_dot_iq2_bn_q8_1( } auto d8l = __half22float2(bq8_1[0].ds); auto d8h = __half22float2(bq8_1[1].ds); + return scale * (d8l.x * (sumi1 + 0.25f*sumi2) + d8h.x * (sumi3 + 0.25f * sumi4) - 0.5f*d8l.y - 0.5f*d8h.y); #else int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0; auto q8l = bq8_1[0].qs + 8*iqs; @@ -723,7 +724,6 @@ __device__ __forceinline__ float vec_dot_iq2_bn_q8_1( auto d8h = __half22float2(bq8_1[1].ds); return scale * (d8l.x * (sumi1 + 0.25f*sumi2) + 0.0625f * d8h.x*(sumi3 + 0.25f*sumi4) - 0.5f*d8l.y - 0.5f*d8h.y); #endif - return scale * (d8l.x * (sumi1 + 0.25f*sumi2) + d8h.x * (sumi3 + 0.25f * sumi4) - 0.5f*d8l.y - 0.5f*d8h.y); } } // namespace