iq2_ks: CUDA works

This commit is contained in:
Iwan Kawrakow
2024-10-12 12:42:41 +03:00
parent 103c8c053a
commit aa36d90684

View File

@@ -239,18 +239,6 @@ __device__ __forceinline__ float vec_dot_iq4_ks_q8_1(
return dl * __low2float(bq8_1[ib32].ds) * sumi;
}
#define VDR_IQ2_KS_Q8_1_MMVQ 4
#define VDR_IQ2_KS_Q8_1_MMQ 4
// Placeholder iq2_ks x q8_1 dot product: unpacks nothing yet and always
// returns 0 (see the TODO below). `scale` and `bq2` are set up the same way
// the real implementation will use them: the iq2_ks row data is prefixed by
// a single float scale, followed by the array of block_iq2_ks blocks.
__device__ __forceinline__ float vec_dot_iq2_ks_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
// Row-level scale stored as a float immediately before the quantized blocks.
float scale = *(const float *)vbq;
// Block kbx of the quantized row, located after the float prefix.
const block_iq2_ks * bq2 = (const block_iq2_ks *)((const char *)vbq + sizeof(float)) + kbx;
// TODO
return 0.f;
}
#define VDR_IQ5_K_Q8_1_MMVQ 4
#define VDR_IQ5_K_Q8_1_MMQ 4
@@ -466,6 +454,65 @@ __device__ __forceinline__ float vec_dot_iq2_k_q8_1(
}
// Number of q8_1 ints handled per vec_dot call in the mat-vec (MMVQ) and
// mat-mat (MMQ) kernels for the IQ2_KS type.
#define VDR_IQ2_KS_Q8_1_MMVQ 4
#define VDR_IQ2_KS_Q8_1_MMQ 4
// Dot product of one iq2_ks quantized block with q8_1 activation blocks.
//
// Layout assumed by this code (as used below):
//   - vbq points at a per-row float scale, followed by the block_iq2_ks array;
//     kbx selects the block within that array.
//   - bq2->qs packs 2-bit quant indices: for each 16-bit word, bit pairs at
//     shifts 0/2/4/6 belong to four different 32-element sub-blocks.
//   - bq2->scales packs 4-bit scales (two per byte); bq2->extra supplies both
//     a 5th scale bit and a per-sub-block selector between two halves of
//     iq2k_table (the `(extra & ...) << k` offsets of 256 ints each).
// iqs (0..31 in steps of 4) selects which pair of 32-bit q8 chunks of the
// four consecutive q8_1 blocks is processed.
__device__ __forceinline__ float vec_dot_iq2_ks_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
// Extra per-row scale stored as a float prefix before the block array.
float scale = *(const float *)vbq;
const block_iq2_ks * bq2 = (const block_iq2_ks *)((const char *)vbq + sizeof(float)) + kbx;
int i4 = iqs/4; // 0...7. We will process q8 blocks 4*(i4/4), 4*(i4/4)+1, 4*(i4/4)+2, 4*(i4/4)+3
// Two consecutive 32-bit chunks (8 int8 each) from each of the 4 q8_1 blocks.
const int32_t * q8_1 = (const int *)bq8_1[4*(i4/4)+0].qs + 2*(i4%4);
const int32_t * q8_2 = (const int *)bq8_1[4*(i4/4)+1].qs + 2*(i4%4);
const int32_t * q8_3 = (const int *)bq8_1[4*(i4/4)+2].qs + 2*(i4%4);
const int32_t * q8_4 = (const int *)bq8_1[4*(i4/4)+3].qs + 2*(i4%4);
// Four 16-bit words holding the interleaved 2-bit quants for this position.
const uint16_t * q2 = (const uint16_t *)bq2->qs + 16*(i4/4) + 4*(i4%4);
// Shift extra so bits 0..3 select the lookup-table half for the 4 sub-blocks
// and bits 4..7 provide the 5th scale bit for each of them.
const uint16_t extra = bq2->extra >> 4*(i4/4);
const int * all_values = (const int *)iq2k_table;
const int * values;
// Pack the four 16-bit words into two 32-bit values for SIMD-style masking.
uint32_t val1 = q2[0] | (q2[1] << 16), val2 = q2[2] | (q2[3] << 16);
uint32_t aux32[2];
const uint8_t * a8 = (const uint8_t *)&aux32;
int v1, v2;
int8_t s8[4];
// Decode the four sub-block scales: low/high nibble plus a 5th bit taken
// from `extra`, then recentered by -16.
s8[0] = ((bq2->scales[2*(i4/4)+0] & 0xf) | ((extra >> 4) & 0x10)) - 16;
s8[1] = ((bq2->scales[2*(i4/4)+0] >> 4) | ((extra >> 5) & 0x10)) - 16;
s8[2] = ((bq2->scales[2*(i4/4)+1] & 0xf) | ((extra >> 6) & 0x10)) - 16;
s8[3] = ((bq2->scales[2*(i4/4)+1] >> 4) | ((extra >> 7) & 0x10)) - 16;
// Sub-block 0: 2-bit indices at shift 0; extra bit 0 picks the table half
// (offset 0 or 256 ints). int_from_table_4 maps 4 indices to 4 packed int8,
// which are then dp4a-accumulated against the q8 data and scaled by s8[0].
aux32[0] = ((val1 >> 0) & 0x03030303); aux32[1] = ((val2 >> 0) & 0x03030303); values = all_values + ((extra & 0x01) << 8);
v1 = int_from_table_4(a8 + 0, values);
v2 = int_from_table_4(a8 + 4, values);
int sumi1 = ggml_cuda_dp4a(v2, q8_1[1], ggml_cuda_dp4a(v1, q8_1[0], 0)) * s8[0];
// Sub-block 1: same, with indices at shift 2 and extra bit 1.
aux32[0] = ((val1 >> 2) & 0x03030303); aux32[1] = ((val2 >> 2) & 0x03030303); values = all_values + ((extra & 0x02) << 7);
v1 = int_from_table_4(a8 + 0, values);
v2 = int_from_table_4(a8 + 4, values);
int sumi2 = ggml_cuda_dp4a(v2, q8_2[1], ggml_cuda_dp4a(v1, q8_2[0], 0)) * s8[1];
// Sub-block 2: indices at shift 4, extra bit 2.
aux32[0] = ((val1 >> 4) & 0x03030303); aux32[1] = ((val2 >> 4) & 0x03030303); values = all_values + ((extra & 0x04) << 6);
v1 = int_from_table_4(a8 + 0, values);
v2 = int_from_table_4(a8 + 4, values);
int sumi3 = ggml_cuda_dp4a(v2, q8_3[1], ggml_cuda_dp4a(v1, q8_3[0], 0)) * s8[2];
// Sub-block 3: indices at shift 6, extra bit 3.
aux32[0] = ((val1 >> 6) & 0x03030303); aux32[1] = ((val2 >> 6) & 0x03030303); values = all_values + ((extra & 0x08) << 5);
v1 = int_from_table_4(a8 + 0, values);
v2 = int_from_table_4(a8 + 4, values);
int sumi4 = ggml_cuda_dp4a(v2, q8_4[1], ggml_cuda_dp4a(v1, q8_4[0], 0)) * s8[3];
// Combine: each integer sum is weighted by its q8_1 block's d (low half of
// ds), then everything by the per-row float scale.
return scale * (__low2float(bq8_1[4*(i4/4)+0].ds) * sumi1
+ __low2float(bq8_1[4*(i4/4)+1].ds) * sumi2
+ __low2float(bq8_1[4*(i4/4)+2].ds) * sumi3
+ __low2float(bq8_1[4*(i4/4)+3].ds) * sumi4);
}
#define VDR_IQ3_K_Q8_1_MMVQ 4
#define VDR_IQ3_K_Q8_1_MMQ 4
@@ -660,7 +707,7 @@ void mul_mat_vec_iq2_ks_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
// Launch the IQ2_KS mat-vec kernel with the matching type tag, MMVQ width,
// and the iq2_ks dot product defined above. The previous version of this
// body additionally invoked the kernel with the IQ4_KS type tag and
// VDR_IQ4_KS_Q8_1_MMVQ — a copy-paste leftover that dispatched the wrong
// dequantization layout for this type; only the IQ2_KS dispatch is kept.
iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ2_KS, VDR_IQ2_KS_Q8_1_MMVQ, vec_dot_iq2_ks_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
void mul_mat_vec_iq5_k_q8_1_cuda(