iq2_kt: faster f16 CUDA dot product

We arrive at 146 t/s (no FA), and 158 t/s (FA). This is measured for LLaMA-3.1-8B with output.weight left as f16.
2026-04-24 08:29:29 +00:00 · 2024-11-07 14:35:22 +02:00
parent 7cafafc69e
commit 7bf6e158a9
1 changed files with 7 additions and 7 deletions
--- a/ggml/src/ggml-cuda/dmmv.cu
+++ b/ggml/src/ggml-cuda/dmmv.cu
@@ -64,18 +64,18 @@ static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ v
            bdot1 += __hmul2(y[k+ 0], {h[0]+h[1], h[2]+h[3]});
            bdot2 += __hmul2(y[k+64], {h[4]+h[5], h[6]+h[7]});
 #else
-            bdot.x += y[k+ 0].x * (float)(h[0] + h[1]);
-            bdot.y += y[k+ 0].y * (float)(h[2] + h[3]);
-            bdot.x += y[k+64].x * (float)(h[4] + h[5]);
-            bdot.y += y[k+64].y * (float)(h[6] + h[7]);
+            bdot1.x += y[k+ 0].x * (float)(h[0] + h[1]);
+            bdot1.y += y[k+ 0].y * (float)(h[2] + h[3]);
+            bdot2.x += y[k+64].x * (float)(h[4] + h[5]);
+            bdot2.y += y[k+64].y * (float)(h[6] + h[7]);
 #endif
        }
 #ifdef GGML_CUDA_F16
        tmp += __hmul2(dl1, bdot1);
        tmp += __hmul2(dl2, bdot2);
 #else
-        tmp.x += dl.x * bdot.x;
-        tmp.y += dl.y * bdot.y;
+        tmp.x += dl1.x * bdot1.x + dl2.x * bdot2.x;
+        tmp.y += dl1.y * bdot1.y + dl2.y * bdot2.y;
 #endif
    }

@@ -635,7 +635,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f

 static void dequantize_mul_mat_vec_iq2_kt_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 1;
+    constexpr int ny = 2;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);