Improve token generation (TG) speed when not memory bound

2026-02-25 15:44:10 +00:00 · 2024-09-04 19:37:05 +03:00
parent a4c55558d3
commit 8d47523e7e
1 changed file with 9 additions and 1 deletion
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -3336,7 +3336,7 @@ IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0,
 }
 template <int nrc_y>
 void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
-    constexpr int k_nx = 5;
+    constexpr int k_nx = nrc_y <= 2 ? 8 : 5;
    const char * cx = (const char *)vx;
    for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
        mul_mat_Qx_Qy_MxN<nrc_y, k_nx>(n, cx, bx, ix*k_nx, info);
@@ -3344,6 +3344,14 @@ void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, in
    int last_x = k_nx*(nrc_x/k_nx);
    if (last_x == nrc_x) return;
    int nx = nrc_x - last_x;
+    if constexpr (nrc_y <= 2) {
+        if (nx >= 4) {
+            mul_mat_Qx_Qy_MxN<nrc_y, 4>(n, cx, bx, last_x, info);
+            last_x += 4;
+            if (last_x == nrc_x) return;
+            nx = nrc_x - last_x;
+        }
+    }
    switch (nx) {
        case 1: mul_mat_Qx_Qy_MxN<nrc_y, 1>(n, cx, bx, last_x, info); break;
        case 2: mul_mat_Qx_Qy_MxN<nrc_y, 2>(n, cx, bx, last_x, info); break;