Trying @louiehelm's multiplier

2026-02-24 07:04:11 +00:00 · 2025-06-13 19:38:11 +03:00
parent 57e882fd84
commit de4e6c797f
4 changed files with 17 additions and 21 deletions
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -341,9 +341,8 @@ inline __device__ int nearest_int(float fval) {
 }

 int __device__ __forceinline__ trellis_next_int(uint32_t& val) {
-    constexpr uint32_t ka = 89226354;
-    constexpr uint32_t kb = 64248484;
-    val = ka*val + kb;
+    constexpr uint32_t ka = 0xCBAC1FED;
+    val = ka*val;
    return ggml_cuda_dp4a(val & 0x3f3f3f3f, 0x01010101, -126);
 }

--- a/ggml/src/ggml-cuda/iqk_mmvq.cu
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cu
@@ -436,8 +436,7 @@ __device__ __forceinline__ void vec_dot_iq4_ks_q8_1(
 __device__ __forceinline__ void vec_dot_iq4_kt_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs, float * result) {

-    constexpr uint32_t ka = 89226354;
-    constexpr uint32_t kb = 64248484;
+    constexpr uint32_t ka = 0xCBAC1FED;
    constexpr uint32_t km = 0x3f3f3f3f;

    float scale = *(const float *)vbq;
@@ -461,7 +460,7 @@ __device__ __forceinline__ void vec_dot_iq4_kt_q8_1(
        uint32_t val = ql[j] + ((qh[j] << shift1) & 0xf00) + ((sh & 7) << 12) + idx0;
        int v4 = 0;
        for (int k = 0; k < 4; ++k) {
-            val = ka*val + kb;
+            val *= ka;
            //int s = val & km;
            //sumi += q8[4*j+k] * ggml_cuda_dp4a(s, 0x01010101, -126);
            v4 |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
@@ -474,8 +473,7 @@ __device__ __forceinline__ void vec_dot_iq4_kt_q8_1(
 __device__ __forceinline__ void vec_dot_iq2_kt_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs, float * result) {

-    constexpr uint32_t ka = 89226354;
-    constexpr uint32_t kb = 64248484;
+    constexpr uint32_t ka = 0xCBAC1FED;
    constexpr uint32_t km = 0x3f3f3f3f;

    float scale = *(const float *)vbq;
@@ -492,13 +490,13 @@ __device__ __forceinline__ void vec_dot_iq2_kt_q8_1(
        uint32_t val = ql[4*ib32+j] + 4096;
        int v4 = 0;
        for (int k = 0; k < 4; ++k) {
-            val = ka*val + kb;
+            val *= ka;
            v4 |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
        }
        sumi = ggml_cuda_dp4a(v4, q8[2*j+0], sumi);
        v4 = 0;
        for (int k = 0; k < 4; ++k) {
-            val = ka*val + kb;
+            val *= ka;
            v4 |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
        }
        sumi = ggml_cuda_dp4a(v4, q8[2*j+1], sumi);
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2799,8 +2799,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_kt(
    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {

-    constexpr uint32_t ka = 89226354;
-    constexpr uint32_t kb = 64248484;
+    constexpr uint32_t ka = 0xCBAC1FED;
    constexpr uint32_t km = 0x3f3f3f3f;

 #ifdef INT8_MMA_AVAILABLE
@@ -2835,8 +2834,8 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
        uint32_t val2 = offset + ql[8*ib32+2*j+1] + ((qh[8*(ib32%4)+2*j+1] << (8 - 4*(ib32/4))) & 0xf00) + ((sh & 56) << 9);
        int2 v = {0, 0};
        for (int k = 0; k < 4; ++k) {
-            val1 = ka*val1 + kb;
-            val2 = ka*val2 + kb;
+            val1 *= ka;
+            val2 *= ka;
            v.x |= (ggml_cuda_dp4a(val1 & km, 0x01010101, -126) & 0xff) << 8*k;
            v.y |= (ggml_cuda_dp4a(val2 & km, 0x01010101, -126) & 0xff) << 8*k;
        }
@@ -2872,8 +2871,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_kt(
    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {

-    constexpr uint32_t ka = 89226354;
-    constexpr uint32_t kb = 64248484;
+    constexpr uint32_t ka = 0xCBAC1FED;
    constexpr uint32_t km = 0x3f3f3f3f;

 #ifdef INT8_MMA_AVAILABLE
@@ -2903,11 +2901,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
        uint32_t val = ql[4*ib32+j] + 4096;
        int2 v = {0, 0};
        for (int k = 0; k < 4; ++k) {
-            val = ka*val + kb;
+            val *= ka;
            v.x |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
        }
        for (int k = 0; k < 4; ++k) {
-            val = ka*val + kb;
+            val *= ka;
            v.y |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
        }
 #ifdef INT8_MMA_AVAILABLE
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -7419,18 +7419,19 @@ public:
    inline float find_best_inverse_scale(const float * xb, const float * weight, const int * best_idx) const;

    static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
-        constexpr uint32_t ka = 89226354;
-        constexpr uint32_t kb = 64248484;
        uint32_t x = i + offset;
        if constexpr (is_int) {
+            constexpr uint32_t ka = 0xCBAC1FED;
            uint32_t s;
            auto i8 = (const int8_t *)&s;
            for (int k = 0; k < kGroupSize; ++k) {
-                x = ka*x + kb;
+                x = ka*x;
                s = x & 0x3f3f3f3f;
                result[k] = scale*(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
            }
        } else {
+            constexpr uint32_t ka = 89226354;
+            constexpr uint32_t kb = 64248484;
            constexpr uint32_t kmask = 0x8fff8fff;
            constexpr uint32_t km32 = 0x3b603b60;
            for (int k = 0; k < kGroupSize; ++k) {