Trying @louiehelm's multiplier

This commit is contained in:
Iwan Kawrakow
2025-06-13 19:38:11 +03:00
parent 57e882fd84
commit de4e6c797f
4 changed files with 17 additions and 21 deletions

View File

@@ -341,9 +341,8 @@ inline __device__ int nearest_int(float fval) {
}
int __device__ __forceinline__ trellis_next_int(uint32_t& val) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
val = ka*val + kb;
constexpr uint32_t ka = 0xCBAC1FED;
val = ka*val;
return ggml_cuda_dp4a(val & 0x3f3f3f3f, 0x01010101, -126);
}

View File

@@ -436,8 +436,7 @@ __device__ __forceinline__ void vec_dot_iq4_ks_q8_1(
__device__ __forceinline__ void vec_dot_iq4_kt_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs, float * result) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t ka = 0xCBAC1FED;
constexpr uint32_t km = 0x3f3f3f3f;
float scale = *(const float *)vbq;
@@ -461,7 +460,7 @@ __device__ __forceinline__ void vec_dot_iq4_kt_q8_1(
uint32_t val = ql[j] + ((qh[j] << shift1) & 0xf00) + ((sh & 7) << 12) + idx0;
int v4 = 0;
for (int k = 0; k < 4; ++k) {
val = ka*val + kb;
val *= ka;
//int s = val & km;
//sumi += q8[4*j+k] * ggml_cuda_dp4a(s, 0x01010101, -126);
v4 |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
@@ -474,8 +473,7 @@ __device__ __forceinline__ void vec_dot_iq4_kt_q8_1(
__device__ __forceinline__ void vec_dot_iq2_kt_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs, float * result) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t ka = 0xCBAC1FED;
constexpr uint32_t km = 0x3f3f3f3f;
float scale = *(const float *)vbq;
@@ -492,13 +490,13 @@ __device__ __forceinline__ void vec_dot_iq2_kt_q8_1(
uint32_t val = ql[4*ib32+j] + 4096;
int v4 = 0;
for (int k = 0; k < 4; ++k) {
val = ka*val + kb;
val *= ka;
v4 |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
}
sumi = ggml_cuda_dp4a(v4, q8[2*j+0], sumi);
v4 = 0;
for (int k = 0; k < 4; ++k) {
val = ka*val + kb;
val *= ka;
v4 |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
}
sumi = ggml_cuda_dp4a(v4, q8[2*j+1], sumi);

View File

@@ -2799,8 +2799,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_kt(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t ka = 0xCBAC1FED;
constexpr uint32_t km = 0x3f3f3f3f;
#ifdef INT8_MMA_AVAILABLE
@@ -2835,8 +2834,8 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
uint32_t val2 = offset + ql[8*ib32+2*j+1] + ((qh[8*(ib32%4)+2*j+1] << (8 - 4*(ib32/4))) & 0xf00) + ((sh & 56) << 9);
int2 v = {0, 0};
for (int k = 0; k < 4; ++k) {
val1 = ka*val1 + kb;
val2 = ka*val2 + kb;
val1 *= ka;
val2 *= ka;
v.x |= (ggml_cuda_dp4a(val1 & km, 0x01010101, -126) & 0xff) << 8*k;
v.y |= (ggml_cuda_dp4a(val2 & km, 0x01010101, -126) & 0xff) << 8*k;
}
@@ -2872,8 +2871,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_kt(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t ka = 0xCBAC1FED;
constexpr uint32_t km = 0x3f3f3f3f;
#ifdef INT8_MMA_AVAILABLE
@@ -2903,11 +2901,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
uint32_t val = ql[4*ib32+j] + 4096;
int2 v = {0, 0};
for (int k = 0; k < 4; ++k) {
val = ka*val + kb;
val *= ka;
v.x |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
}
for (int k = 0; k < 4; ++k) {
val = ka*val + kb;
val *= ka;
v.y |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
}
#ifdef INT8_MMA_AVAILABLE

View File

@@ -7419,18 +7419,19 @@ public:
inline float find_best_inverse_scale(const float * xb, const float * weight, const int * best_idx) const;
static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
uint32_t x = i + offset;
if constexpr (is_int) {
constexpr uint32_t ka = 0xCBAC1FED;
uint32_t s;
auto i8 = (const int8_t *)&s;
for (int k = 0; k < kGroupSize; ++k) {
x = ka*x + kb;
x = ka*x;
s = x & 0x3f3f3f3f;
result[k] = scale*(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
}
} else {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t kmask = 0x8fff8fff;
constexpr uint32_t km32 = 0x3b603b60;
for (int k = 0; k < kGroupSize; ++k) {