mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-24 07:04:11 +00:00
Trying @louiehelm's multiplier
This commit is contained in:
@@ -341,9 +341,8 @@ inline __device__ int nearest_int(float fval) {
|
||||
}
|
||||
|
||||
int __device__ __forceinline__ trellis_next_int(uint32_t& val) {
|
||||
constexpr uint32_t ka = 89226354;
|
||||
constexpr uint32_t kb = 64248484;
|
||||
val = ka*val + kb;
|
||||
constexpr uint32_t ka = 0xCBAC1FED;
|
||||
val = ka*val;
|
||||
return ggml_cuda_dp4a(val & 0x3f3f3f3f, 0x01010101, -126);
|
||||
}
|
||||
|
||||
|
||||
@@ -436,8 +436,7 @@ __device__ __forceinline__ void vec_dot_iq4_ks_q8_1(
|
||||
__device__ __forceinline__ void vec_dot_iq4_kt_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs, float * result) {
|
||||
|
||||
constexpr uint32_t ka = 89226354;
|
||||
constexpr uint32_t kb = 64248484;
|
||||
constexpr uint32_t ka = 0xCBAC1FED;
|
||||
constexpr uint32_t km = 0x3f3f3f3f;
|
||||
|
||||
float scale = *(const float *)vbq;
|
||||
@@ -461,7 +460,7 @@ __device__ __forceinline__ void vec_dot_iq4_kt_q8_1(
|
||||
uint32_t val = ql[j] + ((qh[j] << shift1) & 0xf00) + ((sh & 7) << 12) + idx0;
|
||||
int v4 = 0;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val = ka*val + kb;
|
||||
val *= ka;
|
||||
//int s = val & km;
|
||||
//sumi += q8[4*j+k] * ggml_cuda_dp4a(s, 0x01010101, -126);
|
||||
v4 |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
@@ -474,8 +473,7 @@ __device__ __forceinline__ void vec_dot_iq4_kt_q8_1(
|
||||
__device__ __forceinline__ void vec_dot_iq2_kt_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs, float * result) {
|
||||
|
||||
constexpr uint32_t ka = 89226354;
|
||||
constexpr uint32_t kb = 64248484;
|
||||
constexpr uint32_t ka = 0xCBAC1FED;
|
||||
constexpr uint32_t km = 0x3f3f3f3f;
|
||||
|
||||
float scale = *(const float *)vbq;
|
||||
@@ -492,13 +490,13 @@ __device__ __forceinline__ void vec_dot_iq2_kt_q8_1(
|
||||
uint32_t val = ql[4*ib32+j] + 4096;
|
||||
int v4 = 0;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val = ka*val + kb;
|
||||
val *= ka;
|
||||
v4 |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
}
|
||||
sumi = ggml_cuda_dp4a(v4, q8[2*j+0], sumi);
|
||||
v4 = 0;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val = ka*val + kb;
|
||||
val *= ka;
|
||||
v4 |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
}
|
||||
sumi = ggml_cuda_dp4a(v4, q8[2*j+1], sumi);
|
||||
|
||||
@@ -2799,8 +2799,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_kt(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
|
||||
|
||||
constexpr uint32_t ka = 89226354;
|
||||
constexpr uint32_t kb = 64248484;
|
||||
constexpr uint32_t ka = 0xCBAC1FED;
|
||||
constexpr uint32_t km = 0x3f3f3f3f;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
@@ -2835,8 +2834,8 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
uint32_t val2 = offset + ql[8*ib32+2*j+1] + ((qh[8*(ib32%4)+2*j+1] << (8 - 4*(ib32/4))) & 0xf00) + ((sh & 56) << 9);
|
||||
int2 v = {0, 0};
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val1 = ka*val1 + kb;
|
||||
val2 = ka*val2 + kb;
|
||||
val1 *= ka;
|
||||
val2 *= ka;
|
||||
v.x |= (ggml_cuda_dp4a(val1 & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
v.y |= (ggml_cuda_dp4a(val2 & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
}
|
||||
@@ -2872,8 +2871,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_kt(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
|
||||
|
||||
constexpr uint32_t ka = 89226354;
|
||||
constexpr uint32_t kb = 64248484;
|
||||
constexpr uint32_t ka = 0xCBAC1FED;
|
||||
constexpr uint32_t km = 0x3f3f3f3f;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
@@ -2903,11 +2901,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
uint32_t val = ql[4*ib32+j] + 4096;
|
||||
int2 v = {0, 0};
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val = ka*val + kb;
|
||||
val *= ka;
|
||||
v.x |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
}
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val = ka*val + kb;
|
||||
val *= ka;
|
||||
v.y |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
}
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
|
||||
@@ -7419,18 +7419,19 @@ public:
|
||||
inline float find_best_inverse_scale(const float * xb, const float * weight, const int * best_idx) const;
|
||||
|
||||
static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
|
||||
constexpr uint32_t ka = 89226354;
|
||||
constexpr uint32_t kb = 64248484;
|
||||
uint32_t x = i + offset;
|
||||
if constexpr (is_int) {
|
||||
constexpr uint32_t ka = 0xCBAC1FED;
|
||||
uint32_t s;
|
||||
auto i8 = (const int8_t *)&s;
|
||||
for (int k = 0; k < kGroupSize; ++k) {
|
||||
x = ka*x + kb;
|
||||
x = ka*x;
|
||||
s = x & 0x3f3f3f3f;
|
||||
result[k] = scale*(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
|
||||
}
|
||||
} else {
|
||||
constexpr uint32_t ka = 89226354;
|
||||
constexpr uint32_t kb = 64248484;
|
||||
constexpr uint32_t kmask = 0x8fff8fff;
|
||||
constexpr uint32_t km32 = 0x3b603b60;
|
||||
for (int k = 0; k < kGroupSize; ++k) {
|
||||
|
||||
Reference in New Issue
Block a user