mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-23 06:34:13 +00:00
Move scale fudge factors to quantization (#81)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -705,7 +705,7 @@ static __global__ void dequantize_block_iq2_k(const void * __restrict__ vx, dst_
|
||||
int ib128 = tid/16; // 0 or 1
|
||||
int il = tid%16; // 0...15
|
||||
dst_t * y = yy + i*QK_K + 128*ib128 + 2*il;
|
||||
- const float d = (float)x[i].d * 1.025f; //1.0325f;
|
||||
+ const float d = (float)x[i].d;
|
||||
const float dl1 = d * (2*((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 15);
|
||||
const float dl2 = d * (2*((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 15);
|
||||
const float dl3 = d * (2*((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 15);
|
||||
@@ -730,7 +730,7 @@ static __global__ void dequantize_block_iq3_k(const void * __restrict__ vx, dst_
|
||||
int ib128 = tid/16; // 0 or 1
|
||||
int il = tid%16; // 0...15
|
||||
dst_t * y = yy + i*QK_K + 128*ib128 + 2*il;
|
||||
- const float d = (float)x[i].d * 1.01f; //1.0125f;
|
||||
+ const float d = (float)x[i].d;
|
||||
const uint16_t sh = x[i].scales_h >> (8*ib128 + (il/8));
|
||||
const float dl1 = d * ((2*((x[i].scales_l[4*ib128+0] >> 4*(il/8)) & 0xf) + 1) * ((sh & 0x01) ? -1 : 1));
|
||||
const float dl2 = d * ((2*((x[i].scales_l[4*ib128+1] >> 4*(il/8)) & 0xf) + 1) * ((sh & 0x04) ? -1 : 1));
|
||||
|
||||
@@ -589,7 +589,6 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
|
||||
if (!max_abs_scale) continue;
|
||||
|
||||
float d = max_abs_scale/15;
|
||||
- y[ibl].d = GGML_FP32_TO_FP16(d);
|
||||
y[ibl].extra = extra;
|
||||
float id = 1/d;
|
||||
|
||||
@@ -624,7 +623,7 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
|
||||
}
|
||||
}
|
||||
}
|
||||
- if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(sumqx/sumq2);
|
||||
+ y[ibl].d = GGML_FP32_TO_FP16(1.025f*(sumq2 > 0 ? sumqx/sumq2 : d));
|
||||
|
||||
}
|
||||
}
|
||||
@@ -854,7 +853,6 @@ static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, c
|
||||
if (!max_abs_scale) continue;
|
||||
|
||||
float d = max_abs_scale/31;
|
||||
- y[ibl].d = GGML_FP32_TO_FP16(d);
|
||||
y[ibl].extra = extra;
|
||||
float id = 1/d;
|
||||
|
||||
@@ -892,7 +890,7 @@ static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, c
|
||||
}
|
||||
}
|
||||
}
|
||||
- if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(sumqx/sumq2);
|
||||
+ y[ibl].d = GGML_FP32_TO_FP16(1.01f*(sumq2 > 0 ? sumqx/sumq2 : d));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user