mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-28 10:21:48 +00:00
New SOTA quantization: 4.25 bpw IQ4_KS (#83)
* iq4_k_xxs: basics * WIP + adding iq3_kl quantization mix * iq4_xxs: this looks very viable compared to iq4_xs At the same 4.25 bpw PPL is always better, for some models significantly better. I'll rename to iq4_ks and keep it. * iq4_xxs: CUDA dot product We get TG-128 = 126 t/s for LLaMA-3.1-8B, compared to 123 t/s for q4_0. * iq4_xxs: scalar CPU dot product Also fix the breakage I caused with the dedicated work buffer quantization portion when the multiplication is not done via iqk_mul_mat. * iq4_xxs: Zen4 I noticed that iq4_xs is wrong on Zen4 (and possibly AVX2). Again the same mistake of packing int32_t back to int16_t, which overflows occasionally (just occasionally, that's why the result doesn't look completely wrong, so I didn't notice). * Fix iq4_xs (Zen4) * iq4_xxs: AVX2 * iq4_xxs: ARM_NEON * iq4_xxs: Metal * iq4_xxs: slightly faster TG on Metal * iq4_xxs: rename to iq4_ks After all, it is a smaller variant of iq4_k. * iq3_kl: use iq4_ks instead of iq4_k/iq4_xs --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -2166,3 +2166,256 @@ void iqk_quantize_row_q8_K(const float * x, void * vy, int64_t k) {
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
namespace {
|
||||
static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int block_size,
|
||||
int n_per_row, const float * x, char * cy,
|
||||
float * all_scales, float * weight,
|
||||
const int8_t * values,
|
||||
const float * quant_weights,
|
||||
const int ntry) {
|
||||
|
||||
//GGML_ASSERT(super_block_size == 256 && block_size == 128);
|
||||
|
||||
float * dptr = (float *)cy;
|
||||
block_iq4_ks * y = (block_iq4_ks *)(dptr + 1);
|
||||
|
||||
const int8_t * shifted_values = values + 16;
|
||||
|
||||
float amax_scale = 0;
|
||||
|
||||
for (int ibl = 0; ibl < n_per_row/super_block_size; ++ibl) {
|
||||
memset(&y[ibl], 0, sizeof(block_iq4_ks));
|
||||
const float * xbl = x + ibl*super_block_size;
|
||||
auto scales = all_scales + ibl*(super_block_size/block_size);
|
||||
float sigma2 = 0;
|
||||
for (int j = 0; j < super_block_size; ++j) sigma2 += xbl[j]*xbl[j];
|
||||
sigma2 *= 2.f/super_block_size;
|
||||
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
||||
const float * xb = xbl + ib*block_size;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + ibl*super_block_size + ib*block_size;
|
||||
for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
|
||||
} else {
|
||||
for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
|
||||
}
|
||||
float amax = 0, max = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float ax = fabsf(xb[j]);
|
||||
if (ax > amax) {
|
||||
amax = ax; max = xb[j];
|
||||
}
|
||||
}
|
||||
if (!amax) {
|
||||
scales[ib] = 0;
|
||||
continue;
|
||||
}
|
||||
float d = ntry > 0 ? -max/values[0] : max/values[0];
|
||||
float id = 1/d;
|
||||
float sumqx_p = 0, sumq2_p = 0;
|
||||
float sumqx_m = 0, sumq2_m = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float w = weight[j];
|
||||
float al = id*xb[j];
|
||||
int l = best_index_iq4nl(values, al);
|
||||
float q = values[l];
|
||||
sumqx_p += w*q*xb[j];
|
||||
sumq2_p += w*q*q;
|
||||
l = best_index_iq4nl(values, -al);
|
||||
q = values[l];
|
||||
sumqx_m += w*q*xb[j];
|
||||
sumq2_m += w*q*q;
|
||||
}
|
||||
d = sumqx_p/sumq2_p;
|
||||
bool is_shifted = false;
|
||||
float best = d*sumqx_p;
|
||||
if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) {
|
||||
d = sumqx_m/sumq2_m; best = d*sumqx_m;
|
||||
}
|
||||
for (int itry = -ntry; itry <= ntry; ++itry) {
|
||||
id = (itry + values[0])/max;
|
||||
sumqx_p = sumq2_p = 0;
|
||||
sumqx_m = sumq2_m = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float w = weight[j];
|
||||
float al = id*xb[j];
|
||||
int l = best_index_iq4nl(values, al);
|
||||
float q = values[l];
|
||||
sumqx_p += w*q*xb[j];
|
||||
sumq2_p += w*q*q;
|
||||
l = best_index_iq4nl(values, -al);
|
||||
q = values[l];
|
||||
sumqx_m += w*q*xb[j];
|
||||
sumq2_m += w*q*q;
|
||||
}
|
||||
if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) {
|
||||
d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = false;
|
||||
}
|
||||
if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) {
|
||||
d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = false;
|
||||
}
|
||||
id = (itry + shifted_values[0])/max;
|
||||
sumqx_p = sumq2_p = 0;
|
||||
sumqx_m = sumq2_m = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float w = weight[j];
|
||||
float al = id*xb[j];
|
||||
int l = best_index_iq4nl(shifted_values, al);
|
||||
float q = shifted_values[l];
|
||||
sumqx_p += w*q*xb[j];
|
||||
sumq2_p += w*q*q;
|
||||
l = best_index_iq4nl(shifted_values, -al);
|
||||
q = shifted_values[l];
|
||||
sumqx_m += w*q*xb[j];
|
||||
sumq2_m += w*q*q;
|
||||
}
|
||||
if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) {
|
||||
d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = true;
|
||||
}
|
||||
if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) {
|
||||
d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true;
|
||||
}
|
||||
}
|
||||
if (is_shifted) y[ibl].scales[ib] = 0x01;
|
||||
scales[ib] = d;
|
||||
amax_scale = std::max(amax_scale, std::abs(d));
|
||||
}
|
||||
}
|
||||
float d = amax_scale/127;
|
||||
*dptr = d;
|
||||
if (!d) return;
|
||||
float id = d ? 1/d : 0.f;
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
//float mse = 0;
|
||||
for (int ibl = 0; ibl < n_per_row/super_block_size; ++ibl) {
|
||||
const float * xbl = x + ibl*super_block_size;
|
||||
float sigma2 = 0;
|
||||
for (int j = 0; j < super_block_size; ++j) sigma2 += xbl[j]*xbl[j];
|
||||
sigma2 *= 2.f/super_block_size;
|
||||
auto scales = all_scales + (super_block_size/block_size)*ibl;
|
||||
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
||||
const int8_t * block_values = y[ibl].scales[ib] & 0x01 ? shifted_values : values;
|
||||
int l = nearest_int(0.5f*(id*scales[ib]+127.f));
|
||||
l = std::max(0, std::min(127, l)) << 1;
|
||||
//printf("d = %g, id = %g, scales = %g, l = %d, dl = %g\n", d, id, scales[ib], l, d*(l - 127));
|
||||
y[ibl].scales[ib] |= l;
|
||||
l -= 127;
|
||||
float dl = d * l;
|
||||
float idl = dl ? 1/dl : 0.f;
|
||||
const float * xb = xbl + ib*block_size;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + ibl*super_block_size + ib*block_size;
|
||||
for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
|
||||
} else {
|
||||
for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
|
||||
}
|
||||
auto qs = y[ibl].qs + ib*(block_size/2);
|
||||
for (int j = 0; j < block_size/2; ++j) {
|
||||
uint8_t i1 = best_index_iq4nl(block_values, idl*xb[j]);
|
||||
uint8_t i2 = best_index_iq4nl(block_values, idl*xb[j+block_size/2]);
|
||||
qs[j] = i1 | (i2 << 4);
|
||||
float w1 = weight[j];
|
||||
float w2 = weight[j+block_size/2];
|
||||
float q1 = block_values[i1]*l;
|
||||
float q2 = block_values[i2]*l;
|
||||
sumqx += w1*q1*xb[j] + w2*q2*xb[j+block_size/2];
|
||||
sumq2 += w1*q1*q1 + w2*q2*q2;
|
||||
//float diff = xb[j] - d*q1; mse += diff*diff;
|
||||
//diff = xb[j+block_size/2] - d*q2; mse += diff*diff;
|
||||
}
|
||||
}
|
||||
}
|
||||
//printf("rmse = %g\n", sqrt(mse/n_per_row));
|
||||
if (sumq2 > 0) *dptr = sumqx/sumq2;
|
||||
}
|
||||
}
|
||||
|
||||
// Reference entry point: quantizes a single row of `k` floats from `x` into `y`
// by forwarding to quantize_iq4_ks with one row and no importance matrix.
void quantize_row_iq4_ks_ref(const float * x, block_iq4_ks * y, int64_t k) {
    const int64_t single_row = 1;
    const float * no_imatrix = nullptr;
    quantize_iq4_ks(x, static_cast<void *>(y), single_row, k, no_imatrix);
}
|
||||
|
||||
// Quantizes a single row of `k` floats from `x` into the untyped buffer `y`
// by forwarding to quantize_iq4_ks with one row and no importance matrix.
void quantize_row_iq4_ks(const float * x, void * y, int64_t k) {
    const int64_t single_row = 1;
    const float * no_imatrix = nullptr;
    quantize_iq4_ks(x, y, single_row, k, no_imatrix);
}
|
||||
|
||||
size_t quantize_iq4_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
|
||||
//printf("============ %s(%d, %d)\n", __func__, int(nrows), int(n_per_row));
|
||||
constexpr int kBlockSize = 32; //128;
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
auto row_size = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
float weight[kBlockSize];
|
||||
std::vector<float> all_scales(n_per_row/kBlockSize);
|
||||
for (int64_t row = 0; row < nrows; ++row) {
|
||||
quantize_row_iq4_k_impl_bs128(QK_K, kBlockSize, n_per_row, src, qrow, all_scales.data(), weight, iq4k_values, imatrix, 7);
|
||||
src += n_per_row;
|
||||
qrow += row_size;
|
||||
}
|
||||
return nrows * row_size;
|
||||
}
|
||||
|
||||
void dequantize_row_iq4_ks(const block_iq4_ks * x, float * y, int64_t k) {
|
||||
constexpr int kBlockSize = 32; //128;
|
||||
GGML_ASSERT(k%QK_K == 0);
|
||||
const float * dptr = (const float *)x;
|
||||
float d = *dptr;
|
||||
x = (const block_iq4_ks *)(dptr + 1);
|
||||
int nblock = k/QK_K;
|
||||
for (int ibl = 0; ibl < nblock; ++ibl) {
|
||||
auto qs = x[ibl].qs;
|
||||
for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
|
||||
float dl = d * ((int)(x[ibl].scales[ib] & 254) - 127);
|
||||
const int8_t * values = iq4k_values + ((x[ibl].scales[ib] & 1) << 4);
|
||||
for (int j = 0; j < kBlockSize/2; ++j) {
|
||||
y[j ] = dl * values[qs[j] & 0xf];
|
||||
y[j+kBlockSize/2] = dl * values[qs[j] >> 4];
|
||||
}
|
||||
y += kBlockSize;
|
||||
qs += kBlockSize/2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vec_dot_iq4_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
|
||||
constexpr int kBlockSize = 32;
|
||||
#if GGML_USE_IQK_MULMAT
|
||||
if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_KS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc == 1);
|
||||
GGML_UNUSED(bs);
|
||||
GGML_UNUSED(bx);
|
||||
GGML_UNUSED(by);
|
||||
const float * dptr = (const float *)vx;
|
||||
const float d = *dptr;
|
||||
//printf("%s: n = %d, d = %g\n", __func__, n, d);
|
||||
const block_iq4_ks * x = (const block_iq4_ks *)(dptr + 1);
|
||||
const block_q8_K * y = (const block_q8_K *)vy;
|
||||
int nblock = n/QK_K;
|
||||
float sumf = 0;
|
||||
for (int ibl = 0; ibl < nblock; ++ibl) {
|
||||
//int sumi = 0;
|
||||
auto qy = y[ibl].qs;
|
||||
auto qx = x[ibl].qs;
|
||||
float db = d * y[ibl].d;
|
||||
for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
|
||||
float dl = db * ((x[ibl].scales[ib] & 254) - 127);
|
||||
//int ls = (x[ibl].scales[ib] & 254) - 127;
|
||||
const int8_t * values = iq4k_values + ((x[ibl].scales[ib] & 1) << 4);
|
||||
int suml = 0;
|
||||
for (int j = 0; j < kBlockSize/2; ++j) {
|
||||
suml += qy[j ] * values[qx[j] & 0xf]
|
||||
+ qy[j + kBlockSize/2] * values[qx[j] >> 4];
|
||||
}
|
||||
sumf += dl * suml;
|
||||
//sumi += ls * suml;
|
||||
qy += kBlockSize;
|
||||
qx += kBlockSize/2;
|
||||
}
|
||||
//sumf += d * y[ibl].d * sumi;
|
||||
}
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user