mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 03:11:51 +00:00
q2_K: allow it to detect ternary nets and quantize accordingly
This commit is contained in:
@@ -1995,7 +1995,52 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in
|
||||
|
||||
const float q4scale = 15.f;
|
||||
|
||||
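// Detect TriNet (ternary weights): check whether all k input values are well approximated by {-max, 0, +max}, where max is the largest |x[j]|; if so, quantize them as ternary and return early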
|
||||
{
|
||||
int n = k;
|
||||
float max = 0;
|
||||
for (int j = 0; j < n; ++j) {
|
||||
float ax = fabsf(x[j]);
|
||||
max = MAX(max, ax);
|
||||
}
|
||||
float mse0 = 0, mse = 0;
|
||||
for (int j = 0; j < n; ++j) {
|
||||
int l = x[j] < -0.5f*max ? -1 : x[j] < 0.5f*max ? 0 : 1;
|
||||
mse0 += x[j]*x[j];
|
||||
float diff = x[j] - max*l;
|
||||
mse += diff*diff;
|
||||
}
|
||||
if (mse < 0.1f*mse0) {
|
||||
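// Yes, most likely a TriNet: the ternary approximation's squared error is below 10% of the sum of x[j]^2. Encode every block with d = dmin = max and all scales/mins = 1, so the stored levels 0/1/2 stand for -max/0/+max.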
|
||||
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||
y[ibl].d = GGML_FP32_TO_FP16(max);
|
||||
y[ibl].dmin = GGML_FP32_TO_FP16(max);
|
||||
for (int ib = 0; ib < QK_K/16; ++ib) y[ibl].scales[ib] = 1 | (1 << 4);
|
||||
const float * xb = x + QK_K * ibl;
|
||||
for (int j = 0; j < QK_K; ++j) {
|
||||
L[j] = xb[j] < -0.5f*max ? 0 : xb[j] < 0.5f*max ? 1 : 2;
|
||||
}
|
||||
uint8_t * qs = y[ibl].qs;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
qs[l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
|
||||
}
|
||||
qs += 32;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
//{
|
||||
// float max = x[0], min = x[0];
|
||||
// for (int j = 1; j < 256; ++j) {
|
||||
// max = MAX(x[j], max);
|
||||
// min = MIN(x[j], min);
|
||||
// }
|
||||
// printf("%s: max = %g, min = %g\n", __func__, (double)max, (double)min);
|
||||
//}
|
||||
float max_scale = 0; // as we are deducting the min, scales are always positive
|
||||
float max_min = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
|
||||
Reference in New Issue
Block a user