iq2_kt: quantize / dequantize

I now see that I was comparing apples to oranges:
iq2_xxs was using a weight of sigma^2/4 + x^2, while
the Trellis approach wasn't (weight = 1). Once I use the same weight,
iq2_kt is actually slightly worse than iq2_xxs in terms
of rmse, so does not look promising at this point.
Also, once each group of 8 Trellis values no longer has a
constant sum(q^2) that we can precompute, quantization
becomes significantly slower (476 seconds for LLaMA-3.1-8B).
This commit is contained in:
Iwan Kawrakow
2024-11-05 18:50:08 +02:00
parent f1df1b7e15
commit a4f1ac8da4
6 changed files with 281 additions and 2 deletions

View File

@@ -257,6 +257,24 @@ static inline int nearest_int(float fval) {
return (i & 0x007fffff) - 0x00400000;
}
//static void fast_ht(int n, float * values) {
// constexpr float ksqrt2 = 0.707106781f;
// float scale = 1;
// int h = 1;
// while (h < n) {
// for (int i = 0; i < n; i += 2*h) {
// for (int j = i; j < i + h; ++j) {
// float x = values[j], y = values[j + h];
// values[j+0] = x + y;
// values[j+h] = x - y;
// }
// }
// h *= 2;
// scale *= ksqrt2;
// }
// for (int i = 0; i < n; ++i) values[i] *= scale;
//}
static const int8_t scale_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
//static std::vector<float> make_values(int nval, int n_per_val) {
@@ -374,6 +392,7 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
float lmse = 0, lmse_q = 0;
std::vector<float> scales(n_per_row/kBlockSize);
std::vector<int> best_idx(n_per_row/kBlockSize);
//float xtmp[kBlockSize];
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first = counter; counter += chunk;
@@ -395,8 +414,13 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
float best = 0, d = 0; int jbest = -1;
auto xb = xr + kBlockSize*ib;
//std::memcpy(xtmp, xb, kBlockSize*sizeof(float));
//fast_ht(kBlockSize, xtmp);
#ifdef __AVX2__
for (int l = 0; l < kBlockSize/8; ++l) vx[l] = _mm256_loadu_ps(xb+8*l);
for (int l = 0; l < kBlockSize/8; ++l) {
//vx[l] = _mm256_loadu_ps(xtmp+8*l);
vx[l] = _mm256_loadu_ps(xb+8*l);
}
auto vbest = _mm256_set1_ps(0.f);
auto best_index = _mm256_set1_epi32(-1);
for (int j = 0; j < kNumVal; j += 8) {
@@ -422,7 +446,8 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
}
auto qv = codes.data() + kBlockSize*jbest;
float sumqx = 0;
for (int k = 0; k < 8; ++k) sumqx += xb[k]*qv[k];
for (int k = 0; k < kBlockSize; ++k) sumqx += xb[k]*qv[k];
//for (int k = 0; k < kBlockSize; ++k) sumqx += xtmp[k]*qv[k];
d = sumqx*sumq2i[jbest];
#else
for (int j = 0; j < kNumVal; ++j) {
@@ -440,6 +465,7 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
best_idx[ib] = jbest;
for (int k = 0; k < kBlockSize; ++k) {
float diff = xb[k] - d*qv[k];
//float diff = xtmp[k] - d*qv[k];
lmse += diff*diff;
}
}
@@ -458,9 +484,12 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
int ls = best_index_scale(scale_values, id*scales[ib]);
float dl = d * scale_values[ls];
auto xb = xr + kBlockSize*ib;
//std::memcpy(xtmp, xb, kBlockSize*sizeof(float));
//fast_ht(kBlockSize, xtmp);
auto qv = codes.data() + kBlockSize*best_idx[ib];
for (int k = 0; k < kBlockSize; ++k) {
float diff = xb[k] - dl*qv[k];
//float diff = xtmp[k] - dl*qv[k];
lmse_q += diff*diff;
}
}

View File

@@ -406,6 +406,7 @@ extern "C" {
GGML_TYPE_IQ4_KS = 144,
GGML_TYPE_IQ2_KS = 145,
GGML_TYPE_IQ4_KSS = 146,
GGML_TYPE_IQ2_KT = 147,
GGML_TYPE_COUNT,
};
@@ -464,6 +465,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_KS = 137, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_KS = 138, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_KSS = 139, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_KT = 140, // except 1d tensors
};
// available tensor operations:

View File

@@ -454,6 +454,14 @@ typedef struct {
} block_iq2_ks;
static_assert(sizeof(block_iq2_ks) == sizeof(uint16_t) + QK_K/64 + QK_K/4, "wrong iq2_ks block size/padding");
typedef struct {
ggml_half d;
uint8_t scales[QK_K/16];
uint8_t qh[QK_K/16];
uint8_t ql[QK_K/8];
} block_iq2_kt;
static_assert(sizeof(block_iq2_kt) == sizeof(ggml_half) + QK_K/4, "wrong iq2_kt block size/padding");
typedef struct {
ggml_half d;
uint16_t extra;

View File

@@ -1193,6 +1193,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.nrows = 1,
.row_meta_size = 2,
},
[GGML_TYPE_IQ2_KT] = {
.type_name = "iq2_kt",
.blck_size = QK_K,
.type_size = sizeof(block_iq2_kt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_kt,
.from_float = quantize_row_iq2_kt,
.from_float_ref = (ggml_from_float_t)quantize_row_iq2_kt_ref,
.vec_dot = vec_dot_iq2_kt_q8_k,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.row_meta_size = 0,
},
[GGML_TYPE_IQ3_K] = {
.type_name = "iq3_k",
.blck_size = QK_K,
@@ -3908,6 +3921,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ4_KSS: wtype = GGML_TYPE_IQ4_KSS; break;
case GGML_FTYPE_MOSTLY_IQ2_K: wtype = GGML_TYPE_IQ2_K; break;
case GGML_FTYPE_MOSTLY_IQ2_KS: wtype = GGML_TYPE_IQ2_KS; break;
case GGML_FTYPE_MOSTLY_IQ2_KT: wtype = GGML_TYPE_IQ2_KT; break;
case GGML_FTYPE_MOSTLY_IQ3_K: wtype = GGML_TYPE_IQ3_K; break;
case GGML_FTYPE_MOSTLY_IQ4_K: wtype = GGML_TYPE_IQ4_K; break;
case GGML_FTYPE_MOSTLY_IQ5_K: wtype = GGML_TYPE_IQ5_K; break;
@@ -10431,6 +10445,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ4_KSS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ5_K:
@@ -10873,6 +10888,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ4_KSS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ5_K:
@@ -11012,6 +11028,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ4_KSS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ5_K:
@@ -14197,6 +14214,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ4_KSS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ5_K:
@@ -14576,6 +14594,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ4_KSS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ5_K:
@@ -14849,6 +14868,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ4_KSS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ5_K:
@@ -15449,6 +15469,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ4_KSS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ5_K:
@@ -22275,6 +22296,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ4_KSS: result = quantize_iq4_kss(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_KS: result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_KT: result = quantize_iq2_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ5_K: result = quantize_iq5_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;

View File

@@ -3119,4 +3119,216 @@ void vec_dot_iq4_kss_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
GGML_UNUSED(by);
}
// ========================================== iq2_kt ====================================================
namespace {
class QuantizerIQ2KT {
public:
constexpr static int kSuperBlockSize = 256;
constexpr static int kBlockSize = 8;
constexpr static int kNblock = kSuperBlockSize/kBlockSize;
constexpr static int kNumVal = 1 << 12;
QuantizerIQ2KT();
const float * values() const { return m_values.data(); }
static inline void set_values(uint32_t i, float * result) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t kmask = 0x8fff8fff;
constexpr uint32_t km32 = 0x3b603b60;
uint32_t x = i + 4096;
for (int k = 0; k < kBlockSize; ++k) {
x = ka*x + kb;
uint32_t s = (x & kmask) ^ km32;
float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
int ival = nearest_int(16.f*val);
result[k] = ival;
}
}
private:
std::vector<float> m_values;
};
QuantizerIQ2KT::QuantizerIQ2KT() {
m_values.resize(kNumVal*kBlockSize);
float * data = m_values.data();
for (int i = 0; i < kNumVal; ++i) {
set_values(i, data);
data += kBlockSize;
}
}
const QuantizerIQ2KT& iq2kt_quantizer() {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
static QuantizerIQ2KT quantizer;
return quantizer;
}
#ifdef __AVX2__
__m256 hsum_float_8x8(__m256 * accm) {
for (int i = 0; i < 4; ++i) {
accm[i] = _mm256_set_m128(_mm_add_ps(_mm256_castps256_ps128(accm[i+4]), _mm256_extractf128_ps(accm[i+4], 1)),
_mm_add_ps(_mm256_castps256_ps128(accm[i+0]), _mm256_extractf128_ps(accm[i+0], 1)));
}
for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i+2]), _mm256_unpackhi_ps(accm[i], accm[i+2]));
return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1]));
}
#endif
void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) {
static_assert(QuantizerIQ2KT::kNumVal%8 == 0);
block_iq2_kt * y = (block_iq2_kt *)vy;
float weight[QuantizerIQ2KT::kBlockSize];
float scales[QuantizerIQ2KT::kNblock];
const int nblock = n_per_row/QuantizerIQ2KT::kSuperBlockSize;
auto& quantizer = iq2kt_quantizer();
auto values = quantizer.values();
#ifdef __AVX2__
__m256 sqx[8];
__m256 sq2[8];
__m256i add_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
float sx[8];
int index[8];
#endif
for (int ibl = 0; ibl < nblock; ++ibl) {
memset(&y[ibl], 0, sizeof(block_iq2_kt));
const float * xbl = x + ibl*QuantizerIQ2KT::kSuperBlockSize;
float sumx2 = 0;
for (int j = 0; j < QuantizerIQ2KT::kSuperBlockSize; ++j) sumx2 += xbl[j]*xbl[j];
const float sigma2 = 1.5f*sumx2/QuantizerIQ2KT::kSuperBlockSize;
float amax_scale = 0, max_scale = 0;
for (int ib = 0; ib < QuantizerIQ2KT::kNblock; ++ib) {
const float * xb = xbl + QuantizerIQ2KT::kBlockSize*ib;
if (quant_weights) {
const float * qw = quant_weights + ibl*QuantizerIQ2KT::kSuperBlockSize + ib*QuantizerIQ2KT::kBlockSize;
for (int j = 0; j < QuantizerIQ2KT::kBlockSize; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
} else {
for (int j = 0; j < QuantizerIQ2KT::kBlockSize; ++j) weight[j] = 1;//0.25f*sigma2 + xb[j]*xb[j];
}
#ifdef __AVX2__
auto vw = _mm256_loadu_ps(weight);
auto vx = _mm256_loadu_ps(xb);
auto vbest = _mm256_set1_ps(0.f);
auto best_index = _mm256_set1_epi32(-1);
for (int j = 0; j < QuantizerIQ2KT::kNumVal; j += 8) {
auto idx = _mm256_add_epi32(_mm256_set1_epi32(j), add_idx);
for (int i = 0; i < 8; ++i) {
auto vq = _mm256_loadu_ps(values + QuantizerIQ2KT::kBlockSize*(j+i));
auto wqv = _mm256_mul_ps(vq, vw);
sqx[i] = _mm256_mul_ps(wqv, vx);
sq2[i] = _mm256_mul_ps(wqv, vq);
}
auto sumqx = hsum_float_8x8(sqx);
auto sumq2 = hsum_float_8x8(sq2);
//auto score = _mm256_div_ps(_mm256_mul_ps(sumqx, sumqx), sumq2);
auto score = _mm256_mul_ps(_mm256_mul_ps(sumqx, sumqx), _mm256_rcp_ps(sumq2));
auto mask = _mm256_cmp_ps(score, vbest, _CMP_GT_OQ);
best_index = _mm256_or_si256(_mm256_and_si256(_mm256_castps_si256(mask), idx),
_mm256_andnot_si256(_mm256_castps_si256(mask), best_index));
vbest = _mm256_max_ps(vbest, score);
}
_mm256_store_ps(sx, vbest);
_mm256_store_si256((__m256i *)index, best_index);
float best = sx[0]; int jbest = index[0];
for (int j = 1; j < 8; ++j) {
if (sx[j] > best) { best = sx[j]; jbest = index[j]; }
}
auto qv = values + QuantizerIQ2KT::kBlockSize*jbest;
float sumqx = 0, sumq2 = 0;
for (int k = 0; k < QuantizerIQ2KT::kBlockSize; ++k) {
sumqx += weight[k]*qv[k]*xb[k];
sumq2 += weight[k]*qv[k]*qv[k];
}
scales[ib] = sumqx/sumq2;
float abs_scale = std::abs(scales[ib]);
if (abs_scale > amax_scale) {
amax_scale = abs_scale; max_scale = scales[ib];
}
y[ibl].ql[ib] = (jbest & 255);
y[ibl].qh[ib%(QuantizerIQ2KT::kNblock/2)] |= ((jbest >> 8) << 4*(ib/(QuantizerIQ2KT::kNblock/2)));
#else
#endif
}
float d = max_scale/iq4k_values[0];
y[ibl].d = GGML_FP32_TO_FP16(d);
float id = d ? 1/d : 0.f;
for (int ib = 0; ib < QuantizerIQ2KT::kNblock; ++ib) {
int ls = best_index_iq4nl(iq4k_values, id*scales[ib]);
y[ibl].scales[ib%(QuantizerIQ2KT::kNblock/2)] |= (ls << 4*(ib/(QuantizerIQ2KT::kNblock/2)));
}
}
}
}
void quantize_row_iq2_kt_ref(const float * GGML_RESTRICT x, block_iq2_kt * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
quantize_iq2_kt(x, (void *)y, 1, k, nullptr);
}
void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(k % QK_K == 0);
block_iq2_kt * y = (block_iq2_kt *)vy;
quantize_row_iq2_kt_ref(x, y, k);
}
size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
GGML_ASSERT(n_per_row%QK_K == 0);
auto row_size = ggml_row_size(GGML_TYPE_IQ2_KT, n_per_row);
char * qrow = (char *)dst;
for (int64_t row = 0; row < nrows; ++row) {
quantize_row_iq2_kt_impl(src, (void *)qrow, n_per_row, imatrix);
src += n_per_row;
qrow += row_size;
}
return nrows * row_size;
}
void dequantize_row_iq2_kt(const block_iq2_kt * x, float * y, int64_t k) {
assert(k % QuantizerIQ2KT::kSuperBlockSize == 0);
const int nb = k / QuantizerIQ2KT::kSuperBlockSize;
auto& deq = iq2kt_quantizer();
for (int ibl = 0; ibl < nb; ++ibl) {
const float d = GGML_FP16_TO_FP32(x[ibl].d);
auto yl = y + ibl*QuantizerIQ2KT::kSuperBlockSize;
auto yh = yl + QuantizerIQ2KT::kSuperBlockSize/2;
for (int ib = 0; ib < QuantizerIQ2KT::kNblock/2; ++ib) {
uint32_t idx1 = x[ibl].ql[ib] | ((x[ibl].qh[ib] & 0xf) << 8);
uint32_t idx2 = x[ibl].ql[ib+QuantizerIQ2KT::kNblock/2] | ((x[ibl].qh[ib] >> 4) << 8);
deq.set_values(idx1, yl);
deq.set_values(idx2, yh);
float s1 = d * iq4k_values[x[ibl].scales[ib] & 0xf];
float s2 = d * iq4k_values[x[ibl].scales[ib] >> 4];
for (int j = 0; j < QuantizerIQ2KT::kBlockSize; ++j) yl[j] *= s1;
for (int j = 0; j < QuantizerIQ2KT::kBlockSize; ++j) yh[j] *= s2;
yl += QuantizerIQ2KT::kBlockSize;
yh += QuantizerIQ2KT::kBlockSize;
}
}
}
void vec_dot_iq2_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);
GGML_UNUSED(by);
GGML_UNUSED(bs);
#if GGML_USE_IQK_MULMAT
if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_KT, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
return;
}
#endif
}

View File

@@ -61,6 +61,12 @@ size_t quantize_iq2_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst
void dequantize_row_iq2_ks(const block_iq2_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void vec_dot_iq2_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void quantize_row_iq2_kt_ref(const float * GGML_RESTRICT x, block_iq2_kt * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
size_t quantize_iq2_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
void dequantize_row_iq2_kt(const block_iq2_kt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void vec_dot_iq2_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
#ifdef __cplusplus