diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index bdf35195..799fe3d3 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -393,6 +393,8 @@ static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst dst_t * y = yy + ii*QK_K + 8*ib; uint32_t idx1 = x[i].ql[2*ib+0] + ((x[i].qh[(2*ib+0)%32] << (8-4*((2*ib+0)/32))) & 0xf00) + 4096; uint32_t idx2 = x[i].ql[2*ib+1] + ((x[i].qh[(2*ib+1)%32] << (8-4*((2*ib+1)/32))) & 0xf00) + 4096; + //const int8_t * sv = (const int8_t *)x[i].scales; + //const float dl = scale * sv[ib/8] * 31.75f * 1.015f; const float dl = scale * scale_values[((x[i].scales[(ib/4)%4] >> 4*(ib/16)) & 0xf)] * 31.75f * 1.015f; uint32_t s[2]; const half * h = (const half *)s; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 2c5f5a48..06b38959 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -3958,7 +3958,6 @@ const QuantizerIQ3KT& iq3kt_quantizer() { std::lock_guard lock(mutex); static std::unique_ptr quantizer; if (!quantizer) quantizer = std::make_unique(256, 16); - //if (!quantizer) quantizer = std::make_unique(64, 5); return *quantizer; } @@ -4014,10 +4013,9 @@ void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const f scales[ib] = 0; if (!amax) continue; - //float scale_0 = std::max(80.f, 127.f*amax/amax_row); float scale_0 = std::max(80.f, 127.f*amax/amax_row); float best = 0; - for (int itry = -3; itry <= 3; ++itry) { + for (int itry = -5; itry <= 5; ++itry) { quantizer.find_best_match(amax/(scale_0 + kStep*itry), xb, weight, best_idx); auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx); if (score_p > best) { @@ -4079,6 +4077,7 @@ void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const f } } + //float d = -max_scale/128; float id = d ? 1/d : 0.f; for (int ibl = 0; ibl < nblock; ++ibl) { auto scales = all_scales + ibl*Q::kNblock; @@ -4087,6 +4086,11 @@ void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const f int ls2 = best_index_iq4nl(scale_values, id*scales[ib + Q::kNblock/2]); y[ibl].scales[ib] = ls1 | (ls2 << 4); } + //int8_t * sv = (int8_t *)y[ibl].scales; + //for (int ib = 0; ib < Q::kNblock; ++ib) { + // int ls = nearest_int(id*scales[ib]); + // sv[ib] = std::min(ls, 127); + //} } *dptr = d; @@ -4098,10 +4102,12 @@ void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const f std::memset(y[ibl].qh, 0, kNumGroups/2); const float * xbl = x + ibl*Q::kSuperBlockSize; + //int8_t * sv = (int8_t *)y[ibl].scales; for (int ib = 0; ib < Q::kNblock; ++ib) { const float * xb = xbl + Q::kBlockSize*ib; const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize; + //int ls = sv[ib]; int ls = scale_values[((y[ibl].scales[ib%(Q::kNblock/2)] >> 4*(ib/(Q::kNblock/2))) & 0xf)]; float dl = d*ls; quantizer.find_best_match(dl, xb, weight, best_idx); @@ -4129,6 +4135,32 @@ void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const f break; } } + + //float aux[Q::kGroupSize]; + //float sumd = 0, sumw = 0, mse = 0; + //for (int ibl = 0; ibl < nblock; ++ibl) { + // const float * xbl = x + ibl*Q::kSuperBlockSize; + // for (int ib = 0; ib < Q::kNblock; ++ib) { + // const float * xb = xbl + Q::kBlockSize*ib; + // const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize; + // int ls = scale_values[((y[ibl].scales[ib%(Q::kNblock/2)] >> 4*(ib/(Q::kNblock/2))) & 0xf)]; + // float dl = d*ls*Q::kScale; + // for (int j = 0; j < Q::kNg; ++j) { + // int jj = ib*Q::kNg + j; + // int idx = y[ibl].ql[jj] + ((y[ibl].qh[jj%(kNumGroups/2)] << (8 - 4*(jj/(kNumGroups/2)))) & 0xf00); + // quantizer.set_values(idx, aux, dl); + // auto xl = xb + Q::kGroupSize*j; + // auto wl = weight + Q::kGroupSize*j; + // for (int k = 0; k < Q::kGroupSize; ++k) { + // float diff = xl[k] - aux[k]; + // sumw += wl[k]; + // sumd += wl[k]*diff; + // mse += diff*diff; + // } + // } + // } + //} + //printf("rmse = %g, delta = %g, %g\n", sqrt(mse/n_per_row), sumd/sumw, sumd/sumw/amax_row); } } @@ -4172,8 +4204,11 @@ void dequantize_row_iq3_kt(const block_iq3_kt * x, float * y, int64_t k) { auto yh = yl + Q::kSuperBlockSize/2; auto qll = x[ibl].ql; auto qlh = qll + kNumGroups/2; + //const int8_t * sv = (const int8_t *)x[ibl].scales; int jj = 0; for (int ib = 0; ib < Q::kNblock/2; ++ib) { + //float sl = d * sv[ib]; + //float sh = d * sv[ib+Q::kNblock/2]; float sl = d * scale_values[(x[ibl].scales[ib] & 0xf)]; float sh = d * scale_values[(x[ibl].scales[ib] >> 4)]; for (int ig = 0; ig < Q::kNg; ++ig) { @@ -4238,16 +4273,6 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f Q::set_weights(kSigmaScale, nblock, x, quant_weights, all_weights); - //float sigma_row = 0; - //for (int j = 0; j < n_per_row; ++j) sigma_row += x[j]*x[j]; - //if (!sigma_row) { - // *dptr = 0.f; - // std::memset(y, 0, nblock*sizeof(block_iq4_kt)); - // return; - //} - //sigma_row = sqrt(sigma_row/n_per_row); - //float row_scale = 4.f/sigma_row; - float amax_row = 0; for (int j = 0; j < n_per_row; ++j) amax_row = std::max(amax_row, std::abs(x[j])); if (!amax_row) { @@ -4282,7 +4307,6 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f } float best = 0; float scale_0 = std::max(92.f, 127.f*amax/amax_row); - //float scale_0 = 96.f; for (int itry = -2; itry <= 2; ++itry) { quantizer.find_best_match( amax/(8.f*itry + scale_0), xb, weight, best_idx); auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx); @@ -4297,43 +4321,8 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f for (int j = 0; j < Q::kNg; ++j) ql[j] = best_idx[j]; } } - //for (int j = 0; j < Q::kNg; ++j) best_idx[j] = ql[j]; - //auto inv_scale = quantizer.find_best_inverse_scale(xb, weight, best_idx); - //if (inv_scale) { - // quantizer.find_best_match(1/inv_scale, xb, weight, best_idx); - // auto [d, score] = quantizer.find_best_scale(xb, weight, best_idx); - // if (score > best) { - // if (score > 1.02f*best) printf("New best match: %g vs %g, score is %g vs %g\n", d, scales[ib], score, best); - // scales[ib] = d; - // for (int j = 0; j < Q::kNg; ++j) ql[j] = best_idx[j]; - // } - //} - ////float scale_0 = row_scale; - //quantizer.find_best_match( amax/scale_0, xb, weight, best_idx); - //auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx); - //quantizer.find_best_match(-amax/scale_0, xb, weight, best_idx+Q::kNg); - //auto [dm, score_m] = quantizer.find_best_scale(xb, weight, best_idx+Q::kNg); - //if (score_p > score_m) { - // scales[ib] = dp; - // for (int j = 0; j < Q::kNg; ++j) ql[j] = best_idx[j]; - //} else { - // scales[ib] = dm; - // for (int j = 0; j < Q::kNg; ++j) ql[j] = best_idx[j+Q::kNg]; - //} - - //float mse = 0; - //for (int j = 0; j < Q::kNg; ++j) { - // auto values = quantizer.values() + ql[j]*Q::kGroupSize; - // for (int k = 0; k < Q::kGroupSize; ++k) { - // float diff = scales[ib]*values[k] - xb[j*Q::kGroupSize+k]; - // mse += diff*diff; - // } - //} - //printf("rmse(%d) = %g\n", ib, sqrt(mse/Q::kBlockSize)); ql += Q::kNg; - //scales[ib] = score_p > score_m ? dp : dm; - float abs_scale = std::abs(scales[ib]); if (abs_scale > amax_scale) { amax_scale = abs_scale; @@ -4345,27 +4334,13 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f float d = -max_scale/128; float id = d ? 1/d : 0.f; - //float mse = 0; for (int ibl = 0; ibl < nblock; ++ibl) { auto scales = all_scales + ibl*Q::kNblock; - //const float * xbl = x + ibl*Q::kSuperBlockSize; - //auto ql = (uint16_t *)y[ibl].ql; for (int ib = 0; ib < Q::kNblock; ++ib) { int ls = nearest_int(id*scales[ib]); y[ibl].scales[ib] = std::min(ls, 127); - //float dl = d*y[ibl].scales[ib]; - //const float * xb = xbl + Q::kBlockSize*ib; - //for (int j = 0; j < Q::kNg; ++j) { - // auto values = quantizer.values() + ql[j]*Q::kGroupSize; - // for (int k = 0; k < Q::kGroupSize; ++k) { - // float diff = dl*values[k] - xb[j*Q::kGroupSize+k]; - // mse += diff*diff; - // } - // ql += Q::kNg; - //} } } - //printf("rmse = %g\n", sqrt(mse/n_per_row)); *dptr = d; if (!d) return; @@ -4392,7 +4367,6 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f qs[j] = best_idx[j]; auto xl = xb + Q::kGroupSize*j; auto wl = weight + Q::kGroupSize*j; - //auto ql = quantizer.values() + best_idx[j]*Q::kGroupSize; auto ql = quantizer.values() + qs[j]*Q::kGroupSize; for (int k = 0; k < Q::kGroupSize; ++k) { float q = ql[k]*ls;