diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index d74ef5e0..4c5d408a 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -258,45 +258,8 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -//static void fast_ht(int n, float * values) { -// constexpr float ksqrt2 = 0.707106781f; -// float scale = 1; -// int h = 1; -// while (h < n) { -// for (int i = 0; i < n; i += 2*h) { -// for (int j = i; j < i + h; ++j) { -// float x = values[j], y = values[j + h]; -// values[j+0] = x + y; -// values[j+h] = x - y; -// } -// } -// h *= 2; -// scale *= ksqrt2; -// } -// for (int i = 0; i < n; ++i) values[i] *= scale; -//} - static const int8_t scale_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; -//static std::vector make_values(int nval, int n_per_val) { -// GGML_ASSERT(n_per_val%4 == 0); -// std::vector result(nval*n_per_val); -// const uint32_t a = 89226354, b = 64248484; -// float * data = result.data(); -// uint32_t aux32; -// const uint8_t * q = (const uint8_t *)&aux32; -// for (int i = 0; i < nval; ++i) { -// uint32_t x = i + 32767; -// for (int k = 0; k < n_per_val/4; ++k) { -// x = a*x + b; -// aux32 = x & 0x0f0f0f0f; -// for (int l = 0; l < 4; ++l) data[4*k+l] = scale_values[q[l]]; -// } -// data += n_per_val; -// } -// return result; -//} - static std::vector make_values(int nval, int n_per_val, float scale = 16.f) { std::vector result(nval*n_per_val); uint16_t m16 = ggml_fp32_to_fp16(0.922f); @@ -317,22 +280,6 @@ static std::vector make_values(int nval, int n_per_val, float scale = 16. return result; } -//static std::vector make_values(int nval, int n_per_val) { -// std::vector result(nval*n_per_val); -// const uint32_t a = 34038481, b = 76625530; -// float * data = result.data(); -// for (int i = 0; i < nval; ++i) { -// uint32_t x = i + 4096; -// for (int k = 0; k < n_per_val; ++k) { -// x = a*x + b; -// uint32_t s = (x & 255) + ((x >> 8) & 255) + ((x >> 16) & 255) + ((x >> 24) & 255); -// data[k] = (s - 510.f)/147.8f; -// } -// data += n_per_val; -// } -// return result; -//} - #ifdef __AVX2__ static inline float hsum_float_4(__m128 x) { x = _mm_add_ps(x, _mm_movehl_ps(x, x)); @@ -684,42 +631,11 @@ static void analyze_x_v2(const char * name, int nrows, int n_per_row, const floa for (int i = 0; i < 8; ++i) { if (sx[i] < best) { best = sx[i]; jbest = index[i]; } } - //int jbest_cluster = jbest; - //best = INFINITY; jbest = -1; - //for (auto ip : points) { - // auto vc = codes.data() + ip*kGroupSize; - // float diff2 = 0; - // for (int k = 0; k < kGroupSize; ++k) { - // float delta = d*vc[k] - xl[k]; - // diff2 += wl[k]*delta*delta; - // } - // if (diff2 < best) { - // best = diff2; jbest = ip; - // } - //} if (jbest < 0) { printf("Oops: jbest = %d for cluster %d with %d points\n", jbest, jbest_cluster, int(points.size())); GGML_ASSERT(false); } GGML_ASSERT(jbest >= 0); - //for (int j = 0; j < kNumVal; j += 8) { - // auto idx = _mm256_add_epi32(_mm256_set1_epi32(j), add_idx); - // for (int i = 0; i < 8; ++i) { - // auto vq = _mm256_loadu_ps(codes.data() + kGroupSize*(j+i)); - // auto vdiff = _mm256_sub_ps(vq, vx); - // sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff)); - // } - // auto score = hsum_float_8x8(sqx); - // auto mask = _mm256_cmp_ps(score, vbest, _CMP_LT_OQ); - // best_index = _mm256_or_si256(_mm256_and_si256(_mm256_castps_si256(mask), idx), - // _mm256_andnot_si256(_mm256_castps_si256(mask), best_index)); - // vbest = _mm256_min_ps(vbest, score); - //} - //_mm256_store_ps(sx, vbest); - //_mm256_store_si256((__m256i *)index, best_index); - //for (int i = 0; i < 8; ++i) { - // if (sx[i] < best) { best = sx[i]; jbest = index[i]; } - //} best_idx[ib*kNg + l] = jbest; } auto vqx = _mm256_setzero_ps(); @@ -798,7 +714,6 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * float lmse = 0, lmse_q = 0; std::vector scales(n_per_row/kBlockSize); std::vector best_idx(n_per_row/kBlockSize); - //float xtmp[kBlockSize]; while (true) { std::unique_lock lock(mutex); int first = counter; counter += chunk; @@ -820,11 +735,8 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) { float best = 0, d = 0; int jbest = -1; auto xb = xr + kBlockSize*ib; - //std::memcpy(xtmp, xb, kBlockSize*sizeof(float)); - //fast_ht(kBlockSize, xtmp); #ifdef __AVX2__ for (int l = 0; l < kBlockSize/8; ++l) { - //vx[l] = _mm256_loadu_ps(xtmp+8*l); vx[l] = _mm256_loadu_ps(xb+8*l); } auto vbest = _mm256_set1_ps(0.f); @@ -853,7 +765,6 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * auto qv = codes.data() + kBlockSize*jbest; float sumqx = 0; for (int k = 0; k < kBlockSize; ++k) sumqx += xb[k]*qv[k]; - //for (int k = 0; k < kBlockSize; ++k) sumqx += xtmp[k]*qv[k]; d = sumqx*sumq2i[jbest]; #else for (int j = 0; j < kNumVal; ++j) { @@ -871,7 +782,6 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * best_idx[ib] = jbest; for (int k = 0; k < kBlockSize; ++k) { float diff = xb[k] - d*qv[k]; - //float diff = xtmp[k] - d*qv[k]; lmse += diff*diff; } } @@ -890,45 +800,12 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * int ls = best_index_scale(scale_values, id*scales[ib]); float dl = d * scale_values[ls]; auto xb = xr + kBlockSize*ib; - //std::memcpy(xtmp, xb, kBlockSize*sizeof(float)); - //fast_ht(kBlockSize, xtmp); auto qv = codes.data() + kBlockSize*best_idx[ib]; for (int k = 0; k < kBlockSize; ++k) { float diff = xb[k] - dl*qv[k]; - //float diff = xtmp[k] - dl*qv[k]; lmse_q += diff*diff; } } - //for (int ibl = 0; ibl < n_per_row/kSuperBlockSize; ++ibl) { - // auto sb = scales.data() + ibl*(kSuperBlockSize/kBlockSize); - // auto idx = best_idx.data() + ibl*(kSuperBlockSize/kBlockSize); - // auto xbl = xr + ibl*kSuperBlockSize; - // float amax_scale = 0, max_scale = 0; - // for (int ib = 0; ib < kSuperBlockSize/kBlockSize; ++ib) { - // float ax = std::abs(sb[ib]); - // if (ax > amax_scale) { - // amax_scale = ax; max_scale = sb[ib]; - // } - // //amax_scale = std::max(amax_scale, std::abs(sb[ib])); - // } - // float d = max_scale/scale_values[0]; - // float id = d ? 1/d : 0.f; - // //float id = amax_scale > 0 ? 15/amax_scale : 0; - // //float d = amax_scale/15; - // for (int ib = 0; ib < kSuperBlockSize/kBlockSize; ++ib) { - // int ls = best_index_scale(scale_values, id*sb[ib]); - // float dl = d * scale_values[ls]; - // //int ls = nearest_int(0.5f*(id*sb[ib]+15)); - // //ls = std::max(0, std::min(ls, 15)); - // //float dl = d*(2*ls - 15); - // auto xb = xbl + kBlockSize*ib; - // auto qv = codes.data() + kBlockSize*idx[ib]; - // for (int k = 0; k < kBlockSize; ++k) { - // float diff = xb[k] - dl*qv[k]; - // lmse_q += diff*diff; - // } - // } - //} } } }; @@ -992,17 +869,6 @@ static void analyze_iq4ks(const char * name, int nrows, int n_per_row, const flo lmse += diff4; } else { float best = std::numeric_limits::max(); - //for (int k = 0; k < 16; k += 4) { - // uint16_t v = v0 ^ (1 << k); - // uint8_t v1 = v; - // uint8_t v2 = v >> 8; - // diff1 = xb[j+ 0] - dl*values[v1 & 0xf]; - // diff2 = xb[j+16] - dl*values[v1 >> 4]; - // diff3 = xb[j+ 1] - dl*values[v2 & 0xf]; - // diff4 = xb[j+17] - dl*values[v2 >> 4]; - // float score = diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4; - // if (score < best) best = score; - //} for (int k = 0; k < 4; ++k) { uint16_t v = (v0 >> 4*k) & 0xf; auto pc = popcount(v); @@ -1040,7 +906,6 @@ static void analyze_iq4ks(const ggml_tensor * t, float& tot_mse, float& tot_mse_ return; } if (t->type == GGML_TYPE_F32) { - //analyze_iq4ks(t->name, t->ne[1], t->ne[0], (const float *)t->data, tot_mse, tot_elements); analyze_x_v2(t->name, t->ne[1], t->ne[0], (const float *)t->data, tot_mse, tot_mse_q, tot_elements); } else { std::vector aux(t->ne[0]*t->ne[1]); @@ -1049,7 +914,6 @@ static void analyze_iq4ks(const ggml_tensor * t, float& tot_mse, float& tot_mse_ } else { ggml_bf16_to_fp32_row((const ggml_bf16_t *)t->data, aux.data(), aux.size()); } - //analyze_iq4ks(t->name, t->ne[1], t->ne[0], aux.data(), tot_mse, tot_elements); analyze_x_v2(t->name, t->ne[1], t->ne[0], aux.data(), tot_mse, tot_mse_q, tot_elements); } } diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index fd8d8a50..2d8f023f 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -349,11 +349,6 @@ float __device__ __forceinline__ trellis_next(uint32_t& val) { const half * h = (const half *)&s; val = ka*val + kb; s = (val & kmask) ^ km32; - //float r = (float)(h[0] +h[1]); - //val = ka*val + kb; - //s = (val & kmask) ^ km32; - //r += (float)(h[0]+h[1]); - //return r; return (float)(h[0]+h[1]); } @@ -400,30 +395,6 @@ static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst } } -//template -//static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) { -// -// int64_t ii = blockIdx.x; -// int64_t row = (QK_K * ii) / n_per_row; -// const float * dptr = (const float *)((const char *)vx + row * row_size); -// float scale = dptr[0]; -// float alpha = dptr[1]; -// const block_iq3_kt * x = (const block_iq3_kt *)(dptr + 2); -// const int64_t i = ii - (row*n_per_row)/QK_K; -// -// const int64_t tid = threadIdx.x; -// const int64_t ib = tid; // 0...31 -// dst_t * y = yy + ii*QK_K + 8*ib; -// const uint16_t * ql = (const uint16_t *)x[i].ql; -// uint32_t idx = ql[ib] + 4096; -// const float dl = scale * ((x[i].scales[(ib/4)%4] >> 4*(ib/16)) & 0xf) * 31.75f * 1.01f; //1.015f; -// uint8_t mask = 1 << (ib/4); -// for (int j = 0; j < 8; ++j) { -// float ay = std::abs(trellis_next(idx)); -// y[j] = dl * ay/(1 - alpha*ay) * (x[i].qh[(8*ib+j)%32] & mask ? -1.f : 1.f); -// } -//} - template static __global__ void dequantize_block_iq4_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) { diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 45da0854..50e6458d 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -41,30 +41,6 @@ static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& v #endif } -//static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) { -// const half * h = (const half *)s; -// s[0] = trellis_next(val1); -// s[1] = trellis_next(val1); -// s[2] = trellis_next(val1); -// s[3] = trellis_next(val1); -//#ifdef GGML_CUDA_F16 -// bdot1 = __hfma2(y[ 0], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot1); -//#else -// bdot1.x += y[ 0].x * (float)(h[0] + h[1] + h[2] + h[3]); -// bdot1.y += y[ 0].y * (float)(h[4] + h[5] + h[6] + h[7]); -//#endif -// s[0] = trellis_next(val2); -// s[1] = trellis_next(val2); -// s[2] = trellis_next(val2); -// s[3] = trellis_next(val2); -//#ifdef GGML_CUDA_F16 -// bdot2 = __hfma2(y[64], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot2); -//#else -// bdot2.x += y[64].x * (float)(h[0] + h[1] + h[2] + h[3]); -// bdot2.y += y[64].y * (float)(h[4] + h[5] + h[6] + h[7]); -//#endif -//} - static __device__ __forceinline__ void trellis_accum_abs(uint8_t signs1, uint8_t signs2, uint8_t mask1, uint8_t mask2, uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) { const half * h = (const half *)s; @@ -77,8 +53,6 @@ static __device__ __forceinline__ void trellis_accum_abs(uint8_t signs1, uint8_t half h10 = __habs(h[4]+h[5]), h11 = __habs(h[6]+h[7]); half2 h1 = {signs1 & mask1 ? -h00 : h00, signs2 & mask1 ? -h01 : h01}; half2 h2 = {signs1 & mask2 ? -h10 : h10, signs2 & mask2 ? -h11 : h11}; - //half2 h1 = __hmul2(__habs2({h[0]+h[1], h[2]+h[3]}), {signs1 & mask1 ? -1 : 1, signs2 & mask1 ? -1 : 1}); - //half2 h2 = __hmul2(__habs2({h[4]+h[5], h[6]+h[7]}), {signs1 & mask2 ? -1 : 1, signs2 & mask2 ? -1 : 1}); bdot1 = __hfma2(y[ 0], h1, bdot1); bdot2 = __hfma2(y[64], h2, bdot2); #else diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 0c4c1aef..80364373 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -446,9 +446,6 @@ void ggml_cuda_op_mul_mat_vec_q( case GGML_TYPE_IQ2_KS: mul_mat_vec_iq2_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; - //case GGML_TYPE_IQ2_KT: - // mul_mat_vec_iq2_kt_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - // break; case GGML_TYPE_IQ5_K: mul_mat_vec_iq5_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 18703657..5f5af45a 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -6653,16 +6653,6 @@ public: if constexpr (is_abs) result[k] = scale*std::abs(val); else result[k] = scale*val; } - //for (int k = 0; k < kGroupSize; ++k) { - // x = ka*x + kb; - // uint32_t s = (x & kmask) ^ km32; - // float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16); - // x = ka*x + kb; - // s = (x & kmask) ^ km32; - // val += GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16); - // if constexpr (is_abs) result[k] = scale*std::abs(0.5f*val); - // else result[k] = 0.5f*scale*val; - //} } static inline int bin4(float x) { @@ -6851,7 +6841,6 @@ void QuantizerIQKT::find_best_match(fl auto& points = m_in_cluster[jbest]; auto& values = points.empty() ? m_values : m_c_values[jbest]; int npoint = values.size()/kGroupSize; - //if (points.empty() || points.size()%8 != 0) printf("Oops: %d points in cluster %d\n", int(points.size()), jbest); GGML_ASSERT(npoint > 0 && npoint%8 == 0); int jbest_cluster = jbest; auto vbest = _mm256_set1_ps(INFINITY); @@ -6917,8 +6906,6 @@ void QuantizerIQKT::find_best_match(fl for (int i = 0; i < 4; ++i) { auto vq = _mm256_loadu_ps(m_clusters.data() + kGroupSize*(j+2*i)); auto vdiff = _mm256_sub_ps(vq, vx); - //vdiff = _mm256_mul_ps(vdiff, vdiff); - //sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff)); vdiff = _mm256_and_ps(sign_bit, vdiff); sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, _mm256_mul_ps(vdiff, vdiff))); } @@ -6947,10 +6934,7 @@ void QuantizerIQKT::find_best_match(fl for (int i = 0; i < 4; ++i) { auto vq = _mm256_loadu_ps(values.data() + kGroupSize*(j+2*i)); auto vdiff = _mm256_sub_ps(vq, vx); - //vdiff = _mm256_mul_ps(vdiff, vdiff); sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff)); - //vdiff = _mm256_and_ps(sign_bit, vdiff); - //sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, _mm256_mul_ps(vdiff, vdiff))); } auto score = hsum_float_4x8(sqx); auto mask = _mm256_cmp_ps(score, vbest, _CMP_LT_OQ); @@ -6981,7 +6965,6 @@ template std::vector> QuantizerIQKT::finalize_clusters(int num_neighbours, const std::vector& values, const std::vector& clusters, std::vector>& c_values) { int ncluster = clusters.size()/kGroupSize; - //GGML_ASSERT(ncluster%8 == 0); std::vector> p_in_cluster(ncluster); std::vector which_cluster(num_neighbours*kNumVal); std::vector ibest(num_neighbours); @@ -7167,28 +7150,11 @@ std::vector QuantizerIQKT::clus printf(" %d", l); } printf("\n"); - //GGML_ABORT("fatal error"); } else { for (int k = 0; k < ndim; ++k) result[ic*ndim + k] = sump[ic*ndim + k]/counts[ic]; } } if (nzero > 0) printf("%s: %d out of %d clusters dir not have any points\n", __func__, nzero, ncluster); - //counts.resize(ndim*ncluster); - //auto fcounts = (float *)counts.data(); - //std::memset(fcounts, 0, counts.size()*sizeof(float)); - //for (int ip = 0; ip < npoint; ++ip) { - // auto vp = points.data() + ndim*ip; - // uint8_t u = 0; - // for (int k = 0; k < ndim; ++k) u |= (bin4(vp[k]) << 2*k); - // for (int k = 0; k < ndim; ++k) { - // float w = std::abs(vp[k]); - // sump[ndim*u + k] += w*vp[k]; - // fcounts[ndim*u + k] += w; - // } - //} - //for (int ic = 0; ic < ncluster; ++ic) { - // for (int k = 0; k < ndim; ++k) result[ic*ndim + k] = fcounts[ic*ndim + k] > 0 ? sump[ic*ndim + k]/fcounts[ic*ndim + k] : 0.f; - //} return result; } std::mt19937 rndm(1234); @@ -7370,8 +7336,6 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f *dptr = d; if (!d) return; - //d *= 1.05f; - for (int iloop = 0; iloop < 1; ++iloop) { float sumqx = 0, sumq2 = 0;