cleanup

2026-03-03 10:30:27 +00:00 · 2025-05-20 06:29:12 +00:00
parent 922b22f1e9
commit d5eb74d719
5 changed files with 0 additions and 230 deletions
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -258,45 +258,8 @@ static inline int nearest_int(float fval) {
    return (i & 0x007fffff) - 0x00400000;
 }

-//static void fast_ht(int n, float * values) {
-//    constexpr float ksqrt2 = 0.707106781f;
-//    float scale = 1;
-//    int h = 1;
-//    while (h < n) {
-//        for (int i = 0; i < n; i += 2*h) {
-//            for (int j = i; j < i + h; ++j) {
-//                float x = values[j], y = values[j + h];
-//                values[j+0] = x + y;
-//                values[j+h] = x - y;
-//            }
-//        }
-//        h *= 2;
-//        scale *= ksqrt2;
-//    }
-//    for (int i = 0; i < n; ++i) values[i] *= scale;
-//}
-
 static const int8_t scale_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

-//static std::vector<float> make_values(int nval, int n_per_val) {
-//    GGML_ASSERT(n_per_val%4 == 0);
-//    std::vector<float> result(nval*n_per_val);
-//    const uint32_t a = 89226354, b = 64248484;
-//    float * data = result.data();
-//    uint32_t aux32;
-//    const uint8_t * q = (const uint8_t *)&aux32;
-//    for (int i = 0; i < nval; ++i) {
-//        uint32_t x = i + 32767;
-//        for (int k = 0; k < n_per_val/4; ++k) {
-//            x = a*x + b;
-//            aux32 = x & 0x0f0f0f0f;
-//            for (int l = 0; l < 4; ++l) data[4*k+l] = scale_values[q[l]];
-//        }
-//        data += n_per_val;
-//    }
-//    return result;
-//}
-
 static std::vector<float> make_values(int nval, int n_per_val, float scale = 16.f) {
    std::vector<float> result(nval*n_per_val);
    uint16_t m16 = ggml_fp32_to_fp16(0.922f);
@@ -317,22 +280,6 @@ static std::vector<float> make_values(int nval, int n_per_val, float scale = 16.
    return result;
 }

-//static std::vector<float> make_values(int nval, int n_per_val) {
-//    std::vector<float> result(nval*n_per_val);
-//    const uint32_t a = 34038481, b = 76625530;
-//    float * data = result.data();
-//    for (int i = 0; i < nval; ++i) {
-//        uint32_t x = i + 4096;
-//        for (int k = 0; k < n_per_val; ++k) {
-//            x = a*x + b;
-//            uint32_t s = (x & 255) + ((x >> 8) & 255) + ((x >> 16) & 255) + ((x >> 24) & 255);
-//            data[k] = (s - 510.f)/147.8f;
-//        }
-//        data += n_per_val;
-//    }
-//    return result;
-//}
-
 #ifdef __AVX2__
 static inline float hsum_float_4(__m128 x) {
    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
@@ -684,42 +631,11 @@ static void analyze_x_v2(const char * name, int nrows, int n_per_row, const floa
                        for (int i = 0; i < 8; ++i) {
                            if (sx[i] < best) { best = sx[i]; jbest = index[i]; }
                        }
-                        //int jbest_cluster = jbest;
-                        //best = INFINITY; jbest = -1;
-                        //for (auto ip : points) {
-                        //    auto vc = codes.data() + ip*kGroupSize;
-                        //    float diff2 = 0;
-                        //    for (int k = 0; k < kGroupSize; ++k) {
-                        //        float delta = d*vc[k] - xl[k];
-                        //        diff2 += wl[k]*delta*delta;
-                        //    }
-                        //    if (diff2 < best) {
-                        //        best = diff2; jbest = ip;
-                        //    }
-                        //}
                        if (jbest < 0) {
                            printf("Oops: jbest = %d for cluster %d with %d points\n", jbest, jbest_cluster, int(points.size()));
                            GGML_ASSERT(false);
                        }
                        GGML_ASSERT(jbest >= 0);
-                        //for (int j = 0; j < kNumVal; j += 8) {
-                        //    auto idx = _mm256_add_epi32(_mm256_set1_epi32(j), add_idx);
-                        //    for (int i = 0; i < 8; ++i) {
-                        //        auto vq = _mm256_loadu_ps(codes.data() + kGroupSize*(j+i));
-                        //        auto vdiff = _mm256_sub_ps(vq, vx);
-                        //        sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff));
-                        //    }
-                        //    auto score = hsum_float_8x8(sqx);
-                        //    auto mask  = _mm256_cmp_ps(score, vbest, _CMP_LT_OQ);
-                        //    best_index = _mm256_or_si256(_mm256_and_si256(_mm256_castps_si256(mask), idx),
-                        //                              _mm256_andnot_si256(_mm256_castps_si256(mask), best_index));
-                        //    vbest = _mm256_min_ps(vbest, score);
-                        //}
-                        //_mm256_store_ps(sx, vbest);
-                        //_mm256_store_si256((__m256i *)index, best_index);
-                        //for (int i = 0; i < 8; ++i) {
-                        //    if (sx[i] < best) { best = sx[i]; jbest = index[i]; }
-                        //}
                        best_idx[ib*kNg + l] = jbest;
                    }
                    auto vqx = _mm256_setzero_ps();
@@ -798,7 +714,6 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
        float lmse = 0, lmse_q = 0;
        std::vector<float> scales(n_per_row/kBlockSize);
        std::vector<int> best_idx(n_per_row/kBlockSize);
-        //float xtmp[kBlockSize];
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int first = counter; counter += chunk;
@@ -820,11 +735,8 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
                for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
                    float best = 0, d = 0; int jbest = -1;
                    auto xb = xr + kBlockSize*ib;
-                    //std::memcpy(xtmp, xb, kBlockSize*sizeof(float));
-                    //fast_ht(kBlockSize, xtmp);
 #ifdef __AVX2__
                    for (int l = 0; l < kBlockSize/8; ++l) {
-                        //vx[l] = _mm256_loadu_ps(xtmp+8*l);
                        vx[l] = _mm256_loadu_ps(xb+8*l);
                    }
                    auto vbest = _mm256_set1_ps(0.f);
@@ -853,7 +765,6 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
                    auto qv = codes.data() + kBlockSize*jbest;
                    float sumqx = 0;
                    for (int k = 0; k < kBlockSize; ++k) sumqx += xb[k]*qv[k];
-                    //for (int k = 0; k < kBlockSize; ++k) sumqx += xtmp[k]*qv[k];
                    d = sumqx*sumq2i[jbest];
 #else
                    for (int j = 0; j < kNumVal; ++j) {
@@ -871,7 +782,6 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
                    best_idx[ib] = jbest;
                    for (int k = 0; k < kBlockSize; ++k) {
                        float diff = xb[k] - d*qv[k];
-                        //float diff = xtmp[k] - d*qv[k];
                        lmse += diff*diff;
                    }
                }
@@ -890,45 +800,12 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
                    int ls = best_index_scale(scale_values, id*scales[ib]);
                    float dl = d * scale_values[ls];
                    auto xb = xr + kBlockSize*ib;
-                    //std::memcpy(xtmp, xb, kBlockSize*sizeof(float));
-                    //fast_ht(kBlockSize, xtmp);
                    auto qv = codes.data() + kBlockSize*best_idx[ib];
                    for (int k = 0; k < kBlockSize; ++k) {
                        float diff = xb[k] - dl*qv[k];
-                        //float diff = xtmp[k] - dl*qv[k];
                        lmse_q += diff*diff;
                    }
                }
-                //for (int ibl = 0; ibl < n_per_row/kSuperBlockSize; ++ibl) {
-                //    auto sb = scales.data() + ibl*(kSuperBlockSize/kBlockSize);
-                //    auto idx = best_idx.data() + ibl*(kSuperBlockSize/kBlockSize);
-                //    auto xbl = xr + ibl*kSuperBlockSize;
-                //    float amax_scale = 0, max_scale = 0;
-                //    for (int ib = 0; ib < kSuperBlockSize/kBlockSize; ++ib) {
-                //        float ax = std::abs(sb[ib]);
-                //        if (ax > amax_scale) {
-                //            amax_scale = ax; max_scale = sb[ib];
-                //        }
-                //        //amax_scale = std::max(amax_scale, std::abs(sb[ib]));
-                //    }
-                //    float d = max_scale/scale_values[0];
-                //    float id = d ? 1/d : 0.f;
-                //    //float id = amax_scale > 0 ? 15/amax_scale : 0;
-                //    //float d = amax_scale/15;
-                //    for (int ib = 0; ib < kSuperBlockSize/kBlockSize; ++ib) {
-                //        int ls = best_index_scale(scale_values, id*sb[ib]);
-                //        float dl = d * scale_values[ls];
-                //        //int ls = nearest_int(0.5f*(id*sb[ib]+15));
-                //        //ls = std::max(0, std::min(ls, 15));
-                //        //float dl = d*(2*ls - 15);
-                //        auto xb = xbl + kBlockSize*ib;
-                //        auto qv = codes.data() + kBlockSize*idx[ib];
-                //        for (int k = 0; k < kBlockSize; ++k) {
-                //            float diff = xb[k] - dl*qv[k];
-                //            lmse_q += diff*diff;
-                //        }
-                //    }
-                //}
            }
        }
    };
@@ -992,17 +869,6 @@ static void analyze_iq4ks(const char * name, int nrows, int n_per_row, const flo
                                lmse += diff4;
                            } else {
                                float best = std::numeric_limits<float>::max();
-                                //for (int k = 0; k < 16; k += 4) {
-                                //    uint16_t v = v0 ^ (1 << k);
-                                //    uint8_t v1 = v;
-                                //    uint8_t v2 = v >> 8;
-                                //    diff1 = xb[j+ 0] - dl*values[v1 & 0xf];
-                                //    diff2 = xb[j+16] - dl*values[v1 >>  4];
-                                //    diff3 = xb[j+ 1] - dl*values[v2 & 0xf];
-                                //    diff4 = xb[j+17] - dl*values[v2 >>  4];
-                                //    float score = diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
-                                //    if (score < best) best = score;
-                                //}
                                for (int k = 0; k < 4; ++k) {
                                    uint16_t v = (v0 >> 4*k) & 0xf;
                                    auto pc = popcount(v);
@@ -1040,7 +906,6 @@ static void analyze_iq4ks(const ggml_tensor * t, float& tot_mse, float& tot_mse_
        return;
    }
    if (t->type == GGML_TYPE_F32) {
-        //analyze_iq4ks(t->name, t->ne[1], t->ne[0], (const float *)t->data, tot_mse, tot_elements);
        analyze_x_v2(t->name, t->ne[1], t->ne[0], (const float *)t->data, tot_mse, tot_mse_q, tot_elements);
    } else {
        std::vector<float> aux(t->ne[0]*t->ne[1]);
@@ -1049,7 +914,6 @@ static void analyze_iq4ks(const ggml_tensor * t, float& tot_mse, float& tot_mse_
        } else {
            ggml_bf16_to_fp32_row((const ggml_bf16_t *)t->data, aux.data(), aux.size());
        }
-        //analyze_iq4ks(t->name, t->ne[1], t->ne[0], aux.data(), tot_mse, tot_elements);
        analyze_x_v2(t->name, t->ne[1], t->ne[0], aux.data(), tot_mse, tot_mse_q, tot_elements);
    }
 }
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -349,11 +349,6 @@ float __device__ __forceinline__ trellis_next(uint32_t& val) {
    const half * h = (const half *)&s;
    val = ka*val + kb;
    s = (val & kmask) ^ km32;
-    //float r = (float)(h[0] +h[1]);
-    //val = ka*val + kb;
-    //s = (val & kmask) ^ km32;
-    //r += (float)(h[0]+h[1]);
-    //return r;
    return (float)(h[0]+h[1]);
 }

@@ -400,30 +395,6 @@ static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst
    }
 }

-//template<typename dst_t>
-//static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
-//
-//    int64_t ii  = blockIdx.x;
-//    int64_t row = (QK_K * ii) / n_per_row;
-//    const float * dptr = (const float *)((const char *)vx + row * row_size);
-//    float scale = dptr[0];
-//    float alpha = dptr[1];
-//    const block_iq3_kt * x = (const block_iq3_kt *)(dptr + 2);
-//    const int64_t i = ii - (row*n_per_row)/QK_K;
-//
-//    const int64_t tid = threadIdx.x;
-//    const int64_t ib = tid; // 0...31
-//    dst_t * y = yy + ii*QK_K + 8*ib;
-//    const uint16_t * ql = (const uint16_t *)x[i].ql;
-//    uint32_t idx = ql[ib] + 4096;
-//    const float dl = scale * ((x[i].scales[(ib/4)%4] >> 4*(ib/16)) & 0xf) * 31.75f * 1.01f; //1.015f;
-//    uint8_t mask = 1 << (ib/4);
-//    for (int j = 0; j < 8; ++j) {
-//        float ay = std::abs(trellis_next(idx));
-//        y[j] = dl * ay/(1 - alpha*ay) * (x[i].qh[(8*ib+j)%32] & mask ? -1.f : 1.f);
-//    }
-//}
-
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {

--- a/ggml/src/ggml-cuda/dmmv.cu
+++ b/ggml/src/ggml-cuda/dmmv.cu
@@ -41,30 +41,6 @@ static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& v
 #endif
 }

-//static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
-//    const half * h = (const half *)s;
-//    s[0] = trellis_next(val1);
-//    s[1] = trellis_next(val1);
-//    s[2] = trellis_next(val1);
-//    s[3] = trellis_next(val1);
-//#ifdef GGML_CUDA_F16
-//    bdot1 = __hfma2(y[ 0], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot1);
-//#else
-//    bdot1.x += y[ 0].x * (float)(h[0] + h[1] + h[2] + h[3]);
-//    bdot1.y += y[ 0].y * (float)(h[4] + h[5] + h[6] + h[7]);
-//#endif
-//    s[0] = trellis_next(val2);
-//    s[1] = trellis_next(val2);
-//    s[2] = trellis_next(val2);
-//    s[3] = trellis_next(val2);
-//#ifdef GGML_CUDA_F16
-//    bdot2 = __hfma2(y[64], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot2);
-//#else
-//    bdot2.x += y[64].x * (float)(h[0] + h[1] + h[2] + h[3]);
-//    bdot2.y += y[64].y * (float)(h[4] + h[5] + h[6] + h[7]);
-//#endif
-//}
-
 static __device__ __forceinline__ void trellis_accum_abs(uint8_t signs1, uint8_t signs2, uint8_t mask1, uint8_t mask2,
        uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
    const half * h = (const half *)s;
@@ -77,8 +53,6 @@ static __device__ __forceinline__ void trellis_accum_abs(uint8_t signs1, uint8_t
    half h10 = __habs(h[4]+h[5]), h11 = __habs(h[6]+h[7]);
    half2 h1 = {signs1 & mask1 ? -h00 : h00, signs2 & mask1 ? -h01 : h01};
    half2 h2 = {signs1 & mask2 ? -h10 : h10, signs2 & mask2 ? -h11 : h11};
-    //half2 h1 = __hmul2(__habs2({h[0]+h[1], h[2]+h[3]}), {signs1 & mask1 ? -1 : 1, signs2 & mask1 ? -1 : 1});
-    //half2 h2 = __hmul2(__habs2({h[4]+h[5], h[6]+h[7]}), {signs1 & mask2 ? -1 : 1, signs2 & mask2 ? -1 : 1});
    bdot1 = __hfma2(y[ 0], h1, bdot1);
    bdot2 = __hfma2(y[64], h2, bdot2);
 #else
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -446,9 +446,6 @@ void ggml_cuda_op_mul_mat_vec_q(
        case GGML_TYPE_IQ2_KS:
            mul_mat_vec_iq2_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
-        //case GGML_TYPE_IQ2_KT:
-        //    mul_mat_vec_iq2_kt_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-        //    break;
        case GGML_TYPE_IQ5_K:
            mul_mat_vec_iq5_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -6653,16 +6653,6 @@ public:
            if constexpr (is_abs) result[k] = scale*std::abs(val);
            else result[k] = scale*val;
        }
-        //for (int k = 0; k < kGroupSize; ++k) {
-        //    x = ka*x + kb;
-        //    uint32_t s = (x & kmask) ^ km32;
-        //    float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
-        //    x = ka*x + kb;
-        //    s = (x & kmask) ^ km32;
-        //    val += GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
-        //    if constexpr (is_abs) result[k] = scale*std::abs(0.5f*val);
-        //    else result[k] = 0.5f*scale*val;
-        //}
    }

    static inline int bin4(float x) {
@@ -6851,7 +6841,6 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
            auto& points = m_in_cluster[jbest];
            auto& values = points.empty() ? m_values : m_c_values[jbest];
            int npoint = values.size()/kGroupSize;
-            //if (points.empty() || points.size()%8 != 0) printf("Oops: %d points in cluster %d\n", int(points.size()), jbest);
            GGML_ASSERT(npoint > 0 && npoint%8 == 0);
            int jbest_cluster = jbest;
            auto vbest = _mm256_set1_ps(INFINITY);
@@ -6917,8 +6906,6 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
                    for (int i = 0; i < 4; ++i) {
                        auto vq = _mm256_loadu_ps(m_clusters.data() + kGroupSize*(j+2*i));
                        auto vdiff = _mm256_sub_ps(vq, vx);
-                        //vdiff = _mm256_mul_ps(vdiff, vdiff);
-                        //sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff));
                        vdiff = _mm256_and_ps(sign_bit, vdiff);
                        sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, _mm256_mul_ps(vdiff, vdiff)));
                    }
@@ -6947,10 +6934,7 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
                for (int i = 0; i < 4; ++i) {
                    auto vq = _mm256_loadu_ps(values.data() + kGroupSize*(j+2*i));
                    auto vdiff = _mm256_sub_ps(vq, vx);
-                    //vdiff = _mm256_mul_ps(vdiff, vdiff);
                    sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff));
-                    //vdiff = _mm256_and_ps(sign_bit, vdiff);
-                    //sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, _mm256_mul_ps(vdiff, vdiff)));
                }
                auto score = hsum_float_4x8(sqx);
                auto mask  = _mm256_cmp_ps(score, vbest, _CMP_LT_OQ);
@@ -6981,7 +6965,6 @@ template <int block_size, int group_size, int num_bits, bool is_abs>
 std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::finalize_clusters(int num_neighbours,
        const std::vector<float>& values, const std::vector<float>& clusters, std::vector<std::vector<float>>& c_values) {
    int ncluster = clusters.size()/kGroupSize;
-    //GGML_ASSERT(ncluster%8 == 0);
    std::vector<std::vector<int>> p_in_cluster(ncluster);
    std::vector<int> which_cluster(num_neighbours*kNumVal);
    std::vector<int> ibest(num_neighbours);
@@ -7167,28 +7150,11 @@ std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::clus
                    printf(" %d", l);
                }
                printf("\n");
-                //GGML_ABORT("fatal error");
            } else {
                for (int k = 0; k < ndim; ++k) result[ic*ndim + k] = sump[ic*ndim + k]/counts[ic];
            }
        }
        if (nzero > 0) printf("%s: %d out of %d clusters dir not have any points\n", __func__, nzero, ncluster);
-        //counts.resize(ndim*ncluster);
-        //auto fcounts = (float *)counts.data();
-        //std::memset(fcounts, 0, counts.size()*sizeof(float));
-        //for (int ip = 0; ip < npoint; ++ip) {
-        //    auto vp = points.data() + ndim*ip;
-        //    uint8_t u = 0;
-        //    for (int k = 0; k < ndim; ++k) u |= (bin4(vp[k]) << 2*k);
-        //    for (int k = 0; k < ndim; ++k) {
-        //        float w = std::abs(vp[k]);
-        //        sump[ndim*u + k] += w*vp[k];
-        //        fcounts[ndim*u + k] += w;
-        //    }
-        //}
-        //for (int ic = 0; ic < ncluster; ++ic) {
-        //    for (int k = 0; k < ndim; ++k) result[ic*ndim + k] = fcounts[ic*ndim + k] > 0 ? sump[ic*ndim + k]/fcounts[ic*ndim + k] : 0.f;
-        //}
        return result;
    }
    std::mt19937 rndm(1234);
@@ -7370,8 +7336,6 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
    *dptr = d;
    if (!d) return;

-    //d *= 1.05f;
-
    for (int iloop = 0; iloop < 1; ++iloop) {

        float sumqx = 0, sumq2 = 0;