mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-29 10:51:51 +00:00
cleanup
This commit is contained in:
@@ -349,11 +349,6 @@ float __device__ __forceinline__ trellis_next(uint32_t& val) {
|
||||
const half * h = (const half *)&s;
|
||||
val = ka*val + kb;
|
||||
s = (val & kmask) ^ km32;
|
||||
//float r = (float)(h[0] +h[1]);
|
||||
//val = ka*val + kb;
|
||||
//s = (val & kmask) ^ km32;
|
||||
//r += (float)(h[0]+h[1]);
|
||||
//return r;
|
||||
return (float)(h[0]+h[1]);
|
||||
}
|
||||
|
||||
@@ -400,30 +395,6 @@ static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst
|
||||
}
|
||||
}
|
||||
|
||||
//template<typename dst_t>
|
||||
//static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
|
||||
//
|
||||
// int64_t ii = blockIdx.x;
|
||||
// int64_t row = (QK_K * ii) / n_per_row;
|
||||
// const float * dptr = (const float *)((const char *)vx + row * row_size);
|
||||
// float scale = dptr[0];
|
||||
// float alpha = dptr[1];
|
||||
// const block_iq3_kt * x = (const block_iq3_kt *)(dptr + 2);
|
||||
// const int64_t i = ii - (row*n_per_row)/QK_K;
|
||||
//
|
||||
// const int64_t tid = threadIdx.x;
|
||||
// const int64_t ib = tid; // 0...31
|
||||
// dst_t * y = yy + ii*QK_K + 8*ib;
|
||||
// const uint16_t * ql = (const uint16_t *)x[i].ql;
|
||||
// uint32_t idx = ql[ib] + 4096;
|
||||
// const float dl = scale * ((x[i].scales[(ib/4)%4] >> 4*(ib/16)) & 0xf) * 31.75f * 1.01f; //1.015f;
|
||||
// uint8_t mask = 1 << (ib/4);
|
||||
// for (int j = 0; j < 8; ++j) {
|
||||
// float ay = std::abs(trellis_next(idx));
|
||||
// y[j] = dl * ay/(1 - alpha*ay) * (x[i].qh[(8*ib+j)%32] & mask ? -1.f : 1.f);
|
||||
// }
|
||||
//}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
|
||||
|
||||
|
||||
@@ -41,30 +41,6 @@ static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& v
|
||||
#endif
|
||||
}
|
||||
|
||||
//static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
|
||||
// const half * h = (const half *)s;
|
||||
// s[0] = trellis_next(val1);
|
||||
// s[1] = trellis_next(val1);
|
||||
// s[2] = trellis_next(val1);
|
||||
// s[3] = trellis_next(val1);
|
||||
//#ifdef GGML_CUDA_F16
|
||||
// bdot1 = __hfma2(y[ 0], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot1);
|
||||
//#else
|
||||
// bdot1.x += y[ 0].x * (float)(h[0] + h[1] + h[2] + h[3]);
|
||||
// bdot1.y += y[ 0].y * (float)(h[4] + h[5] + h[6] + h[7]);
|
||||
//#endif
|
||||
// s[0] = trellis_next(val2);
|
||||
// s[1] = trellis_next(val2);
|
||||
// s[2] = trellis_next(val2);
|
||||
// s[3] = trellis_next(val2);
|
||||
//#ifdef GGML_CUDA_F16
|
||||
// bdot2 = __hfma2(y[64], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot2);
|
||||
//#else
|
||||
// bdot2.x += y[64].x * (float)(h[0] + h[1] + h[2] + h[3]);
|
||||
// bdot2.y += y[64].y * (float)(h[4] + h[5] + h[6] + h[7]);
|
||||
//#endif
|
||||
//}
|
||||
|
||||
static __device__ __forceinline__ void trellis_accum_abs(uint8_t signs1, uint8_t signs2, uint8_t mask1, uint8_t mask2,
|
||||
uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
|
||||
const half * h = (const half *)s;
|
||||
@@ -77,8 +53,6 @@ static __device__ __forceinline__ void trellis_accum_abs(uint8_t signs1, uint8_t
|
||||
half h10 = __habs(h[4]+h[5]), h11 = __habs(h[6]+h[7]);
|
||||
half2 h1 = {signs1 & mask1 ? -h00 : h00, signs2 & mask1 ? -h01 : h01};
|
||||
half2 h2 = {signs1 & mask2 ? -h10 : h10, signs2 & mask2 ? -h11 : h11};
|
||||
//half2 h1 = __hmul2(__habs2({h[0]+h[1], h[2]+h[3]}), {signs1 & mask1 ? -1 : 1, signs2 & mask1 ? -1 : 1});
|
||||
//half2 h2 = __hmul2(__habs2({h[4]+h[5], h[6]+h[7]}), {signs1 & mask2 ? -1 : 1, signs2 & mask2 ? -1 : 1});
|
||||
bdot1 = __hfma2(y[ 0], h1, bdot1);
|
||||
bdot2 = __hfma2(y[64], h2, bdot2);
|
||||
#else
|
||||
|
||||
@@ -446,9 +446,6 @@ void ggml_cuda_op_mul_mat_vec_q(
|
||||
case GGML_TYPE_IQ2_KS:
|
||||
mul_mat_vec_iq2_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
//case GGML_TYPE_IQ2_KT:
|
||||
// mul_mat_vec_iq2_kt_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
// break;
|
||||
case GGML_TYPE_IQ5_K:
|
||||
mul_mat_vec_iq5_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
|
||||
@@ -6653,16 +6653,6 @@ public:
|
||||
if constexpr (is_abs) result[k] = scale*std::abs(val);
|
||||
else result[k] = scale*val;
|
||||
}
|
||||
//for (int k = 0; k < kGroupSize; ++k) {
|
||||
// x = ka*x + kb;
|
||||
// uint32_t s = (x & kmask) ^ km32;
|
||||
// float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
|
||||
// x = ka*x + kb;
|
||||
// s = (x & kmask) ^ km32;
|
||||
// val += GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
|
||||
// if constexpr (is_abs) result[k] = scale*std::abs(0.5f*val);
|
||||
// else result[k] = 0.5f*scale*val;
|
||||
//}
|
||||
}
|
||||
|
||||
static inline int bin4(float x) {
|
||||
@@ -6851,7 +6841,6 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
|
||||
auto& points = m_in_cluster[jbest];
|
||||
auto& values = points.empty() ? m_values : m_c_values[jbest];
|
||||
int npoint = values.size()/kGroupSize;
|
||||
//if (points.empty() || points.size()%8 != 0) printf("Oops: %d points in cluster %d\n", int(points.size()), jbest);
|
||||
GGML_ASSERT(npoint > 0 && npoint%8 == 0);
|
||||
int jbest_cluster = jbest;
|
||||
auto vbest = _mm256_set1_ps(INFINITY);
|
||||
@@ -6917,8 +6906,6 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
auto vq = _mm256_loadu_ps(m_clusters.data() + kGroupSize*(j+2*i));
|
||||
auto vdiff = _mm256_sub_ps(vq, vx);
|
||||
//vdiff = _mm256_mul_ps(vdiff, vdiff);
|
||||
//sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff));
|
||||
vdiff = _mm256_and_ps(sign_bit, vdiff);
|
||||
sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, _mm256_mul_ps(vdiff, vdiff)));
|
||||
}
|
||||
@@ -6947,10 +6934,7 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
auto vq = _mm256_loadu_ps(values.data() + kGroupSize*(j+2*i));
|
||||
auto vdiff = _mm256_sub_ps(vq, vx);
|
||||
//vdiff = _mm256_mul_ps(vdiff, vdiff);
|
||||
sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff));
|
||||
//vdiff = _mm256_and_ps(sign_bit, vdiff);
|
||||
//sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, _mm256_mul_ps(vdiff, vdiff)));
|
||||
}
|
||||
auto score = hsum_float_4x8(sqx);
|
||||
auto mask = _mm256_cmp_ps(score, vbest, _CMP_LT_OQ);
|
||||
@@ -6981,7 +6965,6 @@ template <int block_size, int group_size, int num_bits, bool is_abs>
|
||||
std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::finalize_clusters(int num_neighbours,
|
||||
const std::vector<float>& values, const std::vector<float>& clusters, std::vector<std::vector<float>>& c_values) {
|
||||
int ncluster = clusters.size()/kGroupSize;
|
||||
//GGML_ASSERT(ncluster%8 == 0);
|
||||
std::vector<std::vector<int>> p_in_cluster(ncluster);
|
||||
std::vector<int> which_cluster(num_neighbours*kNumVal);
|
||||
std::vector<int> ibest(num_neighbours);
|
||||
@@ -7167,28 +7150,11 @@ std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::clus
|
||||
printf(" %d", l);
|
||||
}
|
||||
printf("\n");
|
||||
//GGML_ABORT("fatal error");
|
||||
} else {
|
||||
for (int k = 0; k < ndim; ++k) result[ic*ndim + k] = sump[ic*ndim + k]/counts[ic];
|
||||
}
|
||||
}
|
||||
if (nzero > 0) printf("%s: %d out of %d clusters dir not have any points\n", __func__, nzero, ncluster);
|
||||
//counts.resize(ndim*ncluster);
|
||||
//auto fcounts = (float *)counts.data();
|
||||
//std::memset(fcounts, 0, counts.size()*sizeof(float));
|
||||
//for (int ip = 0; ip < npoint; ++ip) {
|
||||
// auto vp = points.data() + ndim*ip;
|
||||
// uint8_t u = 0;
|
||||
// for (int k = 0; k < ndim; ++k) u |= (bin4(vp[k]) << 2*k);
|
||||
// for (int k = 0; k < ndim; ++k) {
|
||||
// float w = std::abs(vp[k]);
|
||||
// sump[ndim*u + k] += w*vp[k];
|
||||
// fcounts[ndim*u + k] += w;
|
||||
// }
|
||||
//}
|
||||
//for (int ic = 0; ic < ncluster; ++ic) {
|
||||
// for (int k = 0; k < ndim; ++k) result[ic*ndim + k] = fcounts[ic*ndim + k] > 0 ? sump[ic*ndim + k]/fcounts[ic*ndim + k] : 0.f;
|
||||
//}
|
||||
return result;
|
||||
}
|
||||
std::mt19937 rndm(1234);
|
||||
@@ -7370,8 +7336,6 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
|
||||
*dptr = d;
|
||||
if (!d) return;
|
||||
|
||||
//d *= 1.05f;
|
||||
|
||||
for (int iloop = 0; iloop < 1; ++iloop) {
|
||||
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
|
||||
Reference in New Issue
Block a user