mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 03:11:51 +00:00
For now have only iq4_kt use the new trellis
This commit is contained in:
@@ -7397,7 +7397,7 @@ void dequantize_row_ms_i2s(const void * vx, float * y, int64_t k) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
template <int block_size, int group_size, int num_bits, bool is_abs = false>
|
template <int block_size, int group_size, int num_bits, bool is_abs = false, bool is_int = false>
|
||||||
class QuantizerIQKT {
|
class QuantizerIQKT {
|
||||||
static_assert(group_size == 8 || group_size == 4);
|
static_assert(group_size == 8 || group_size == 4);
|
||||||
static_assert(block_size >= 8 && block_size%8 == 0);
|
static_assert(block_size >= 8 && block_size%8 == 0);
|
||||||
@@ -7408,7 +7408,7 @@ public:
|
|||||||
constexpr static int kNg = kBlockSize/kGroupSize;
|
constexpr static int kNg = kBlockSize/kGroupSize;
|
||||||
constexpr static int kNblock = kSuperBlockSize/kBlockSize;
|
constexpr static int kNblock = kSuperBlockSize/kBlockSize;
|
||||||
constexpr static int kNumVal = 1 << num_bits; // i.e, 16 bits per group of 8
|
constexpr static int kNumVal = 1 << num_bits; // i.e, 16 bits per group of 8
|
||||||
constexpr static float kScale = 1.f; //31.75f;
|
constexpr static float kScale = is_int ? 1.f : 31.75f;
|
||||||
constexpr static bool kVerbose = false;
|
constexpr static bool kVerbose = false;
|
||||||
|
|
||||||
QuantizerIQKT(int num_clusters, int num_neighbours, int offset = 4096);
|
QuantizerIQKT(int num_clusters, int num_neighbours, int offset = 4096);
|
||||||
@@ -7421,19 +7421,25 @@ public:
|
|||||||
static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
|
static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
|
||||||
constexpr uint32_t ka = 89226354;
|
constexpr uint32_t ka = 89226354;
|
||||||
constexpr uint32_t kb = 64248484;
|
constexpr uint32_t kb = 64248484;
|
||||||
//constexpr uint32_t kmask = 0x8fff8fff;
|
|
||||||
//constexpr uint32_t km32 = 0x3b603b60;
|
|
||||||
uint32_t x = i + offset;
|
uint32_t x = i + offset;
|
||||||
uint32_t s;
|
if constexpr (is_int) {
|
||||||
auto i8 = (const int8_t *)&s;
|
uint32_t s;
|
||||||
for (int k = 0; k < kGroupSize; ++k) {
|
auto i8 = (const int8_t *)&s;
|
||||||
x = ka*x + kb;
|
for (int k = 0; k < kGroupSize; ++k) {
|
||||||
s = x & 0x3f3f3f3f;
|
x = ka*x + kb;
|
||||||
result[k] = scale*(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
|
s = x & 0x3f3f3f3f;
|
||||||
//uint32_t s = (x & kmask) ^ km32;
|
result[k] = scale*(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
|
||||||
//float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
|
}
|
||||||
//if constexpr (is_abs) result[k] = scale*std::abs(val);
|
} else {
|
||||||
//else result[k] = scale*val;
|
constexpr uint32_t kmask = 0x8fff8fff;
|
||||||
|
constexpr uint32_t km32 = 0x3b603b60;
|
||||||
|
for (int k = 0; k < kGroupSize; ++k) {
|
||||||
|
x = ka*x + kb;
|
||||||
|
uint32_t s = (x & kmask) ^ km32;
|
||||||
|
float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
|
||||||
|
if constexpr (is_abs) result[k] = scale*std::abs(val);
|
||||||
|
else result[k] = scale*val;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -7482,8 +7488,8 @@ private:
|
|||||||
float m_mid[4*kGroupSize];
|
float m_mid[4*kGroupSize];
|
||||||
};
|
};
|
||||||
|
|
||||||
template <int block_size, int group_size, int num_bits, bool is_abs>
|
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
|
||||||
QuantizerIQKT<block_size, group_size, num_bits, is_abs>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
|
QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
|
||||||
m_values.resize(kNumVal*kGroupSize);
|
m_values.resize(kNumVal*kGroupSize);
|
||||||
float * data = m_values.data();
|
float * data = m_values.data();
|
||||||
for (int i = 0; i < kNumVal; ++i) {
|
for (int i = 0; i < kNumVal; ++i) {
|
||||||
@@ -7499,8 +7505,8 @@ QuantizerIQKT<block_size, group_size, num_bits, is_abs>::QuantizerIQKT(int num_c
|
|||||||
m_in_cluster = finalize_clusters(num_neighbours, m_values, m_clusters, m_c_values);
|
m_in_cluster = finalize_clusters(num_neighbours, m_values, m_clusters, m_c_values);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int block_size, int group_size, int num_bits, bool is_abs>
|
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
|
||||||
std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_scale(
|
std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_scale(
|
||||||
const float * xb, const float * weight, const int * best_idx) const {
|
const float * xb, const float * weight, const int * best_idx) const {
|
||||||
float sumqx = 0, sumq2 = 0;
|
float sumqx = 0, sumq2 = 0;
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
@@ -7532,8 +7538,8 @@ std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>:
|
|||||||
return sumq2 > 0 ? std::make_pair(sumqx/sumq2, sumqx*sumqx/sumq2) : std::make_pair(0.f, 0.f);
|
return sumq2 > 0 ? std::make_pair(sumqx/sumq2, sumqx*sumqx/sumq2) : std::make_pair(0.f, 0.f);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int block_size, int group_size, int num_bits, bool is_abs>
|
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
|
||||||
float QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_inverse_scale(
|
float QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_inverse_scale(
|
||||||
const float * xb, const float * weight, const int * best_idx) const {
|
const float * xb, const float * weight, const int * best_idx) const {
|
||||||
float sumqx = 0, sumx2 = 0;
|
float sumqx = 0, sumx2 = 0;
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
@@ -7565,8 +7571,8 @@ float QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_inverse
|
|||||||
return sumx2 > 0 ? sumqx/sumx2 : 0.f;
|
return sumx2 > 0 ? sumqx/sumx2 : 0.f;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int block_size, int group_size, int num_bits, bool is_abs>
|
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
|
||||||
void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const {
|
void QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const {
|
||||||
if (!d) {
|
if (!d) {
|
||||||
std::memset(best_idx, 0, kNg*sizeof(int));
|
std::memset(best_idx, 0, kNg*sizeof(int));
|
||||||
return;
|
return;
|
||||||
@@ -7744,8 +7750,8 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int block_size, int group_size, int num_bits, bool is_abs>
|
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
|
||||||
std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::finalize_clusters(int num_neighbours,
|
std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::finalize_clusters(int num_neighbours,
|
||||||
const std::vector<float>& values, const std::vector<float>& clusters, std::vector<std::vector<float>>& c_values) {
|
const std::vector<float>& values, const std::vector<float>& clusters, std::vector<std::vector<float>>& c_values) {
|
||||||
int ncluster = clusters.size()/kGroupSize;
|
int ncluster = clusters.size()/kGroupSize;
|
||||||
std::vector<std::vector<int>> p_in_cluster(ncluster);
|
std::vector<std::vector<int>> p_in_cluster(ncluster);
|
||||||
@@ -7831,8 +7837,8 @@ std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is
|
|||||||
return p_in_cluster;
|
return p_in_cluster;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int block_size, int group_size, int num_bits, bool is_abs>
|
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
|
||||||
std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::cluster_points(const std::vector<float>& points, int ncluster, int niter, float * mid) {
|
std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::cluster_points(const std::vector<float>& points, int ncluster, int niter, float * mid) {
|
||||||
constexpr int ndim = kGroupSize;
|
constexpr int ndim = kGroupSize;
|
||||||
GGML_ASSERT(points.size() % ndim == 0);
|
GGML_ASSERT(points.size() % ndim == 0);
|
||||||
int npoint = points.size() / ndim;
|
int npoint = points.size() / ndim;
|
||||||
@@ -8526,7 +8532,7 @@ void vec_dot_iq3_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
|
|||||||
|
|
||||||
namespace{
|
namespace{
|
||||||
|
|
||||||
using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15>;
|
using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15, false, true>;
|
||||||
|
|
||||||
const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
|
const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
|
||||||
static std::mutex mutex;
|
static std::mutex mutex;
|
||||||
|
|||||||
Reference in New Issue
Block a user