iq2_kt - this is better

Using blocks of 32 and 16 bits per group of 8 weights
it beats iq2_xxs in terms of PPL by a significant margin.
It is 0.0625 bpw larger, but even if we go to 15 bits per
group of 8 (so 0.0625 bpw less than iq2_xxs), PPL is still
lower.
This commit is contained in:
Iwan Kawrakow
2024-11-06 20:49:56 +02:00
parent 766fa600c8
commit 36e9c922b8
6 changed files with 388 additions and 127 deletions

View File

@@ -498,7 +498,7 @@ static std::vector<float> cluster_points(const std::vector<float>& points, int n
}
static void analyze_x_v2(const char * name, int nrows, int n_per_row, const float * values, float& tot_mse, float& tot_mse_q, float& tot_elements) {
-constexpr int kNumVal = 1 << 16;
+constexpr int kNumVal = 1 << 15;
constexpr int kBlockSize = 32;
constexpr int kGroupSize = 8;
constexpr int kNg = kBlockSize/kGroupSize;
@@ -508,7 +508,7 @@ static void analyze_x_v2(const char * name, int nrows, int n_per_row, const floa
static std::vector<std::vector<int>> p_in_cluster;
if (codes.empty()) {
codes = make_values(kNumVal, kGroupSize, 31.75f);
-clusters = cluster_points(codes, kGroupSize, kNumVal/1024, 200);
+clusters = cluster_points(codes, kGroupSize, kNumVal/512, 200);
if (clusters.empty()) { printf("Oops\n"); exit(1); }
int ncluster = clusters.size()/kGroupSize;
p_in_cluster.resize(ncluster);
@@ -623,7 +623,7 @@ static void analyze_x_v2(const char * name, int nrows, int n_per_row, const floa
sigma2 /= n_per_row;
for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
auto xb = xr + kBlockSize*ib;
-for (int i = 0; i < kBlockSize; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
+//for (int i = 0; i < kBlockSize; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
float d = find_best_scale(kBlockSize, xb, weight.data(), iq4k_values, 5);
float id = d ? 1/d : 0.f;
#ifdef __AVX2__