iq4_xxs: this looks very viable compared to iq4_xs

At the same 4.25 bits per weight (bpw), perplexity (PPL) is always better, and for
some models significantly better. I'll rename to iq4_ks and keep it.
This commit is contained in:
Iwan Kawrakow
2024-10-08 16:07:00 +03:00
parent 1dd6c40c15
commit 975292b6b9
3 changed files with 9 additions and 9 deletions

View File

@@ -442,10 +442,10 @@ typedef struct {
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
typedef struct {
uint8_t scales[QK_K/64];
uint8_t scales[QK_K/32];
uint8_t qs[QK_K/2];
} block_iq4_xxs;
static_assert(sizeof(block_iq4_xxs) == QK_K/64 + QK_K/2, "wrong iq4_xxs block size/padding");
static_assert(sizeof(block_iq4_xxs) == QK_K/32 + QK_K/2, "wrong iq4_xxs block size/padding");
typedef struct {
ggml_half d;

View File

@@ -626,15 +626,15 @@ static __global__ void dequantize_block_iq4_xxs(const void * __restrict__ vx, ds
const int64_t i = ii - (row*n_per_row)/QK_K;
const int64_t tid = threadIdx.x;
const int64_t ib = tid/8; // 0...3
const int64_t il = tid%8; // 0...7
dst_t * y = yy + ii*QK_K + 64*ib + 4*il;
const uint8_t * q4 = x[i].qs + 32*ib + 4*il;
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + ii*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
const float d = scale * ((x[i].scales[ib] & 254) - 127);
const int8_t * values = iq4k_values + ((x[i].scales[ib] & 1) << 4);
for (int j = 0; j < 4; ++j) {
y[j+ 0] = d * values[q4[j] & 0xf];
y[j+32] = d * values[q4[j] >> 4];
y[j+16] = d * values[q4[j] >> 4];
}
}

View File

@@ -2340,7 +2340,7 @@ void quantize_row_iq4_xxs(const float * x, void * y, int64_t k) {
size_t quantize_iq4_xxs(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
//printf("============ %s(%d, %d)\n", __func__, int(nrows), int(n_per_row));
constexpr int kBlockSize = 64; //128;
constexpr int kBlockSize = 32; //128;
GGML_ASSERT(n_per_row%QK_K == 0);
auto row_size = ggml_row_size(GGML_TYPE_IQ4_XXS, n_per_row);
char * qrow = (char *)dst;
@@ -2355,7 +2355,7 @@ size_t quantize_iq4_xxs(const float * src, void * dst, int64_t nrows, int64_t n_
}
void dequantize_row_iq4_xxs(const block_iq4_xxs * x, float * y, int64_t k) {
constexpr int kBlockSize = 64; //128;
constexpr int kBlockSize = 32; //128;
GGML_ASSERT(k%QK_K == 0);
const float * dptr = (const float *)x;
float d = *dptr;