Use bf16 instead of fp16 block scales for q8_1 (#292)

* WIP - not working

* q8_0 without bells and wistles works

* It works for q8_0

* Use bf16 instead of f16,int16

* q4_0_r8

* q5_0_r4

* q6_0_r4

* Also q4_1 and q5_1

* q8_0_r8 on avx2

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-03-27 05:49:16 +01:00
committed by GitHub
parent a22250df93
commit d0b52076da
6 changed files with 348 additions and 255 deletions

View File

@@ -798,13 +798,14 @@ void quantize_row_q8_0_x4(const float * x, void * vy, int64_t k) {
#endif
}
void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) {
namespace {
template <typename Block, typename Block_x4>
void quantize_row_q8_1_x4_T(const float * x, Block * y, int64_t k) {
assert(k % QK8_1 == 0);
const int nb = k / QK8_1;
const int nb4 = 4*(nb/4);
block_q8_1 * y = (block_q8_1 *)vy;
block_q8_1_x4 * y4 = (block_q8_1_x4 *)vy;
Block_x4 * y4 = (Block_x4 *)y;
#if defined(__aarch64__)
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
@@ -851,10 +852,18 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) {
accv = vaddq_s32(accv, vi);
}
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
if constexpr (std::is_same_v<Block, block_q8_1>) {
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
} else {
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
}
} else {
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
if (i < nb4) {
y4[i4].s[ir] = vaddvq_s32(accv);
} else {
y[i].s = vaddvq_s32(accv);
}
}
}
#else
@@ -880,13 +889,25 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) {
const float max_scalar = _mm_cvtss_f32( max4 );
// Quantize these floats
const float d = max_scalar / 127.f;
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
float d = max_scalar / 127.f;
if constexpr (std::is_same_v<Block, block_q8_1>) {
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}
} else {
y[i].d = GGML_FP32_TO_FP16(d);
if (i < nb4) {
auto t = GGML_FP32_TO_BF16(d);
y4[i4].d[ir] = t.bits;
d = ggml_bf16_to_fp32(t);
} else {
auto t = GGML_FP32_TO_BF16(d);
y[i].d = t.bits;
d = ggml_bf16_to_fp32(t);
}
}
const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
const float id = d > 0 ? 1/d : 0.f;
const __m256 mul = _mm256_set1_ps( id );
// Apply the multiplier
@@ -908,10 +929,19 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) {
__m256i i3 = _mm256_cvtps_epi32( v3 );
// Compute the sum of the quants and set y[i].s
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
int isum = hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
if constexpr (std::is_same_v<Block, block_q8_1>) {
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * isum);
} else {
y[i].s = GGML_FP32_TO_FP16(d * isum);
}
} else {
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_BF16(d * isum).bits;
} else {
y[i].s = GGML_FP32_TO_BF16(d * isum).bits;
}
}
// Convert int32 to int16
@@ -934,6 +964,15 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) {
}
#endif
}
}
void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) {
quantize_row_q8_1_x4_T<block_q8_1, block_q8_1_x4>(x, (block_q8_1 *)vy, k);
}
void quantize_row_q8_2_x4(const float * x, void * vy, int64_t k) {
quantize_row_q8_1_x4_T<block_q8_2, block_q8_2_x4>(x, (block_q8_2 *)vy, k);
}
//
// ============================================== iq2_K