Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Much faster CPU prompt processing (part 3) (#534)
* Repack q4_0 and q8_0 to q8_0_R8

  q8_0 is fine, but I observe a very significant PPL increase for q4_0.
  Best guess: precision loss in the 32-bit <-> 16-bit scale conversions.

* Change q8_2_x4 to store int16_t sums

  With that, q4_0 now works. I need to check all quants that use q8_2_x4!

* q5_0, and use a dequantizing template

* q6_0: 129 t/s -> 296 t/s. q6_0_r4 is at 244 t/s.

* iq4_nl: 137 t/s -> 293 t/s. iq4_nl_r4 is at 251 t/s.

* q4_1: 135 t/s -> 262 t/s

* q5_1: 125 t/s -> 253 t/s

* iq4_xs: 178 t/s -> 363 t/s. iq4_xs_r4 is at 275 t/s.

* q2_K: 202 t/s -> 364 t/s. q2_k_r4 is at 247 t/s.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
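Why the int16_t change fixes q4_0: a block sum of int8 quants can never overflow 16 bits. Assuming 32 quants per block (as for Q8_0), |sum| <= 32 * 127 = 4064, which always fits an int16_t exactly, whereas the old scheme stored the product d * sum rounded to bf16, keeping only an 8-bit mantissa. A minimal sketch of the two schemes (illustrative helpers with a truncating fp32->bf16 conversion, not the actual ik_llama.cpp code):

    #include <cstdint>
    #include <cstring>

    // Truncating fp32 -> bf16 round trip (illustrative; ggml's conversion
    // additionally handles rounding and NaNs).
    static uint16_t fp32_to_bf16_bits(float f) {
        uint32_t u; std::memcpy(&u, &f, 4); return uint16_t(u >> 16);
    }
    static float bf16_bits_to_fp32(uint16_t b) {
        uint32_t u = uint32_t(b) << 16; float f; std::memcpy(&f, &u, 4); return f;
    }

    int main() {
        int8_t q[32];                            // one quantized block
        for (int j = 0; j < 32; ++j) q[j] = int8_t(j*7 % 255 - 127);
        int sum = 0;
        for (int j = 0; j < 32; ++j) sum += q[j];

        int16_t s_exact = int16_t(sum);          // always fits: |sum| <= 4064
        float d = 0.0123f;                       // block scale

        // Old scheme: store d * sum rounded to bf16 (8-bit mantissa).
        float s_old = bf16_bits_to_fp32(fp32_to_bf16_bits(d * float(sum)));
        // New scheme: store the exact int16_t sum, multiply by d at use time.
        float s_new = d * float(s_exact);
        // s_old can be off by up to ~0.4% of its magnitude; s_new is fp32-exact.
        return (s_old == s_new) ? 0 : 1;
    }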
@@ -875,14 +875,12 @@ void quantize_row_q8_1_x4_T(const float * x, Block * y, int64_t k) {
             y[i].d = GGML_FP32_TO_FP16(d);
         }
     } else {
+        auto t = GGML_FP32_TO_BF16(d);
+        d = ggml_bf16_to_fp32(t);
         if (i < nb4) {
-            auto t = GGML_FP32_TO_BF16(d);
             y4[i4].d[ir] = t.bits;
-            d = ggml_bf16_to_fp32(t);
         } else {
-            auto t = GGML_FP32_TO_BF16(d);
             y[i].d = t.bits;
-            d = ggml_bf16_to_fp32(t);
         }
     }
     const float id = d > 0 ? 1/d : 0.f;
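In both the old and the new version of this hunk, d is replaced by its bf16-rounded value before id = 1/d is computed, so the int8 quants stay consistent with the scale a dot-product kernel will later reconstruct from the stored bits; the change merely hoists the round trip out of the two branches. A minimal sketch of that pattern, assuming a plain truncating fp32<->bf16 round trip (bf16_round_trip and quantize_block are illustrative names, not ik_llama.cpp functions):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    static float bf16_round_trip(float f) {      // truncating sketch
        uint32_t u; std::memcpy(&u, &f, 4);
        u &= 0xffff0000u;                        // keep only the bf16 bits
        std::memcpy(&f, &u, 4); return f;
    }

    // Quantize one block of 32 floats with the *stored* (rounded) scale.
    static void quantize_block(const float * x, int8_t * q, float & d_stored) {
        float amax = 0.f;
        for (int j = 0; j < 32; ++j) amax = std::max(amax, std::fabs(x[j]));
        float d = bf16_round_trip(amax/127);     // what a kernel will see
        d_stored = d;
        const float id = d > 0 ? 1/d : 0.f;      // inverse of the rounded d
        for (int j = 0; j < 32; ++j) {
            long v = std::lround(x[j]*id);
            q[j] = int8_t(std::max(-128l, std::min(127l, v)));
        }
    }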
@@ -916,9 +914,11 @@ void quantize_row_q8_1_x4_T(const float * x, Block * y, int64_t k) {
             }
         } else {
             if (i < nb4) {
-                y4[i4].d[ir+4] = GGML_FP32_TO_BF16(d * isum).bits;
+                auto i16 = (int16_t *)y4[i4].d;
+                i16[ir+4] = isum;
             } else {
-                y[i].s = GGML_FP32_TO_BF16(d * isum).bits;
+                auto i16 = (int16_t *)&y[i].s;
+                i16[0] = isum;
             }
         }
 
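The second hunk is the actual fix: instead of storing the bf16-rounded product d * isum, the exact int16_t sum is written into the same 2 bytes, and the multiplication by the scale moves into the consuming kernel (hence "I need to check all quants that use q8_2_x4!"). The sum matters for offset quants like q4_0, which dequantize as (q - 8) * d and fold the -8 back in through the activation block sum; q8_0 has no offset and never touches it, which is why only q4_0 showed the PPL increase. A simplified scalar illustration (hypothetical function, not the actual SIMD kernel; sum8 is the exact sum the new code stores):

    #include <cstdint>

    // One q4_0 block (32 weights, scale d4, implicit -8 offset) against one
    // q8 activation block (scale d8, exact int16_t sum sum8):
    float dot_q4_0_q8(const uint8_t * q4, float d4,
                      const int8_t  * q8, float d8, int16_t sum8) {
        int sumi = 0;
        for (int j = 0; j < 16; ++j) {
            sumi += (q4[j] & 0xf) * q8[j];       // low nibbles -> elements 0..15
            sumi += (q4[j] >>  4) * q8[j + 16];  // high nibbles -> elements 16..31
        }
        // sum((q-8)*a) == sum(q*a) - 8*sum(a): the offset is folded in via the
        // activation sum, so any rounding of sum8 goes straight into the result.
        return d4*d8*float(sumi) - 8.f*d4*(d8*float(sum8));
    }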