mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-24 07:04:11 +00:00
q6_0
Throughput goes from 74.2 t/s to 128.8 t/s; for comparison, q6_0_r4 is at 107.2 t/s.
This commit is contained in:
@@ -2807,6 +2807,23 @@ struct DeqQ50 {
|
||||
const uint8x16_t mh = vdupq_n_u8(0xf0);
|
||||
};
|
||||
|
||||
struct DeqQ60 {
|
||||
|
||||
inline int8x16x2_t dequant(const block_q6_0& x) const {
|
||||
int8x16x2_t r;
|
||||
bits.prepare1(x.qs, r.val);
|
||||
auto qh8 = vld1_u8(x.qh);
|
||||
auto qh = vcombine_u8(vshl_n_u8(qh8, 4), qh8);
|
||||
r.val[0] = vaddq_s8(vorrq_u8(r.val[0], vandq_u8(qh, hmask)), m32);
|
||||
r.val[1] = vaddq_s8(vorrq_u8(r.val[1], vandq_u8(vshrq_n_u8(qh, 2), hmask)), m32);
|
||||
return r;
|
||||
}
|
||||
|
||||
Q4LegacyBits bits;
|
||||
const int8x16_t m32 = vdupq_n_s8(-32);
|
||||
const uint8x16_t hmask = vdupq_n_u8(0x30);
|
||||
};
|
||||
|
||||
template <typename Block, typename Dequantizer>
|
||||
void iqk_convert_qX_q80_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
GGML_ASSERT(n%QK4_0 == 0);
|
||||
@@ -2849,7 +2866,7 @@ bool iqk_convert_legacy_quants_q8_r8(int type, int n, const void * vx, size_t bx
|
||||
// case GGML_TYPE_Q4_1 : iqk_convert_qX_1_q8_1_r8<block_q4_1, Q4_1_Dequantizer>(n, vx, bx, vy, nrc_x); break;
|
||||
case GGML_TYPE_Q5_0 : iqk_convert_qX_q80_r8<block_q5_0, DeqQ50>(n, vx, bx, vy, nrc_x); break;
|
||||
// case GGML_TYPE_Q5_1 : iqk_convert_qX_1_q8_1_r8<block_q5_1, Q5_1_Dequantizer<block_q5_1>>(n, vx, bx, vy, nrc_x); break;
|
||||
// case GGML_TYPE_Q6_0 : iqk_convert_qX_q80_r8<block_q6_0, Q6_0_Dequantizer>(n, vx, bx, vy, nrc_x); break;
|
||||
case GGML_TYPE_Q6_0 : iqk_convert_qX_q80_r8<block_q6_0, DeqQ60>(n, vx, bx, vy, nrc_x); break;
|
||||
// case GGML_TYPE_IQ4_NL: iqk_convert_qX_q80_r8<block_iq4_nl, IQ4_NL0_Dequantizer>(n, vx, bx, vy, nrc_x); break;
|
||||
// case GGML_TYPE_Q8_0 : iqk_convert_q80_q80_r8(n, vx, bx, vy, nrc_x); break;
|
||||
default: return false;
|
||||
|
||||
@@ -273,6 +273,7 @@ struct MulMat {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_Q5_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_Q6_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_IQ2_KT : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_IQ3_KT : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_IQ4_KT : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
|
||||
Reference in New Issue
Block a user