diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 731ea3ff..00cd3cf0 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -65,6 +65,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_S",   LLAMA_FTYPE_MOSTLY_Q5_K_S,   " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
     { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
     { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
+    { "Q6_K_R4",  LLAMA_FTYPE_MOSTLY_Q6_K_R4,  "Q6_K repacked", },
     { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index a53e08f3..d6c3fc39 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -3248,6 +3248,71 @@ static void mul_mat_q4_k_r4_q8_k(int n, const void * vx, size_t bx, const DataIn
 }
 #endif
 
+template <int nrc_y>
+static void mul_mat_q6_k_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+    GGML_ASSERT(nrc_x%4 == 0);
+    Q8<nrc_y, block_q8_K> q8(info);
+    auto m4 = _mm256_set1_epi8(0xf);
+    auto m3 = _mm256_set1_epi8(0x30);
+    auto m32 = _mm256_set1_epi8(-32);
+#ifndef HAVE_FANCY_SIMD
+    auto m1 = _mm256_set1_epi16(1);
+#endif
+    int nbl = n / QK_K;
+    __m256 acc[nrc_y] = {};
+    __m256i qx[4];
+    for (int ix = 0; ix < nrc_x; ix += 4) {
+        const block_q6_k_r4 * iq6 = (const block_q6_k_r4 *)((const char *)vx + (ix+0)*bx);
+        for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
+            auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq6[ibl].d));
+            auto d4 = _mm256_set_m128(dl, dl);
+            const uint32_t * scales = (const uint32_t *)iq6[ibl].scales;
+            for (int ib = 0; ib < QK_K/32; ++ib) {
+                auto iscales = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i *)(scales + 2*ib)));
+                auto scales = _mm256_mul_ps(d4, _mm256_cvtepi32_ps(iscales));
+//#ifdef HAVE_FANCY_SIMD
+//                auto scales_m = _mm256_mul_ps(scales, _mm256_set1_ps(-64.f));
+//#endif
+                auto lbits1 = _mm256_loadu_si256((const __m256i *)iq6[ibl].ql+2*ib+0);
+                auto lbits2 = _mm256_loadu_si256((const __m256i *)iq6[ibl].ql+2*ib+1);
+                auto hbits  = _mm256_loadu_si256((const __m256i *)iq6[ibl].qh+ib);
+                qx[0] = _mm256_add_epi8(_mm256_or_si256(_mm256_and_si256(lbits1, m4), _mm256_and_si256(m3, _mm256_slli_epi16(hbits, 4))), m32);
+                qx[1] = _mm256_add_epi8(_mm256_or_si256(_mm256_and_si256(lbits2, m4), _mm256_and_si256(m3, hbits)), m32);
+                qx[2] = _mm256_add_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(lbits1, 4), m4), _mm256_and_si256(m3, _mm256_slli_epi16(hbits, 2))), m32);
+                qx[3] = _mm256_add_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(lbits2, 4), m4), _mm256_and_si256(m3, _mm256_srli_epi16(hbits, 2))), m32);
+//#ifndef HAVE_FANCY_SIMD
+                auto s1 = _mm256_sign_epi8(qx[0], qx[0]);
+                auto s2 = _mm256_sign_epi8(qx[1], qx[1]);
+                auto s3 = _mm256_sign_epi8(qx[2], qx[2]);
+                auto s4 = _mm256_sign_epi8(qx[3], qx[3]);
+//#endif
+                for (int iy = 0; iy < nrc_y; ++iy) {
+                    auto y = _mm256_loadu_si256((const __m256i*)q8.y[iy][ibl].qs+ib);
+#ifdef HAVE_FANCY_SIMD
+                    auto sumi = _mm256_setzero_si256();
+                    sumi = _mm256_dpbusd_epi32(sumi, s1, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x00), qx[0]));
+                    sumi = _mm256_dpbusd_epi32(sumi, s2, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x55), qx[1]));
+                    sumi = _mm256_dpbusd_epi32(sumi, s3, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xaa), qx[2]));
+                    sumi = _mm256_dpbusd_epi32(sumi, s4, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xff), qx[3]));
+#else
+                    auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(s1, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x00), qx[0])),
+                                                  _mm256_maddubs_epi16(s2, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x55), qx[1])));
+                    auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(s3, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xaa), qx[2])),
+                                                  _mm256_maddubs_epi16(s4, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xff), qx[3])));
+                    auto sumi = _mm256_add_epi32(_mm256_madd_epi16(m1, sumi1), _mm256_madd_epi16(m1, sumi2));
+#endif
+                    acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(scales, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
+                }
+            }
+        }
+        for (int iy = 0; iy < nrc_y; ++iy) {
+            auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
+            acc[iy] = _mm256_setzero_ps();
+            info.store(ix+0, iy, sum);
+        }
+    }
+}
+
 template <typename Bits>
 inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) {
     if (j == 0) {
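Note (reviewer commentary, not part of the patch): the qx[0..3] lines above rebuild signed
6-bit quants from the repacked layout -- the low 4 bits come from ql, the high 2 bits from qh
(mask 0x30 after shifting), and 32 is subtracted to centre the range -- while s1..s4 hold the
absolute values so that _mm256_maddubs_epi16/_mm256_dpbusd_epi32, which take an unsigned first
operand, can still produce a signed dot product. A minimal scalar sketch of both ideas follows;
the helper names are chosen here for illustration only.

    #include <stdint.h>

    // Expand one byte from each of the two low-bit registers (lbits1/lbits2) and the
    // high-bit register (hbits) into the four 6-bit quants of that lane, matching the
    // kernel's masks m4 = 0x0f, m3 = 0x30 and offset m32 = -32.
    static inline void q6_k_r4_unpack_byte(uint8_t ql1, uint8_t ql2, uint8_t qh, int8_t q[4]) {
        q[0] = (int8_t)((( ql1       & 0x0f) | ((qh << 4) & 0x30)) - 32); // qh bits 0-1
        q[1] = (int8_t)((( ql2       & 0x0f) | ( qh       & 0x30)) - 32); // qh bits 4-5
        q[2] = (int8_t)((((ql1 >> 4) & 0x0f) | ((qh << 2) & 0x30)) - 32); // qh bits 2-3
        q[3] = (int8_t)((((ql2 >> 4) & 0x0f) | ((qh >> 2) & 0x30)) - 32); // qh bits 6-7
    }

    // Scalar view of the sign trick: a*b == |a| * sgn(a, b), where sgn() mirrors
    // _mm256_sign_epi8 (negate b when a < 0, zero it when a == 0, keep it otherwise).
    static inline int q6_k_r4_signed_mul(int8_t a, int8_t b) {
        unsigned ua = a < 0 ? (unsigned)-(int)a : (unsigned)a;   // s1..s4 in the kernel
        int      sb = a < 0 ? -(int)b : (a != 0 ? (int)b : 0);   // a's sign moved onto b
        return (int)ua * sb;                                     // equals (int)a * (int)b
    }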
@@ -5255,6 +5320,18 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
             mm.funcs[7] = mul_mat_q4_k_r4_q8_k<8>;
             expected_typeB = GGML_TYPE_Q8_K32;
             break;
+        case GGML_TYPE_Q6_K_R4:
+            assert (ne00 % QK_K == 0);
+            mm.funcs[0] = mul_mat_q6_k_r4_q8_k<1>;
+            mm.funcs[1] = mul_mat_q6_k_r4_q8_k<2>;
+            mm.funcs[2] = mul_mat_q6_k_r4_q8_k<3>;
+            mm.funcs[3] = mul_mat_q6_k_r4_q8_k<4>;
+            mm.funcs[4] = mul_mat_q6_k_r4_q8_k<5>;
+            mm.funcs[5] = mul_mat_q6_k_r4_q8_k<6>;
+            mm.funcs[6] = mul_mat_q6_k_r4_q8_k<7>;
+            mm.funcs[7] = mul_mat_q6_k_r4_q8_k<8>;
+            expected_typeB = GGML_TYPE_Q8_K;
+            break;
         case GGML_TYPE_Q4_0_R4:
             assert (ne00 % QK4_NL == 0);
             mm.funcs[0] = mul_mat_q4_0_r4_q8_1<1>;
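Note (reviewer commentary, not part of the patch): expected_typeB = GGML_TYPE_Q8_K means the
right-hand operand is quantized to Q8_K before these kernels run; q8.scale(iy, ibl) and
q8.y[iy][ibl].qs in the kernel above read the per-block scale and the int8 quants of that type.
For reference, this is the block_q8_K layout as defined in mainline ggml (QK_K = 256;
ik_llama.cpp may extend it):

    typedef struct {
        float   d;               // per-256-element-block scale
        int8_t  qs[QK_K];        // quants
        int16_t bsums[QK_K/16];  // sums of groups of 16 quants
    } block_q8_K;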
diff --git a/include/llama.h b/include/llama.h
index 2fa78879..92234d6c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -184,6 +184,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0_R4        = 207, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0_R4        = 208, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_K_R4        = 214, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K_R4        = 218, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_NL_R4      = 225, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS_R4      = 230, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_0_R4        = 235, // except 1d tensors
diff --git a/src/llama.cpp b/src/llama.cpp
index 18c6e111..3b617b06 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3840,6 +3840,7 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_K_R4: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_R4; break;
             case GGML_TYPE_Q5_K:    ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M;  break;
             case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
+            case GGML_TYPE_Q6_K_R4: ftype = LLAMA_FTYPE_MOSTLY_Q6_K_R4; break;
             case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
             case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
             case GGML_TYPE_IQ2_KS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_KS;  break;
@@ -4552,6 +4553,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:  return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K:    return "Q6_K";
+       case LLAMA_FTYPE_MOSTLY_Q6_K_R4: return "Q6_K_R4";
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_KS:  return "IQ2_KS - 2.1875 bpw";
@@ -15757,7 +15759,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS) && !qs.has_output) {
            new_type = GGML_TYPE_IQ5_K;
        }
-       else if (new_type != GGML_TYPE_Q8_0 && new_type != GGML_TYPE_Q8_0_R4 && new_type != GGML_TYPE_IQ6_K) {
+       else if (new_type != GGML_TYPE_Q8_0 && new_type != GGML_TYPE_Q8_0_R4 && new_type != GGML_TYPE_IQ6_K && new_type != GGML_TYPE_Q6_K_R4) {
            new_type = GGML_TYPE_Q6_K;
        }
    }
@@ -15791,6 +15793,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
    else if (new_type == GGML_TYPE_Q4_K_R4) {
        new_type = GGML_TYPE_Q4_K;
    }
+   else if (new_type == GGML_TYPE_Q6_K_R4) {
+       new_type = GGML_TYPE_Q6_K;
+   }
    else if (new_type == GGML_TYPE_Q4_0_R4) {
        new_type = GGML_TYPE_Q4_0;
    }
@@ -16062,7 +16067,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        new_type == GGML_TYPE_IQ1_M  || new_type == GGML_TYPE_IQ4_K  || new_type == GGML_TYPE_IQ2_K     ||
        new_type == GGML_TYPE_IQ5_K  || new_type == GGML_TYPE_IQ3_K  || new_type == GGML_TYPE_Q4_K_R4   ||
        new_type == GGML_TYPE_IQ6_K  || new_type == GGML_TYPE_IQ4_KS || new_type == GGML_TYPE_IQ4_XS_R4 ||
-       new_type == GGML_TYPE_IQ2_KS || new_type == GGML_TYPE_IQ4_KSS) {
+       new_type == GGML_TYPE_IQ2_KS || new_type == GGML_TYPE_IQ4_KSS || new_type == GGML_TYPE_Q6_K_R4) {
        int nx = tensor->ne[0];
        int ny = tensor->ne[1];
        if (nx % QK_K != 0) {
@@ -16102,6 +16107,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            case GGML_TYPE_IQ5_K:
            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q6_0; break;
            case GGML_TYPE_IQ6_K:
+           case GGML_TYPE_Q6_K_R4:
            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0; break;
            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
        }
@@ -16194,6 +16200,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K; break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K; break;
+       case LLAMA_FTYPE_MOSTLY_Q6_K_R4: default_type = GGML_TYPE_Q6_K_R4; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_KS:  default_type = GGML_TYPE_IQ2_KS;  break;
@@ -16597,6 +16604,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_K;
                else chunk_size_multiplier = 4;
            }
+           else if (new_type == GGML_TYPE_Q6_K_R4) {
+               if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_K;
+               else chunk_size_multiplier = 4;
+           }
            else if (new_type == GGML_TYPE_IQ2_BN_R4) {
                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_BN;
                else chunk_size_multiplier = 4;
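Note (reviewer commentary, not part of the patch): the last hunk mirrors the other _R4 types.
Because Q6_K_R4 interleaves four rows per block, a tensor whose row count (ne[1]) is not a
multiple of 4 falls back to plain Q6_K, and repacked tensors are quantized in chunks spanning
four rows (chunk_size_multiplier = 4). A hypothetical one-line helper, using ggml's type enum,
stating the same rule:

    // Illustration only (no such helper exists in the patch): choose Q6_K_R4 when the
    // row count allows 4-row interleaving, otherwise fall back to plain Q6_K.
    static enum ggml_type q6_k_r4_or_fallback(int64_t nrows) {
        return nrows % 4 == 0 ? GGML_TYPE_Q6_K_R4 : GGML_TYPE_Q6_K;
    }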