diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index bf098ef7..fc49d019 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -3085,27 +3085,6 @@ static void mul_mat_q4_k_r4_q8_k_avx2(int n, const void * vx, size_t bx, const D auto m1 = _mm256_set1_epi16(1); #endif int nbl = n / QK_K; - //float q4[1024]; - //float a[4*nrc_y]; - //for (int ix = 0; ix < nrc_x; ix += 4) { - // std::memset(a, 0, 4*nrc_y*sizeof(float)); - // const block_q4_k_r4 * iq4 = (const block_q4_k_r4 *)((const char *)vx + (ix+0)*bx); - // for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256 - // dequantize_row_q4_k_r4(iq4 + ibl, q4, 1024); - // for (int iy = 0; iy < nrc_y; ++iy) { - // float d8 = q8.scale(iy, ibl); - // for (int j = 0; j < 256; ++j) { - // float v = d8*q8.y[iy][ibl].qs[j]; - // a[4*iy+0] += v * q4[j+ 0]; - // a[4*iy+1] += v * q4[j+256]; - // a[4*iy+2] += v * q4[j+512]; - // a[4*iy+3] += v * q4[j+768]; - // } - // } - // } - // for (int iy = 0; iy < nrc_y; ++iy) for (int k = 0; k < 4; ++k) info.store(ix+k, iy, a[4*iy+k]); - //} - //return; union { __m256i vec; uint32_t val[8]; } hd, hm; __m256 acc[nrc_y] = {}; __m256i qx[4]; @@ -3161,11 +3140,11 @@ static void mul_mat_q4_k_r4_q8_k_avx2(int n, const void * vx, size_t bx, const D template static void mul_mat_q4_k_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - mul_mat_q4_k_r4_q8_k_avx2(n, vx, bx, info, nrc_x); - return; - //if constexpr (nrc_y == 1){ - // mul_mat_iq4_xs_r4_q8_k_avx2<1>(n, vx, bx, info, nrc_x); - //} else { + //mul_mat_q4_k_r4_q8_k_avx2(n, vx, bx, info, nrc_x); + //return; + if constexpr (nrc_y == 1){ + mul_mat_q4_k_r4_q8_k_avx2<1>(n, vx, bx, info, nrc_x); + } else { GGML_ASSERT(nrc_x%8 == 0); Q8 q8(info); auto mf = _mm512_set1_epi8(0xf); @@ -3243,7 +3222,7 @@ static void mul_mat_q4_k_r4_q8_k(int n, const void * vx, size_t bx, const DataIn info.store(ix+4, iy, sum2); } } - //} + } } template diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index be02c3eb..93a3a0ea 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -4002,22 +4002,6 @@ static void repack_q4_k(int nrows, int n_per_row, const block_q4_K * x, block_q4 } } } - //for (int ib = 0; ib < QK_K/32; ++ib) { - // for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) { - // uint8_t l1 = (x4[k][ibl].qs[32*(ib/2)+i+ 0] >> 4*(ib%2)) & 0xf; - // uint8_t l2 = (x4[k][ibl].qs[32*(ib/2)+i+ 8] >> 4*(ib%2)) & 0xf; - // y[ibl].qs[64*ib+4*k+i+ 0] = l1 | (l2 << 4); - // l1 = (x4[k][ibl].qs[32*(ib/2)+i+16] >> 4*(ib%2)) & 0xf; - // l2 = (x4[k][ibl].qs[32*(ib/2)+i+24] >> 4*(ib%2)) & 0xf; - // y[ibl].qs[64*ib+4*k+i+16] = l1 | (l2 << 4); - // l1 = (x4[k][ibl].qs[32*(ib/2)+i+ 4] >> 4*(ib%2)) & 0xf; - // l2 = (x4[k][ibl].qs[32*(ib/2)+i+12] >> 4*(ib%2)) & 0xf; - // y[ibl].qs[64*ib+4*k+i+32] = l1 | (l2 << 4); - // l1 = (x4[k][ibl].qs[32*(ib/2)+i+20] >> 4*(ib%2)) & 0xf; - // l2 = (x4[k][ibl].qs[32*(ib/2)+i+28] >> 4*(ib%2)) & 0xf; - // y[ibl].qs[64*ib+4*k+i+48] = l1 | (l2 << 4); - // } - //} } x += 4*nblock; y += nblock; diff --git a/src/llama.cpp b/src/llama.cpp index daa270b0..18c6e111 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3837,6 +3837,7 @@ struct llama_model_loader { case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; + case GGML_TYPE_Q4_K_R4: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_R4; break; case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; @@ -15787,6 +15788,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (new_type == GGML_TYPE_IQ4_XS_R4) { new_type = GGML_TYPE_IQ4_XS; } + else if (new_type == GGML_TYPE_Q4_K_R4) { + new_type = GGML_TYPE_Q4_K; + } else if (new_type == GGML_TYPE_Q4_0_R4) { new_type = GGML_TYPE_Q4_0; }