mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-10 22:10:20 +00:00
q4_k_r4: finally works on Zen4
I had forgotten to prevent token_embd.weight from being quantized with q4_k_r4!
This commit is contained in:
@@ -3085,27 +3085,6 @@ static void mul_mat_q4_k_r4_q8_k_avx2(int n, const void * vx, size_t bx, const D
|
||||
auto m1 = _mm256_set1_epi16(1);
|
||||
#endif
|
||||
int nbl = n / QK_K;
|
||||
//float q4[1024];
|
||||
//float a[4*nrc_y];
|
||||
//for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
// std::memset(a, 0, 4*nrc_y*sizeof(float));
|
||||
// const block_q4_k_r4 * iq4 = (const block_q4_k_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
// for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
// dequantize_row_q4_k_r4(iq4 + ibl, q4, 1024);
|
||||
// for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
// float d8 = q8.scale(iy, ibl);
|
||||
// for (int j = 0; j < 256; ++j) {
|
||||
// float v = d8*q8.y[iy][ibl].qs[j];
|
||||
// a[4*iy+0] += v * q4[j+ 0];
|
||||
// a[4*iy+1] += v * q4[j+256];
|
||||
// a[4*iy+2] += v * q4[j+512];
|
||||
// a[4*iy+3] += v * q4[j+768];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// for (int iy = 0; iy < nrc_y; ++iy) for (int k = 0; k < 4; ++k) info.store(ix+k, iy, a[4*iy+k]);
|
||||
//}
|
||||
//return;
|
||||
union { __m256i vec; uint32_t val[8]; } hd, hm;
|
||||
__m256 acc[nrc_y] = {};
|
||||
__m256i qx[4];
|
||||
@@ -3161,11 +3140,11 @@ static void mul_mat_q4_k_r4_q8_k_avx2(int n, const void * vx, size_t bx, const D
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_q4_k_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
mul_mat_q4_k_r4_q8_k_avx2<nrc_y>(n, vx, bx, info, nrc_x);
|
||||
return;
|
||||
//if constexpr (nrc_y == 1){
|
||||
// mul_mat_iq4_xs_r4_q8_k_avx2<1>(n, vx, bx, info, nrc_x);
|
||||
//} else {
|
||||
//mul_mat_q4_k_r4_q8_k_avx2<nrc_y>(n, vx, bx, info, nrc_x);
|
||||
//return;
|
||||
if constexpr (nrc_y == 1){
|
||||
mul_mat_q4_k_r4_q8_k_avx2<1>(n, vx, bx, info, nrc_x);
|
||||
} else {
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
auto mf = _mm512_set1_epi8(0xf);
|
||||
@@ -3243,7 +3222,7 @@ static void mul_mat_q4_k_r4_q8_k(int n, const void * vx, size_t bx, const DataIn
|
||||
info.store(ix+4, iy, sum2);
|
||||
}
|
||||
}
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Bits>
|
||||
|
||||
@@ -4002,22 +4002,6 @@ static void repack_q4_k(int nrows, int n_per_row, const block_q4_K * x, block_q4
|
||||
}
|
||||
}
|
||||
}
|
||||
//for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
// for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) {
|
||||
// uint8_t l1 = (x4[k][ibl].qs[32*(ib/2)+i+ 0] >> 4*(ib%2)) & 0xf;
|
||||
// uint8_t l2 = (x4[k][ibl].qs[32*(ib/2)+i+ 8] >> 4*(ib%2)) & 0xf;
|
||||
// y[ibl].qs[64*ib+4*k+i+ 0] = l1 | (l2 << 4);
|
||||
// l1 = (x4[k][ibl].qs[32*(ib/2)+i+16] >> 4*(ib%2)) & 0xf;
|
||||
// l2 = (x4[k][ibl].qs[32*(ib/2)+i+24] >> 4*(ib%2)) & 0xf;
|
||||
// y[ibl].qs[64*ib+4*k+i+16] = l1 | (l2 << 4);
|
||||
// l1 = (x4[k][ibl].qs[32*(ib/2)+i+ 4] >> 4*(ib%2)) & 0xf;
|
||||
// l2 = (x4[k][ibl].qs[32*(ib/2)+i+12] >> 4*(ib%2)) & 0xf;
|
||||
// y[ibl].qs[64*ib+4*k+i+32] = l1 | (l2 << 4);
|
||||
// l1 = (x4[k][ibl].qs[32*(ib/2)+i+20] >> 4*(ib%2)) & 0xf;
|
||||
// l2 = (x4[k][ibl].qs[32*(ib/2)+i+28] >> 4*(ib%2)) & 0xf;
|
||||
// y[ibl].qs[64*ib+4*k+i+48] = l1 | (l2 << 4);
|
||||
// }
|
||||
//}
|
||||
}
|
||||
x += 4*nblock;
|
||||
y += nblock;
|
||||
|
||||
@@ -3837,6 +3837,7 @@ struct llama_model_loader {
|
||||
case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
|
||||
case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
|
||||
case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
|
||||
case GGML_TYPE_Q4_K_R4: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_R4; break;
|
||||
case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
|
||||
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
||||
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
||||
@@ -15787,6 +15788,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
else if (new_type == GGML_TYPE_IQ4_XS_R4) {
|
||||
new_type = GGML_TYPE_IQ4_XS;
|
||||
}
|
||||
else if (new_type == GGML_TYPE_Q4_K_R4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (new_type == GGML_TYPE_Q4_0_R4) {
|
||||
new_type = GGML_TYPE_Q4_0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user