mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-25 07:34:10 +00:00
Some cleanup
This commit is contained in:
@@ -7378,7 +7378,7 @@ template <int nrc> struct Q8_16 {
|
||||
};
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq2_bn_r4_q8_k16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
static IQK_NOINLINE void mul_mat_iq2_bn_r4_q8_k16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
if (nrc_x%4) {
|
||||
printf("%s: %d is not a multiple of 4\n", __func__, nrc_x);
|
||||
GGML_ABORT("fatal error");
|
||||
@@ -7387,73 +7387,24 @@ static void mul_mat_iq2_bn_r4_q8_k16(int n, const void * vx, size_t bx, const Da
|
||||
auto m3 = vdupq_n_u8(0x3);
|
||||
int nb = n / QK_IQ1BN;
|
||||
if constexpr (nrc_y == 1) {
|
||||
//uint8x16x4_t shuff = {
|
||||
// vreinterpretq_u8_u32(vdupq_n_u32(0x03020100)),
|
||||
// vreinterpretq_u8_u32(vdupq_n_u32(0x07060504)),
|
||||
// vreinterpretq_u8_u32(vdupq_n_u32(0x0b0a0908)),
|
||||
// vreinterpretq_u8_u32(vdupq_n_u32(0x0f0e0d0c)),
|
||||
//};
|
||||
auto mc = vdupq_n_u8(0xc);
|
||||
int32x4_t acc[8];
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
for (int k = 0; k < 8; ++k) acc[k] = vdupq_n_s32(0);
|
||||
//acc[0] = acc[1] = acc[2] = acc[3] = vdupq_n_s32(0);
|
||||
//acc[4] = acc[5] = acc[6] = acc[7] = vdupq_n_s32(0);
|
||||
const float * dptr = (const float *)((const char *)vx + ix*bx);
|
||||
auto dl = vld1q_f32(dptr);
|
||||
const uint8_t * iq2 = (const uint8_t *)(dptr + 4);
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
auto y = q8.load_quants(0, ib);
|
||||
auto bits1 = vld1q_u8(iq2 + 64*ib);
|
||||
auto bits2 = vshrq_n_u8(bits1, 4);
|
||||
acc[0] = vdotq_laneq_s32(acc[0], vandq_u8(bits1, m3), y.val[0], 0);
|
||||
acc[1] = vdotq_laneq_s32(acc[1], vandq_u8(bits1, mc), y.val[0], 1);
|
||||
acc[0] = vdotq_laneq_s32(acc[0], vandq_u8(bits2, m3), y.val[0], 2);
|
||||
acc[1] = vdotq_laneq_s32(acc[1], vandq_u8(bits2, mc), y.val[0], 3);
|
||||
//acc[0] = vdotq_s32(acc[0], vandq_u8(bits1, m3), vqtbl1q_s8(y.val[0], shuff.val[0]));
|
||||
//acc[1] = vdotq_s32(acc[1], vandq_u8(bits1, mc), vqtbl1q_s8(y.val[0], shuff.val[1]));
|
||||
//acc[0] = vdotq_s32(acc[0], vandq_u8(bits2, m3), vqtbl1q_s8(y.val[0], shuff.val[2]));
|
||||
//acc[1] = vdotq_s32(acc[1], vandq_u8(bits2, mc), vqtbl1q_s8(y.val[0], shuff.val[3]));
|
||||
bits1 = vld1q_u8(iq2 + 64*ib + 16);
|
||||
bits2 = vshrq_n_u8(bits1, 4);
|
||||
acc[2] = vdotq_laneq_s32(acc[2], vandq_u8(bits1, m3), y.val[1], 0);
|
||||
acc[3] = vdotq_laneq_s32(acc[3], vandq_u8(bits1, mc), y.val[1], 1);
|
||||
acc[2] = vdotq_laneq_s32(acc[2], vandq_u8(bits2, m3), y.val[1], 2);
|
||||
acc[3] = vdotq_laneq_s32(acc[3], vandq_u8(bits2, mc), y.val[1], 3);
|
||||
//acc[2] = vdotq_s32(acc[2], vandq_u8(bits1, m3), vqtbl1q_s8(y.val[1], shuff.val[0]));
|
||||
//acc[3] = vdotq_s32(acc[3], vandq_u8(bits1, mc), vqtbl1q_s8(y.val[1], shuff.val[1]));
|
||||
//acc[2] = vdotq_s32(acc[2], vandq_u8(bits2, m3), vqtbl1q_s8(y.val[1], shuff.val[2]));
|
||||
//acc[3] = vdotq_s32(acc[3], vandq_u8(bits2, mc), vqtbl1q_s8(y.val[1], shuff.val[3]));
|
||||
bits1 = vld1q_u8(iq2 + 64*ib + 32);
|
||||
bits2 = vshrq_n_u8(bits1, 4);
|
||||
acc[4] = vdotq_laneq_s32(acc[4], vandq_u8(bits1, m3), y.val[2], 0);
|
||||
acc[5] = vdotq_laneq_s32(acc[5], vandq_u8(bits1, mc), y.val[2], 1);
|
||||
acc[4] = vdotq_laneq_s32(acc[4], vandq_u8(bits2, m3), y.val[2], 2);
|
||||
acc[5] = vdotq_laneq_s32(acc[5], vandq_u8(bits2, mc), y.val[2], 3);
|
||||
//acc[4] = vdotq_s32(acc[4], vandq_u8(bits1, m3), vqtbl1q_s8(y.val[2], shuff.val[0]));
|
||||
//acc[5] = vdotq_s32(acc[5], vandq_u8(bits1, mc), vqtbl1q_s8(y.val[2], shuff.val[1]));
|
||||
//acc[4] = vdotq_s32(acc[4], vandq_u8(bits2, m3), vqtbl1q_s8(y.val[2], shuff.val[2]));
|
||||
//acc[5] = vdotq_s32(acc[5], vandq_u8(bits2, mc), vqtbl1q_s8(y.val[2], shuff.val[3]));
|
||||
bits1 = vld1q_u8(iq2 + 64*ib + 48);
|
||||
bits2 = vshrq_n_u8(bits1, 4);
|
||||
acc[6] = vdotq_laneq_s32(acc[6], vandq_u8(bits1, m3), y.val[3], 0);
|
||||
acc[7] = vdotq_laneq_s32(acc[7], vandq_u8(bits1, mc), y.val[3], 1);
|
||||
acc[6] = vdotq_laneq_s32(acc[6], vandq_u8(bits2, m3), y.val[3], 2);
|
||||
acc[7] = vdotq_laneq_s32(acc[7], vandq_u8(bits2, mc), y.val[3], 3);
|
||||
//acc[6] = vdotq_s32(acc[6], vandq_u8(bits1, m3), vqtbl1q_s8(y.val[3], shuff.val[0]));
|
||||
//acc[7] = vdotq_s32(acc[7], vandq_u8(bits1, mc), vqtbl1q_s8(y.val[3], shuff.val[1]));
|
||||
//acc[6] = vdotq_s32(acc[6], vandq_u8(bits2, m3), vqtbl1q_s8(y.val[3], shuff.val[2]));
|
||||
//acc[7] = vdotq_s32(acc[7], vandq_u8(bits2, mc), vqtbl1q_s8(y.val[3], shuff.val[3]));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
auto bits1 = vld1q_u8(iq2 + 64*ib + 16*j);
|
||||
auto bits2 = vshrq_n_u8(bits1, 4);
|
||||
acc[2*j+0] = vdotq_laneq_s32(acc[2*j+0], vandq_u8(bits1, m3), y.val[j], 0);
|
||||
acc[2*j+1] = vdotq_laneq_s32(acc[2*j+1], vandq_u8(bits1, mc), y.val[j], 1);
|
||||
acc[2*j+0] = vdotq_laneq_s32(acc[2*j+0], vandq_u8(bits2, m3), y.val[j], 2);
|
||||
acc[2*j+1] = vdotq_laneq_s32(acc[2*j+1], vandq_u8(bits2, mc), y.val[j], 3);
|
||||
}
|
||||
}
|
||||
//auto dy = q8.scale(0);
|
||||
//auto sumf1 = vmulq_f32( vcvtq_f32_s32(acc[0]), vmulq_laneq_f32(dl, dy, 0));
|
||||
//auto sumf2 = vmulq_f32( vcvtq_f32_s32(acc[1]), vmulq_laneq_f32(dl, dy, 0));
|
||||
//sumf1 = vfmaq_f32(sumf1, vcvtq_f32_s32(acc[2]), vmulq_laneq_f32(dl, dy, 1));
|
||||
//sumf2 = vfmaq_f32(sumf2, vcvtq_f32_s32(acc[3]), vmulq_laneq_f32(dl, dy, 1));
|
||||
//sumf1 = vfmaq_f32(sumf1, vcvtq_f32_s32(acc[4]), vmulq_laneq_f32(dl, dy, 2));
|
||||
//sumf2 = vfmaq_f32(sumf2, vcvtq_f32_s32(acc[5]), vmulq_laneq_f32(dl, dy, 2));
|
||||
//sumf1 = vfmaq_f32(sumf1, vcvtq_f32_s32(acc[6]), vmulq_laneq_f32(dl, dy, 3));
|
||||
//sumf2 = vfmaq_f32(sumf2, vcvtq_f32_s32(acc[7]), vmulq_laneq_f32(dl, dy, 3));
|
||||
auto dy = vmulq_f32(dl, vdupq_n_f32(q8.scale(0, 0)));
|
||||
auto sumf1 = vmulq_f32( vcvtq_f32_s32(acc[0]), dy);
|
||||
auto sumf2 = vmulq_f32( vcvtq_f32_s32(acc[1]), dy);
|
||||
|
||||
Reference in New Issue
Block a user