mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 11:21:56 +00:00
iq1_bn(Metal): 87.9 -> 89.0 t/s for TG-128
This commit is contained in:
@@ -5468,7 +5468,8 @@ void kernel_mul_mv_iq1_bn_f32_impl(
|
|||||||
|
|
||||||
for (int ib32 = ix; ib32 < nb32; ib32 += 16) {
|
for (int ib32 = ix; ib32 < nb32; ib32 += 16) {
|
||||||
|
|
||||||
for (int j = 0; j < 16; ++j) yl[j] = y4[j];
|
float sumy = 0;
|
||||||
|
for (int j = 0; j < 16; ++j) { yl[j] = y4[j]; sumy += y4[j]; }
|
||||||
|
|
||||||
const int ibl = ib32 / (QK_IQ1BN / 32);
|
const int ibl = ib32 / (QK_IQ1BN / 32);
|
||||||
device const block_iq1_bn * xr = x + ibl;
|
device const block_iq1_bn * xr = x + ibl;
|
||||||
@@ -5482,19 +5483,18 @@ void kernel_mul_mv_iq1_bn_f32_impl(
|
|||||||
for (int k = 0; k < 3; ++k) {
|
for (int k = 0; k < 3; ++k) {
|
||||||
uint16_t q = ql[k];
|
uint16_t q = ql[k];
|
||||||
for (int j = 4; j >= 0; --j) {
|
for (int j = 4; j >= 0; --j) {
|
||||||
uint8_t v = q;
|
uint16_t v = q & 0xff;
|
||||||
v = 3*v >> 8;
|
v += v << 1;
|
||||||
acc += yy[j] * values[v];
|
acc += yy[j] * (v & 0xff00);
|
||||||
q += (q << 1);
|
q += q << 1;
|
||||||
}
|
}
|
||||||
yy += 5;
|
yy += 5;
|
||||||
}
|
}
|
||||||
uint8_t v = k_mult[i16]*extra[0];
|
uint16_t v = (k_mult[i16]*extra[0]) & 0xff;
|
||||||
v = 3*v >> 8;
|
v += v << 1;
|
||||||
//v = (v + (v << 1)) >> 8;
|
acc += yl[15] * (v & 0xff00);
|
||||||
acc += yl[15] * values[v];
|
|
||||||
|
|
||||||
sumf[row] += acc;
|
sumf[row] += 0.00390625f * acc - sumy;
|
||||||
|
|
||||||
extra += row_size;
|
extra += row_size;
|
||||||
ql += row_size;
|
ql += row_size;
|
||||||
|
|||||||
Reference in New Issue
Block a user