mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-20 22:49:31 +00:00
iq4_kss: new bit arrangement - CUDA and Zen4 work
Did not lose performance on CUDA. Zen4 is decent, but not great: PP-512(LLaMA-3.1-8B) = 163 t/s. TG-128 is of course better than other 4-bit quants due to smaller model size. We get 14.5 t/s @ 8 threads.
This commit is contained in:
@@ -653,12 +653,13 @@ static __global__ void dequantize_block_iq4_kss(const void * __restrict__ vx, ds
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + ii*QK_K + 32*ib + 4*il;
|
||||
const uint32_t * q4 = x[i].qs + 4*ib;
|
||||
uint8_t ls = (q4[0] >> 30) | ((q4[1] >> 28) & 0x0c) | ((q4[2] >> 26) & 0x30) | ((q4[3] >> 24) & 0xc0);
|
||||
uint32_t s32 = (q4[0] & 0x00010001) | ((q4[1] & 0x00010001) << 2) | ((q4[2] & 0x00010001) << 4) | ((q4[3] & 0x00010001) << 6);
|
||||
uint8_t ls = (s32 | (s32 >> 15)) & 0xff;
|
||||
const float d = scale * ((ls & 254) - 127);
|
||||
const int8_t * values = iq4k_values + ((ls & 1) << 4);
|
||||
uint32_t aux32[2];
|
||||
aux32[0] = (q4[il] & 0x00007fff) | ((q4[il] << 1) & 0x7fff0000);
|
||||
aux32[0] ^= (aux32[0] << 1);
|
||||
aux32[0] = q4[il] & 0xfffefffe;
|
||||
aux32[0] ^= (aux32[0] >> 1);
|
||||
aux32[1] = ((aux32[0] >> 4) & 0x0f0f0f0f);
|
||||
aux32[0] &= 0x0f0f0f0f;
|
||||
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||
|
||||
@@ -253,13 +253,14 @@ __device__ __forceinline__ float vec_dot_iq4_kss_q8_1(
|
||||
const int ib32 = iqs/4; // Why iqs/4 ?
|
||||
const int32_t * q8 = (const int *)bq8_1[ib32].qs;
|
||||
const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
|
||||
const uint8_t ls = (q4[0] >> 30) | ((q4[1] >> 28) & 0x0c) | ((q4[2] >> 26) & 0x30) | ((q4[3] >> 24) & 0xc0);
|
||||
uint32_t s32 = (q4[0] & 0x00010001) | ((q4[1] & 0x00010001) << 2) | ((q4[2] & 0x00010001) << 4) | ((q4[3] & 0x00010001) << 6);
|
||||
uint8_t ls = (s32 | (s32 >> 15)) & 0xff;
|
||||
const float dl = scale * ((ls & 254) - 127);
|
||||
int v1, v2;
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
uint32_t aux32 = (q4[j] & 0x00007fff) | ((q4[j] << 1) & 0x7fff0000);
|
||||
aux32 ^= (aux32 << 1);
|
||||
uint32_t aux32 = q4[j] & 0xfffefffe;
|
||||
aux32 ^= (aux32 >> 1);
|
||||
get_int_from_table_16_shift(aux32, ls & 1, all_values, v1, v2);
|
||||
sumi = ggml_cuda_dp4a(v1, q8[j+0], sumi);
|
||||
sumi = ggml_cuda_dp4a(v2, q8[j+4], sumi);
|
||||
|
||||
@@ -1209,6 +1209,67 @@ struct DequantizerIQ4KS final : public BaseDequantizer<block_iq4_ks, true> {
|
||||
};
|
||||
};
|
||||
|
||||
// AVX-512 dequantizer for IQ4_KSS blocks, used by the iqX_k x q8_K GEMM path.
// IQ4_KSS packs each 16-bit word as 15 payload bits plus a per-word spare bit
// (bit 0); the spare bits of a 32-quant sub-block together encode that
// sub-block's 8-bit scale (mirrors the scalar decode in dequantize_row_iq4_kss).
struct DequantizerIQ4KSS final : public BaseDequantizer<block_iq4_kss, true> {
    // Loads the 16-entry iq4nl lookup table broadcast across a 512-bit register.
    DequantizerIQ4KSS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {}
    template <typename Q8>
    // Prepares one super-block i: fills bits.values[0..3] with dequantized
    // nibble values and scales[0..3] with per-sub-block scales; accumulates the
    // scale minima into accm via s8k.
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        uint32_t aux32[2];
        // Two 512-bit loads cover the whole super-block's quants.
        auto b1 = _mm512_loadu_si512((const __m512i *)x[i].qs + 0);
        auto b2 = _mm512_loadu_si512((const __m512i *)x[i].qs + 1);
        // mask15 (0xfffe) strips the per-word scale bit; the xor with the
        // right-shifted value undoes the bit-interleaved encoding applied at
        // quantization time (same transform as the scalar `v ^= v >> 1`).
        auto bs1 = _mm512_and_si512(b1, mask15);
        bs1 = _mm512_xor_si512(bs1, _mm512_srli_epi16(bs1, 1));
        auto bs2 = _mm512_and_si512(b2, mask15);
        bs2 = _mm512_xor_si512(bs2, _mm512_srli_epi16(bs2, 1));
        // Split each 16-bit lane into low and high nibbles (bits.ml is
        // presumably the 0x0f byte mask — defined in Q4Bits, not visible here).
        bits.values[0] = _mm512_and_si512(bs1, bits.ml);
        bits.values[1] = _mm512_and_si512(_mm512_srli_epi16(bs1, 4), bits.ml);
        bits.values[2] = _mm512_and_si512(bs2, bits.ml);
        bits.values[3] = _mm512_and_si512(_mm512_srli_epi16(bs2, 4), bits.ml);
        // Interleave 64-bit lanes of the low/high-nibble registers (permute1/2
        // select complementary halves) so quants land in sub-block order, then
        // map nibbles to the non-linear iq4nl values via byte shuffle.
        auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]);
        bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1]));
        bits.values[0] = _mm512_shuffle_epi8(values, tmp);
        tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]);
        bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3]));
        bits.values[2] = _mm512_shuffle_epi8(values, tmp);
        //
        // Now the more difficult part - prepare the scales
        //
        // Gather bit 0 of each 16-bit word into a 32-bit mask per 512-bit half;
        // the 8 bytes of aux32 are then the raw 8-bit scales of the 8 sub-blocks.
        aux32[0] = _mm512_cmpeq_epi16_mask(_mm512_and_si512(b1, mask1), mask1);
        aux32[1] = _mm512_cmpeq_epi16_mask(_mm512_and_si512(b2, mask1), mask1);

        auto scales128 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)aux32));
        auto m1 = _mm512_castsi512_si128(mask1);
        // Scale bit 0 selects the shifted variant of the value table; encode it
        // as a +4 adjustment applied to the -128 offset below.
        auto shifts = _mm_and_si128(_mm_cmpeq_epi16(_mm_and_si128(scales128, m1), m1), m4);
        // (ls & 254) - 127: same scale decode as the scalar reference code.
        scales128 = _mm_add_epi16(_mm_and_si128(scales128, mask), m127);
        auto scales_s = _mm_mullo_epi16(scales128, _mm_add_epi16(m128, shifts));
        // Fold the scale minima into the accumulators (q8 supplies the bsums).
        s8k.accum_mins(scales_s, q8, i, d, accm);
        // Broadcast the 8 scales to all four 128-bit lanes, then shuffle so
        // scales[j] holds the per-sub-block scale replicated across lane j's quants.
        auto scales256 = MM256_SET_M128I(scales128, scales128);
        auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1);
        scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]);
        scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]);
        scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]);
    }

    Q4Bits bits;            // holds the four 512-bit registers of dequantized values
    Scales8KBase s8k;       // helper for accumulating scale minima against q8 bsums
    const __m512i values;   // iq4nl lookup table, broadcast per 128-bit lane
    // 0xfffe per 16-bit word: clears the per-word scale bit.
    const __m512i mask15 = _mm512_set1_epi16(0xfffe);
    // 0x0001 per 16-bit word: isolates the per-word scale bit.
    const __m512i mask1 = _mm512_set1_epi16(1);
    // Complementary 64-bit lane selectors for _mm512_permutex2var_epi64.
    const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
    const __m128i mask = _mm_set1_epi16(254);  // keeps the 7 magnitude bits of a scale
    const __m128i m127 = _mm_set1_epi16(-127); // scale bias: (ls & 254) - 127
    const __m128i m128 = _mm_set1_epi16(-128); // base multiplier for the mins
    const __m128i m4 = _mm_set1_epi16(4);      // shift applied when the table-select bit is set
    // Byte shuffles replicating scale j of the 8 sub-blocks across lane j.
    const __m512i shuffles[4] = {
        _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1),
    };
};
|
||||
|
||||
|
||||
template <typename Q8>
|
||||
inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) {
|
||||
const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0));
|
||||
@@ -3848,7 +3909,8 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
|
||||
std::is_same_v<Dequantizer, DequantizerIQ4K> ||
|
||||
std::is_same_v<Dequantizer, DequantizerIQ3K> ||
|
||||
std::is_same_v<Dequantizer, DequantizerIQ4XS>||
|
||||
std::is_same_v<Dequantizer, DequantizerIQ4KS>) {
|
||||
std::is_same_v<Dequantizer, DequantizerIQ4KS>||
|
||||
std::is_same_v<Dequantizer, DequantizerIQ4KSS>) {
|
||||
m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 1>;
|
||||
m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 2>;
|
||||
m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 3>;
|
||||
@@ -4012,6 +4074,10 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
|
||||
assert (ne00 % QK_K == 0);
|
||||
MulMat::set_functions<DequantizerIQ4KS>(mm);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_KSS:
|
||||
assert (ne00 % QK_K == 0);
|
||||
MulMat::set_functions<DequantizerIQ4KSS>(mm);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_K:
|
||||
assert (ne00 % QK_K == 0);
|
||||
MulMat::set_functions<DequantizerIQ2K>(mm);
|
||||
|
||||
@@ -3092,15 +3092,16 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy,
|
||||
}
|
||||
l += 127;
|
||||
if (mse_m < mse_p) l |= 1;
|
||||
for (int k = 0; k < block_size/8; ++k) {
|
||||
auto v1 = table[v[2*k+0] & 0x7fff];
|
||||
auto v2 = table[v[2*k+1] & 0x7fff];
|
||||
y[ibl].qs[(block_size/8)*ib + k] = v1 | (v2 << 15) | (((l >> 2*k) & 3) << 30);
|
||||
uint16_t * q16 = (uint16_t *)y[ibl].qs + (block_size/4)*ib;
|
||||
for (int k = 0; k < block_size/4; ++k) {
|
||||
auto val = table[v[k] & 0x7fff];
|
||||
q16[k] = (val << 1) | ((l >> k) & 1);
|
||||
}
|
||||
} else {
|
||||
l += 127;
|
||||
for (int k = 0; k < block_size/8; ++k) {
|
||||
y[ibl].qs[(block_size/8)*ib + k] |= (((l >> 2*k) & 3) << 30);
|
||||
uint16_t * q16 = (uint16_t *)y[ibl].qs + (block_size/4)*ib;
|
||||
for (int k = 0; k < block_size/4; ++k) {
|
||||
q16[k] = ((l >> k) & 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3208,10 +3209,10 @@ void dequantize_row_iq4_kss(const block_iq4_kss * x, float * y, int64_t k) {
|
||||
const float * dptr = (const float *)x;
|
||||
const float d = *dptr;
|
||||
x = (const block_iq4_kss *)(dptr + 1);
|
||||
uint32_t aux32[4];
|
||||
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||
uint16_t aux16[8];
|
||||
const uint8_t * aux8 = (const uint8_t *)aux16;
|
||||
for (int ibl = 0; ibl < k/QK_K; ++ibl) {
|
||||
auto qs = x[ibl].qs;
|
||||
auto qs = (const uint16_t *)x[ibl].qs;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
//uint8_t ls = ((qs[0] >> 30) | ((qs[1] >> 28) & 0x0c) | ((qs[2] >> 26) & 0x30) | ((qs[3] >> 24) & 0xc0));
|
||||
//const int8_t * values = iq4k_values + ((ls & 1) << 4);
|
||||
@@ -3227,10 +3228,10 @@ void dequantize_row_iq4_kss(const block_iq4_kss * x, float * y, int64_t k) {
|
||||
// }
|
||||
//}
|
||||
int16_t ls = 0;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
aux32[k] = (qs[k] & 0x00007fff) | ((qs[k] << 1) & 0x7fff0000);
|
||||
aux32[k] ^= (aux32[k] << 1);
|
||||
ls |= (qs[k] >> 30) << 2*k;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
aux16[k] = qs[k] & 0xfffe;
|
||||
aux16[k] ^= (aux16[k] >> 1);
|
||||
ls |= (qs[k] & 1) << k;
|
||||
}
|
||||
const int8_t * values = iq4k_values + ((ls & 1) << 4);
|
||||
float dl = d * ((ls & 254) - 127);
|
||||
@@ -3239,7 +3240,7 @@ void dequantize_row_iq4_kss(const block_iq4_kss * x, float * y, int64_t k) {
|
||||
y[j+16] = dl * values[aux8[j] >> 4];
|
||||
}
|
||||
y += 32;
|
||||
qs += 4;
|
||||
qs += 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user