mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-25 15:44:10 +00:00
Faster iq4_k: CUDA
Not actually faster than the original.
This commit is contained in:
@@ -622,19 +622,20 @@ static __global__ void dequantize_block_iq4_k(const void * __restrict__ vx, dst_
|
||||
const block_iq4_k * x = (const block_iq4_k *)vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
||||
const int64_t il = tid/16; // 0 or 1
|
||||
const int64_t ib = tid%16; // 0...15
|
||||
dst_t * y = yy + i*QK_K + 4*ib + 2*il;
|
||||
const uint8_t * q4 = x[i].qs + 4*ib + 2*il;
|
||||
const float d = (float)x[i].d;
|
||||
const uint8_t sh = x[i].scales_h[ib/2] >> 4*(ib%2);
|
||||
const float d1 = d * (((x[i].scales_l[ib] & 0xf) | ((sh << 4) & 0x30)) - 32);
|
||||
const float d2 = d * (((x[i].scales_l[ib] >> 4) | ((sh << 2) & 0x30)) - 32);
|
||||
const int8_t * values1 = iq4k_values + 16*((x[i].extra >> (2*ib+0)) & 1);
|
||||
const int8_t * values2 = iq4k_values + 16*((x[i].extra >> (2*ib+1)) & 1);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = d1 * values1[q4[j] & 0xf];
|
||||
y[j+16] = d2 * values2[q4[j] >> 4];
|
||||
const uint16_t * scales_h = (uint16_t *)x[i].scales_h;
|
||||
const uint8_t sh = scales_h[ib/8] >> 2*(ib%8);
|
||||
const float dl = d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | ((sh << 4) & 0x30)) - 32);
|
||||
const int8_t * values = iq4k_values + (((x[i].extra >> ib) & 1) << 4);
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
y[j+ 0] = dl * values[q4[j+ 0] & 0xf];
|
||||
y[j+ 64] = dl * values[q4[j+ 0] >> 4];
|
||||
y[j+128] = dl * values[q4[j+64] & 0xf];
|
||||
y[j+192] = dl * values[q4[j+64] >> 4];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -184,6 +184,37 @@ __device__ __forceinline__ void get_int_from_table_16_shift(const uint32_t & q4,
|
||||
val2 = v1 | (v2 << 16);
|
||||
}
|
||||
|
||||
// Expand two packed 4-bit words (q4[0] and q4[1]) into four 32-bit ints of
// table-mapped bytes. `all_values` holds two consecutive 16-entry lookup
// tables; bit 0 of `shift` selects the table used for q4[0] and bit 1 the
// table used for q4[1]. Output layout:
//   low  nibbles of q4[0] -> val[0],  high nibbles of q4[0] -> val[2]
//   low  nibbles of q4[1] -> val[1],  high nibbles of q4[1] -> val[3]
__device__ __forceinline__ void get_int_from_table_16_shift_4(const uint32_t * q4, uint16_t shift, const uint8_t * all_values,
        int * val) {

    uint32_t packed;
    const uint8_t * b = (const uint8_t *)&packed;

    // Map the four bytes currently held in `packed` through `lut` and
    // repack the results into one 32-bit int (byte i of the result is
    // lut[b[i]]).
    auto lookup4 = [&](const uint8_t * lut) -> int {
        uint16_t lo = lut[b[0]] | (lut[b[1]] << 8);
        uint16_t hi = lut[b[2]] | (lut[b[3]] << 8);
        return lo | (hi << 16);
    };

    const uint8_t * lut1 = all_values + 16*(shift & 1);   // table half for q4[0]

    packed = q4[0] & 0x0f0f0f0f;
    val[0] = lookup4(lut1);

    packed = (q4[0] >> 4) & 0x0f0f0f0f;
    val[2] = lookup4(lut1);

    const uint8_t * lut2 = all_values + 8*(shift & 2);    // == 16*((shift >> 1) & 1)

    packed = q4[1] & 0x0f0f0f0f;
    val[1] = lookup4(lut2);

    packed = (q4[1] >> 4) & 0x0f0f0f0f;
    val[3] = lookup4(lut2);
}
|
||||
|
||||
#define VDR_IQ4_K_Q8_1_MMVQ 4
|
||||
#define VDR_IQ4_K_Q8_1_MMQ 4
|
||||
|
||||
@@ -193,25 +224,56 @@ __device__ __forceinline__ float vec_dot_iq4_k_q8_1(
|
||||
const block_iq4_k * bq4 = (const block_iq4_k *) vbq + kbx;
|
||||
const uint8_t * all_values = (const uint8_t *)iq4k_values;
|
||||
|
||||
// iqs is 0...28
|
||||
const int ib32 = iqs/4;
|
||||
// Why iqs/4 ?
|
||||
const int32_t * q8 = (const int *)bq8_1[ib32].qs;
|
||||
const uint16_t * q4 = (const uint16_t *)bq4->qs + 8*ib32;
|
||||
const uint16_t extra = bq4->extra >> 2*ib32;
|
||||
int v1, v2;
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
const uint32_t aux32 = q4[2*j+0] | (q4[2*j+1] << 16);
|
||||
get_int_from_table_16_shift(aux32, extra, all_values, v1, v2);
|
||||
sumi1 = ggml_cuda_dp4a(v1, q8[j+0], sumi1);
|
||||
sumi2 = ggml_cuda_dp4a(v2, q8[j+4], sumi2);
|
||||
}
|
||||
const float d = __half2float(bq4->d) * __low2float(bq8_1[ib32].ds);
|
||||
// iqs is 0, 4, 8, 12, 16, 20, 24, 28
|
||||
// we want iqs = 0 to process quants 0....7 + 64...71 + 128...135 + 192...199
|
||||
// = 4 to process quants 8...15 + 72...79 + 136...143 + 200...207
|
||||
// ...
|
||||
// = 28 to process quants 56..63 + 120..127+ 184...191 + 248...255
|
||||
const int ib32 = iqs/4; // 0...7
|
||||
const uint32_t * q4 = (const uint32_t *)bq4->qs + 2*ib32;
|
||||
const uint8_t sh = bq4->scales_h[ib32/2] >> 4*(ib32%2);
|
||||
const int ls1 = ((bq4->scales_l[ib32] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
||||
const int ls2 = ((bq4->scales_l[ib32] >> 4) | ((sh << 2) & 0x30)) - 32;
|
||||
return d * (sumi1 * ls1 + sumi2 * ls2);
|
||||
const uint16_t extra = bq4->extra >> 2*ib32;
|
||||
int val[4];
|
||||
|
||||
//const int32_t * q8 = (const int *)bq8_1[ib32].qs;
|
||||
|
||||
//get_int_from_table_16_shift_4(q4, extra, all_values, val);
|
||||
//int sumi1 = ggml_cuda_dp4a(val[0], q8[0], ggml_cuda_dp4a(val[2], q8[2], 0));
|
||||
//int sumi2 = ggml_cuda_dp4a(val[1], q8[1], ggml_cuda_dp4a(val[3], q8[3], 0));
|
||||
|
||||
//get_int_from_table_16_shift_4(q4+16, extra, all_values, val);
|
||||
//sumi1 = ggml_cuda_dp4a(val[0], q8[4], ggml_cuda_dp4a(val[2], q8[6], sumi1));
|
||||
//sumi2 = ggml_cuda_dp4a(val[1], q8[5], ggml_cuda_dp4a(val[3], q8[7], sumi2));
|
||||
|
||||
//return __half2float(bq4->d) * __low2float(bq8_1[ib32].ds) * (sumi1*ls1 + sumi2*ls2);
|
||||
|
||||
get_int_from_table_16_shift_4(q4, extra, all_values, val);
|
||||
const int32_t * q8 = (const int *)bq8_1[ib32/4].qs + 2*(ib32%4);
|
||||
int sumi1 = ggml_cuda_dp4a(val[0], q8[0], 0);
|
||||
int sumi2 = ggml_cuda_dp4a(val[1], q8[1], 0);
|
||||
float sumf = (sumi1*ls1 + sumi2*ls2) * __low2float(bq8_1[ib32/4].ds);
|
||||
|
||||
q8 = (const int *)bq8_1[ib32/4+2].qs + 2*(ib32%4);
|
||||
sumi1 = ggml_cuda_dp4a(val[2], q8[0], 0);
|
||||
sumi2 = ggml_cuda_dp4a(val[3], q8[1], 0);
|
||||
sumf += (sumi1*ls1 + sumi2*ls2) * __low2float(bq8_1[ib32/4+2].ds);
|
||||
|
||||
get_int_from_table_16_shift_4(q4+16, extra, all_values, val);
|
||||
|
||||
q8 = (const int *)bq8_1[ib32/4+4].qs + 2*(ib32%4);
|
||||
sumi1 = ggml_cuda_dp4a(val[0], q8[0], 0);
|
||||
sumi2 = ggml_cuda_dp4a(val[1], q8[1], 0);
|
||||
sumf += (sumi1*ls1 + sumi2*ls2) * __low2float(bq8_1[ib32/4+4].ds);
|
||||
|
||||
q8 = (const int *)bq8_1[ib32/4+6].qs + 2*(ib32%4);
|
||||
sumi1 = ggml_cuda_dp4a(val[2], q8[0], 0);
|
||||
sumi2 = ggml_cuda_dp4a(val[3], q8[1], 0);
|
||||
sumf += (sumi1*ls1 + sumi2*ls2) * __low2float(bq8_1[ib32/4+6].ds);
|
||||
|
||||
return __half2float(bq4->d) * sumf;
|
||||
|
||||
}
|
||||
|
||||
#define VDR_IQ4_KS_Q8_1_MMVQ 4
|
||||
|
||||
@@ -37,6 +37,48 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
||||
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||
}
|
||||
|
||||
/*
|
||||
static __global__ void quantize_q8_1_iqk(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
|
||||
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (ix0 >= kx0_padded) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t ix1 = blockIdx.y;
|
||||
|
||||
const int64_t i_padded = ix1*kx0_padded + ix0;
|
||||
|
||||
block_q8_1 * y = (block_q8_1 *) vy;
|
||||
|
||||
const int64_t ib256 = i_padded / QK_K; // block index
|
||||
const int64_t iqs256 = i_padded % QK_K; // quant index
|
||||
|
||||
const int64_t ib32 = 8*ib256 + iqs256/32;
|
||||
const int64_t iqs = iqs256%32;
|
||||
const int64_t idx = QK_K*ib256 + 8*(iqs%8) + 64*(iqs/8);
|
||||
|
||||
const float xi = idx < kx ? x[ix1*kx + idx] : 0.0f;
|
||||
float amax = fabsf(xi);
|
||||
float sum = xi;
|
||||
|
||||
amax = warp_reduce_max(amax);
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
const float d = amax / 127;
|
||||
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
|
||||
|
||||
y[ib32].qs[iqs] = q;
|
||||
|
||||
if (iqs > 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
reinterpret_cast<half&>(y[ib32].ds.x) = d;
|
||||
reinterpret_cast<half&>(y[ib32].ds.y) = sum;
|
||||
}
|
||||
*/
|
||||
|
||||
template <mmq_q8_1_ds_layout ds_layout>
|
||||
static __global__ void quantize_mmq_q8_1(
|
||||
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
|
||||
@@ -129,11 +171,16 @@ void quantize_row_q8_1_cuda(
|
||||
|
||||
GGML_ASSERT(kx0_padded % QK8_1 == 0);
|
||||
|
||||
//printf("%s: kx0=%d, kx1=%d, channels=%d, kx0_padded=%d\n", __func__, (int)kx0, (int)kx1, (int)channels, (int)kx0_padded);
|
||||
|
||||
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||
const dim3 num_blocks(block_num_x, kx1*channels, 1);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
||||
//switch (type_x) {
|
||||
// case GGML_TYPE_IQ4_K: quantize_q8_1_iqk<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded); break;
|
||||
// default: quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
|
||||
//}
|
||||
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
|
||||
|
||||
GGML_UNUSED(type_x);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user