Faster iq4_k: CUDA

Not actually faster than the original.
This commit is contained in:
Iwan Kawrakow
2024-11-04 14:37:50 +02:00
parent 48974c7acd
commit 68e6d168a2
3 changed files with 139 additions and 29 deletions

View File

@@ -622,19 +622,20 @@ static __global__ void dequantize_block_iq4_k(const void * __restrict__ vx, dst_
const block_iq4_k * x = (const block_iq4_k *)vx;
const int64_t tid = threadIdx.x;
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
const int64_t il = tid/16; // 0 or 1
const int64_t ib = tid%16; // 0...15
dst_t * y = yy + i*QK_K + 4*ib + 2*il;
const uint8_t * q4 = x[i].qs + 4*ib + 2*il;
const float d = (float)x[i].d;
const uint8_t sh = x[i].scales_h[ib/2] >> 4*(ib%2);
const float d1 = d * (((x[i].scales_l[ib] & 0xf) | ((sh << 4) & 0x30)) - 32);
const float d2 = d * (((x[i].scales_l[ib] >> 4) | ((sh << 2) & 0x30)) - 32);
const int8_t * values1 = iq4k_values + 16*((x[i].extra >> (2*ib+0)) & 1);
const int8_t * values2 = iq4k_values + 16*((x[i].extra >> (2*ib+1)) & 1);
for (int j = 0; j < 4; ++j) {
y[j+ 0] = d1 * values1[q4[j] & 0xf];
y[j+16] = d2 * values2[q4[j] >> 4];
const uint16_t * scales_h = (uint16_t *)x[i].scales_h;
const uint8_t sh = scales_h[ib/8] >> 2*(ib%8);
const float dl = d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | ((sh << 4) & 0x30)) - 32);
const int8_t * values = iq4k_values + (((x[i].extra >> ib) & 1) << 4);
for (int j = 0; j < 2; ++j) {
y[j+ 0] = dl * values[q4[j+ 0] & 0xf];
y[j+ 64] = dl * values[q4[j+ 0] >> 4];
y[j+128] = dl * values[q4[j+64] & 0xf];
y[j+192] = dl * values[q4[j+64] >> 4];
}
}

View File

@@ -184,6 +184,37 @@ __device__ __forceinline__ void get_int_from_table_16_shift(const uint32_t & q4,
val2 = v1 | (v2 << 16);
}
// Expands four groups of eight 4-bit quants into four packed int32 results by
// looking each nibble up in a 16-entry signed-value table.
//   val[k]   = table-mapped low  nibbles of q4[k]   (k = 0, 1)
//   val[k+2] = table-mapped high nibbles of q4[k]
// Bit k of `shift` selects which 16-entry half of `all_values` is used for
// word k (both the low- and high-nibble lookups of that word share the half).
__device__ __forceinline__ void get_int_from_table_16_shift_4(const uint32_t * q4, uint16_t shift, const uint8_t * all_values,
        int * val) {
    uint32_t nibbles; const uint8_t * idx = (const uint8_t *)&nibbles;
    for (int k = 0; k < 2; ++k) {
        // Bit k of `shift` picks the first or second 16-entry table half.
        const uint8_t * values = all_values + 16*((shift >> k) & 1);
        nibbles = q4[k] & 0x0f0f0f0f;          // low nibbles of word k
        uint16_t lo = values[idx[0]] | (values[idx[1]] << 8);
        uint16_t hi = values[idx[2]] | (values[idx[3]] << 8);
        val[k] = lo | (hi << 16);
        nibbles = (q4[k] >> 4) & 0x0f0f0f0f;   // high nibbles of word k
        lo = values[idx[0]] | (values[idx[1]] << 8);
        hi = values[idx[2]] | (values[idx[3]] << 8);
        val[k+2] = lo | (hi << 16);
    }
}
#define VDR_IQ4_K_Q8_1_MMVQ 4
#define VDR_IQ4_K_Q8_1_MMQ 4
@@ -193,25 +224,56 @@ __device__ __forceinline__ float vec_dot_iq4_k_q8_1(
const block_iq4_k * bq4 = (const block_iq4_k *) vbq + kbx;
const uint8_t * all_values = (const uint8_t *)iq4k_values;
// iqs is 0...28
const int ib32 = iqs/4;
// Why iqs/4 ?
const int32_t * q8 = (const int *)bq8_1[ib32].qs;
const uint16_t * q4 = (const uint16_t *)bq4->qs + 8*ib32;
const uint16_t extra = bq4->extra >> 2*ib32;
int v1, v2;
int sumi1 = 0, sumi2 = 0;
for (int j = 0; j < 4; ++j) {
const uint32_t aux32 = q4[2*j+0] | (q4[2*j+1] << 16);
get_int_from_table_16_shift(aux32, extra, all_values, v1, v2);
sumi1 = ggml_cuda_dp4a(v1, q8[j+0], sumi1);
sumi2 = ggml_cuda_dp4a(v2, q8[j+4], sumi2);
}
const float d = __half2float(bq4->d) * __low2float(bq8_1[ib32].ds);
// iqs is 0, 4, 8, 12, 16, 20, 24, 28
// we want iqs = 0 to process quants 0....7 + 64...71 + 128...135 + 192...199
// = 4 to process quants 8...15 + 72...79 + 136...143 + 200...207
// ...
// = 28 to process quants 56..63 + 120..127 + 184..191 + 248..255
const int ib32 = iqs/4; // 0...7
const uint32_t * q4 = (const uint32_t *)bq4->qs + 2*ib32;
const uint8_t sh = bq4->scales_h[ib32/2] >> 4*(ib32%2);
const int ls1 = ((bq4->scales_l[ib32] & 0xf) | ((sh << 4) & 0x30)) - 32;
const int ls2 = ((bq4->scales_l[ib32] >> 4) | ((sh << 2) & 0x30)) - 32;
return d * (sumi1 * ls1 + sumi2 * ls2);
const uint16_t extra = bq4->extra >> 2*ib32;
int val[4];
//const int32_t * q8 = (const int *)bq8_1[ib32].qs;
//get_int_from_table_16_shift_4(q4, extra, all_values, val);
//int sumi1 = ggml_cuda_dp4a(val[0], q8[0], ggml_cuda_dp4a(val[2], q8[2], 0));
//int sumi2 = ggml_cuda_dp4a(val[1], q8[1], ggml_cuda_dp4a(val[3], q8[3], 0));
//get_int_from_table_16_shift_4(q4+16, extra, all_values, val);
//sumi1 = ggml_cuda_dp4a(val[0], q8[4], ggml_cuda_dp4a(val[2], q8[6], sumi1));
//sumi2 = ggml_cuda_dp4a(val[1], q8[5], ggml_cuda_dp4a(val[3], q8[7], sumi2));
//return __half2float(bq4->d) * __low2float(bq8_1[ib32].ds) * (sumi1*ls1 + sumi2*ls2);
get_int_from_table_16_shift_4(q4, extra, all_values, val);
const int32_t * q8 = (const int *)bq8_1[ib32/4].qs + 2*(ib32%4);
int sumi1 = ggml_cuda_dp4a(val[0], q8[0], 0);
int sumi2 = ggml_cuda_dp4a(val[1], q8[1], 0);
float sumf = (sumi1*ls1 + sumi2*ls2) * __low2float(bq8_1[ib32/4].ds);
q8 = (const int *)bq8_1[ib32/4+2].qs + 2*(ib32%4);
sumi1 = ggml_cuda_dp4a(val[2], q8[0], 0);
sumi2 = ggml_cuda_dp4a(val[3], q8[1], 0);
sumf += (sumi1*ls1 + sumi2*ls2) * __low2float(bq8_1[ib32/4+2].ds);
get_int_from_table_16_shift_4(q4+16, extra, all_values, val);
q8 = (const int *)bq8_1[ib32/4+4].qs + 2*(ib32%4);
sumi1 = ggml_cuda_dp4a(val[0], q8[0], 0);
sumi2 = ggml_cuda_dp4a(val[1], q8[1], 0);
sumf += (sumi1*ls1 + sumi2*ls2) * __low2float(bq8_1[ib32/4+4].ds);
q8 = (const int *)bq8_1[ib32/4+6].qs + 2*(ib32%4);
sumi1 = ggml_cuda_dp4a(val[2], q8[0], 0);
sumi2 = ggml_cuda_dp4a(val[3], q8[1], 0);
sumf += (sumi1*ls1 + sumi2*ls2) * __low2float(bq8_1[ib32/4+6].ds);
return __half2float(bq4->d) * sumf;
}
#define VDR_IQ4_KS_Q8_1_MMVQ 4

View File

@@ -37,6 +37,48 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
reinterpret_cast<half&>(y[ib].ds.y) = sum;
}
/*
// Experimental q8_1 quantization kernel with a permuted source-load index,
// intended to match the interleaved access pattern of the iq4_k dot product.
// Currently disabled: it is enclosed in a block comment and the launcher below
// unconditionally uses the plain quantize_q8_1 kernel instead.
// Launch layout (assumed from quantize_row_q8_1_cuda): grid.x covers
// kx0_padded with blockDim.x threads, grid.y = row index -- TODO confirm.
static __global__ void quantize_q8_1_iqk(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
// Flat quant position of this thread within the padded row.
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (ix0 >= kx0_padded) {
return;
}
const int64_t ix1 = blockIdx.y; // row index
const int64_t i_padded = ix1*kx0_padded + ix0; // flat destination quant index
block_q8_1 * y = (block_q8_1 *) vy;
const int64_t ib256 = i_padded / QK_K; // 256-quant super-block index
const int64_t iqs256 = i_padded % QK_K; // position within the super-block
const int64_t ib32 = 8*ib256 + iqs256/32; // destination 32-quant q8_1 block
const int64_t iqs = iqs256%32; // position within that 32-quant block
// Permuted source index: groups of 8 quants interleaved with stride 64.
// NOTE(review): idx depends only on iqs (= iqs256%32), not on iqs256/32, so
// all eight 32-quant sub-blocks of a super-block read the SAME 32 source
// values -- looks wrong; presumably why this kernel was left disabled.
const int64_t idx = QK_K*ib256 + 8*(iqs%8) + 64*(iqs/8);
const float xi = idx < kx ? x[ix1*kx + idx] : 0.0f; // zero-pad past row end
float amax = fabsf(xi);
float sum = xi;
// One warp spans exactly one 32-quant block (consecutive ix0), so warp
// reductions yield the per-block absmax and sum -- assumes blockDim.x is a
// multiple of 32; TODO confirm against the launcher.
amax = warp_reduce_max(amax);
sum = warp_reduce_sum(sum);
const float d = amax / 127;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
y[ib32].qs[iqs] = q;
// Only the iqs == 0 lane of each 32-quant block writes the block scale/sum.
if (iqs > 0) {
return;
}
reinterpret_cast<half&>(y[ib32].ds.x) = d;
reinterpret_cast<half&>(y[ib32].ds.y) = sum;
}
*/
template <mmq_q8_1_ds_layout ds_layout>
static __global__ void quantize_mmq_q8_1(
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
@@ -129,11 +171,16 @@ void quantize_row_q8_1_cuda(
GGML_ASSERT(kx0_padded % QK8_1 == 0);
//printf("%s: kx0=%d, kx1=%d, channels=%d, kx0_padded=%d\n", __func__, (int)kx0, (int)kx1, (int)channels, (int)kx0_padded);
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, kx1*channels, 1);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
//switch (type_x) {
// case GGML_TYPE_IQ4_K: quantize_q8_1_iqk<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded); break;
// default: quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
//}
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
GGML_UNUSED(type_x);
}