mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-25 07:34:10 +00:00
POC per row scale: CUDA
This commit is contained in:
@@ -154,19 +154,23 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_tn(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
static __global__ void dequantize_block_iq2_tn(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
int64_t n_per_row, int64_t row_size) {
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq2_tn * x = (const block_iq2_tn *) vx;
|
||||
int64_t ii = blockIdx.x;
|
||||
int64_t row = (QK_K * ii) / n_per_row;
|
||||
const char * cx = (const char *)vx + row * row_size;
|
||||
float d = *(const float *)cx;
|
||||
const block_iq2_tn * x = (const block_iq2_tn *)(cx + sizeof(float));
|
||||
int64_t i = ii - (row*n_per_row)/QK_K;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t n = tid/32;
|
||||
const int64_t l = tid - 32*n;
|
||||
|
||||
const uint8_t q = x[i].qs[32*n + l];
|
||||
dst_t * y = yy + i*QK_K + 128*n;
|
||||
dst_t * y = yy + ii*QK_K + 128*n;
|
||||
|
||||
float d = __half2float(x[i].d);
|
||||
y[l+ 0] = d * ((q >> 0) & 3) - d;
|
||||
y[l+32] = d * ((q >> 2) & 3) - d;
|
||||
y[l+64] = d * ((q >> 4) & 3) - d;
|
||||
@@ -743,8 +747,9 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t n
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq2_tn_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
|
||||
const int64_t k = nrows * n_per_row;
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq2_tn<<<nb, 64, 0, stream>>>(vx, y);
|
||||
const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_TN, n_per_row);
|
||||
const int nb = (k + 255) / 256;
|
||||
dequantize_block_iq2_tn<<<nb, 64, 0, stream>>>(vx, y, n_per_row, row_size);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
|
||||
@@ -519,7 +519,8 @@ __device__ __forceinline__ float vec_dot_iq3_k_q8_1(
|
||||
static __device__ __forceinline__ float vec_dot_iq2_tn_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
||||
|
||||
const block_iq2_tn * bq2 = (const block_iq2_tn *) vbq + kbx;
|
||||
float scale = *(const float *)vbq;
|
||||
const block_iq2_tn * bq2 = (const block_iq2_tn *)((const char *)vbq + sizeof(float)) + kbx;
|
||||
|
||||
const int bq8_offset = QR2_K * (iqs / QI8_1);
|
||||
|
||||
@@ -533,19 +534,7 @@ static __device__ __forceinline__ float vec_dot_iq2_tn_q8_1(
|
||||
sumf += d8 * (ggml_cuda_dp4a(v & 0x03030303, u, 0) - ggml_cuda_dp4a(0x01010101, u, 0));
|
||||
v >>= 2;
|
||||
}
|
||||
return __half2float(bq2->d) * sumf;
|
||||
|
||||
//float sumf_d = 0;
|
||||
//float sumf_m = 0;
|
||||
//for (int i = 0; i < QR2_K; ++ i) {
|
||||
// int u = *((const int *)bq8_1[bq8_offset + i].qs + iqs % QI8_1);
|
||||
// float2 d8 = __half22float2(bq8_1[bq8_offset + i].ds);
|
||||
// sumf_d += d8.x * ggml_cuda_dp4a(v & 0x03030303, u, 0);
|
||||
// sumf_m += d8.y;
|
||||
// v >>= 2;
|
||||
//}
|
||||
//return __half2float(bq2->d) * (sumf_d - 0.125f * sumf_m);
|
||||
|
||||
return scale * sumf;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq1_tn_q8_1(
|
||||
|
||||
Reference in New Issue
Block a user