iq2_kt: CUDA dot product

Implemented as DMMV.
Very slow - just 81 t/s for LLaMA-3.1-8B.
Then again, Q2_K_S with forced to use DMMV only
gets 112 t/s vs 145 t/s via MMVQ. My memory is that
when the DMMV kernels were properly maintained/used,
DMMV was about on par with MMVQ for k-quants on my GPU.
This commit is contained in:
Iwan Kawrakow
2024-11-07 11:01:11 +02:00
parent b3dfe9984b
commit d2331b9287
4 changed files with 114 additions and 3 deletions

View File

@@ -1943,6 +1943,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& ggml_cuda_mmvq_type_supported(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type)

View File

@@ -1,3 +1,10 @@
//
// Copyright (C) 2023-2024 The ggml authors
// Copyright (C) 2024 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include "dmmv.cuh"
#include "dequantize.cuh"
#include "convert.cuh"
@@ -8,6 +15,54 @@
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif
static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst,
const int ncols, int nrows, int64_t row_size) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t kmask = 0x8fff8fff;
constexpr uint32_t km32 = 0x3b603b60;
const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
const float * dptr = (const float *)((const char *)vx + row*row_size);
const float d = *dptr * 31.75f * 1.05f;
const block_iq2_kt * x = (const block_iq2_kt *)(dptr + 1);
const int num_blocks_per_row = ncols / QK_K;
float tmp = 0; // partial sum for thread in warp
const int it = threadIdx.x;
uint32_t s;
const half * h = (const half *)&s;
for (int i = 0; i < num_blocks_per_row; ++i) {
const float * y = yy + i * QK_K + 8*it;
const float dl = iq4k_values[(x[i].scales[(it/4)%4] >> 4*(it/16)) & 0xf];
const uint16_t * ql = (const uint16_t *)x[i].ql;
float bdot = 0;
uint32_t val = ql[it] + 4096;
for (int k = 0; k < 8; ++k) {
val = ka*val + kb;
s = (val & kmask) ^ km32;
//float q = (float)h[0] + (float)h[1];
float q = (float)(h[0] + h[1]);
bdot += q * y[k];
}
tmp += dl*bdot;
}
// sum up partial sums and write back result
tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[row] = tmp*d;
}
}
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -554,6 +609,16 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
static void dequantize_mul_mat_vec_iq2_kt_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 1;
const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(32, ny, 1);
const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_KT, ncols);
dequantize_mul_mat_vec_iq2_kt<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows, row_size);
}
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2 / K_QUANTS_PER_ITERATION;
@@ -646,6 +711,9 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
case GGML_TYPE_Q2_K:
dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_IQ2_KT:
dequantize_mul_mat_vec_iq2_kt_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q3_K:
dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
break;
@@ -679,5 +747,6 @@ bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
src0_type == GGML_TYPE_IQ2_KT ||
src0_type == GGML_TYPE_F16;
}

View File

@@ -446,9 +446,9 @@ void ggml_cuda_op_mul_mat_vec_q(
case GGML_TYPE_IQ2_KS:
mul_mat_vec_iq2_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ2_KT:
mul_mat_vec_iq2_kt_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
//case GGML_TYPE_IQ2_KT:
// mul_mat_vec_iq2_kt_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
// break;
case GGML_TYPE_IQ5_K:
mul_mat_vec_iq5_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
@@ -469,3 +469,42 @@ void ggml_cuda_op_mul_mat_vec_q(
GGML_UNUSED(src1_ncols);
GGML_UNUSED(src1_padded_row_size);
}
bool ggml_cuda_mmvq_type_supported(ggml_type src0_type) {
switch (src0_type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q6_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ4_KS:
case GGML_TYPE_IQ4_KSS:
case GGML_TYPE_IQ2_KS:
//case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ5_K:
case GGML_TYPE_IQ6_K:
case GGML_TYPE_IQ3_S:
return true;
default:
return false;
}
}

View File

@@ -7,3 +7,5 @@ void ggml_cuda_op_mul_mat_vec_q(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);
bool ggml_cuda_mmvq_type_supported(ggml_type src0_type);