diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 8b5d73f6..00873651 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1943,6 +1943,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) + && ggml_cuda_mmvq_type_supported(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; bool use_mul_mat_q = ggml_is_quantized(src0->type) diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 12738240..114d0e7e 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -1,3 +1,10 @@ +// +// Copyright (C) 2023-2024 The ggml authors +// Copyright (C) 2024 Iwan Kawrakow +// MIT license +// SPDX-License-Identifier: MIT +// + #include "dmmv.cuh" #include "dequantize.cuh" #include "convert.cuh" @@ -8,6 +15,54 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); #endif +static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, + const int ncols, int nrows, int64_t row_size) { + + constexpr uint32_t ka = 89226354; + constexpr uint32_t kb = 64248484; + constexpr uint32_t kmask = 0x8fff8fff; + constexpr uint32_t km32 = 0x3b603b60; + + const int row = blockIdx.x*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const float * dptr = (const float *)((const char *)vx + row*row_size); + const float d = *dptr * 31.75f * 1.05f; + const block_iq2_kt * x = (const block_iq2_kt *)(dptr + 1); + + const int num_blocks_per_row = ncols / QK_K; + + float tmp = 0; // partial sum for thread in warp + + const int it = threadIdx.x; + + uint32_t s; + const half * h = (const half *)&s; + + for (int i = 0; i < num_blocks_per_row; ++i) { + const float * y = yy + i * QK_K + 8*it; + const float dl = iq4k_values[(x[i].scales[(it/4)%4] >> 4*(it/16)) & 0xf]; + const uint16_t * ql = (const uint16_t *)x[i].ql; + float bdot = 0; + uint32_t val = ql[it] + 4096; + for (int k = 0; k < 8; ++k) { + val = ka*val + kb; + s = (val & kmask) ^ km32; + //float q = (float)h[0] + (float)h[1]; + float q = (float)(h[0] + h[1]); + bdot += q * y[k]; + } + tmp += dl*bdot; + } + + // sum up partial sums and write back result + tmp = warp_reduce_sum(tmp); + + if (threadIdx.x == 0) { + dst[row] = tmp*d; + } +} + static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -554,6 +609,16 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f dequantize_mul_mat_vec_q2_k<<>>(vx, y, dst, ncols, nrows); } +static void dequantize_mul_mat_vec_iq2_kt_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 1; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_KT, ncols); + dequantize_mul_mat_vec_iq2_kt<<>>(vx, y, dst, ncols, nrows, row_size); +} + static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int ny = 2 / K_QUANTS_PER_ITERATION; @@ -646,6 +711,9 @@ void ggml_cuda_op_dequantize_mul_mat_vec( case GGML_TYPE_Q2_K: dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ2_KT: + dequantize_mul_mat_vec_iq2_kt_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q3_K: dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); break; @@ -679,5 +747,6 @@ bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) { src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K || src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K || + src0_type == GGML_TYPE_IQ2_KT || src0_type == GGML_TYPE_F16; } diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index b70f0a70..0c4c1aef 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -446,9 +446,9 @@ void ggml_cuda_op_mul_mat_vec_q( case GGML_TYPE_IQ2_KS: mul_mat_vec_iq2_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; - case GGML_TYPE_IQ2_KT: - mul_mat_vec_iq2_kt_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; + //case GGML_TYPE_IQ2_KT: + // mul_mat_vec_iq2_kt_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + // break; case GGML_TYPE_IQ5_K: mul_mat_vec_iq5_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; @@ -469,3 +469,42 @@ void ggml_cuda_op_mul_mat_vec_q( GGML_UNUSED(src1_ncols); GGML_UNUSED(src1_padded_row_size); } + +bool ggml_cuda_mmvq_type_supported(ggml_type src0_type) { + switch (src0_type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q6_0: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ1_BN: + case GGML_TYPE_IQ2_BN: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ2_K: + case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ4_K: + case GGML_TYPE_IQ4_KS: + case GGML_TYPE_IQ4_KSS: + case GGML_TYPE_IQ2_KS: + //case GGML_TYPE_IQ2_KT: + case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: + case GGML_TYPE_IQ3_S: + return true; + default: + return false; + } +} diff --git a/ggml/src/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh index d9e42fdd..e8ec6850 100644 --- a/ggml/src/ggml-cuda/mmvq.cuh +++ b/ggml/src/ggml-cuda/mmvq.cuh @@ -7,3 +7,5 @@ void ggml_cuda_op_mul_mat_vec_q( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); + +bool ggml_cuda_mmvq_type_supported(ggml_type src0_type);