From d696d64fdede1621c914db797eb73a84fc9906af Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 24 Oct 2024 15:34:19 +0300 Subject: [PATCH] Remove iq1_tn and iq2_tn - Part 2 --- ggml/src/ggml-cuda.cu | 2 - ggml/src/ggml-cuda/binbcast.cu | 2 +- ggml/src/ggml-cuda/common.cuh | 14 ----- ggml/src/ggml-cuda/convert.cu | 88 -------------------------------- ggml/src/ggml-cuda/iqk_mmvq.cu | 90 --------------------------------- ggml/src/ggml-cuda/iqk_mmvq.cuh | 8 --- ggml/src/ggml-cuda/mmvq.cu | 6 --- ggml/src/iqk/iqk_mul_mat.cpp | 2 +- ggml/src/iqk/iqk_quantize.cpp | 9 ++-- 9 files changed, 6 insertions(+), 215 deletions(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 9051863b..6759e202 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2841,9 +2841,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_TYPE_IQ5_K: case GGML_TYPE_IQ6_K: case GGML_TYPE_IQ1_BN: - case GGML_TYPE_IQ1_TN: case GGML_TYPE_IQ2_BN: - case GGML_TYPE_IQ2_TN: return true; default: return false; diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 62d115f1..5abbd43c 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -288,7 +288,7 @@ static void scale_f32_cuda_l(const float * x, float * dst, const void * data, co scale_f32_l<<>>(x, dst, data, k); } -void ggml_cuda_op_scale_tensor(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +static void ggml_cuda_op_scale_tensor(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; float * dst_d = (float *)dst->data; diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index a5658a24..2eba527f 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -473,13 +473,6 @@ struct ggml_cuda_type_traits { static constexpr int qi = QI1_BN; }; -template<> -struct ggml_cuda_type_traits { - static constexpr int qk = QK_IQ1BN; - static constexpr int qr = QR1_BN; - static constexpr int qi = QI1_BN; -}; - template<> struct ggml_cuda_type_traits { static constexpr int qk = QK_IQ1BN; @@ -487,13 +480,6 @@ struct ggml_cuda_type_traits { static constexpr int qi = QI1_BN; }; -template<> -struct ggml_cuda_type_traits { - static constexpr int qk = QK_K; - static constexpr int qr = QR2_K; - static constexpr int qi = QI2_K; -}; - template<> struct ggml_cuda_type_traits { static constexpr int qk = QK4_NL; diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 0e7d8df5..b9baee1b 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -183,30 +183,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); } -template -static __global__ void dequantize_block_iq2_tn(const void * __restrict__ vx, dst_t * __restrict__ yy, - int64_t n_per_row, int64_t row_size) { - - int64_t ii = blockIdx.x; - int64_t row = (QK_K * ii) / n_per_row; - const char * cx = (const char *)vx + row * row_size; - float d = *(const float *)cx; - const block_iq2_tn * x = (const block_iq2_tn *)(cx + sizeof(float)); - int64_t i = ii - (row*n_per_row)/QK_K; - - const int64_t tid = threadIdx.x; - const int64_t n = tid/32; - const int64_t l = tid - 32*n; - - const uint8_t q = x[i].qs[32*n + l]; - dst_t * y = yy + ii*QK_K + 128*n; - - y[l+ 0] = d * ((q >> 0) & 3) - d; - y[l+32] = d * ((q >> 2) & 3) - d; - y[l+64] = d * ((q >> 4) & 3) - d; - y[l+96] = d * ((q >> 6) & 3) - d; -} - template static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -480,46 +456,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_ } } -template -static __global__ void dequantize_block_iq1_tn(const void * __restrict__ vx, dst_t * __restrict__ yy, - int64_t n_per_row, int64_t row_size) { - - int64_t ii = blockIdx.x; - int64_t row = (QK_K * ii) / n_per_row; - const char * cx = (const char *)vx + row * row_size; - float scale = *(const half *)cx; - const block_iq1_bn * x = (const block_iq1_bn *)(cx + sizeof(half)); - - static const uint8_t k_mult[5] = {81, 27, 9, 3, 1}; - -//#define COMPUTE_VS(v) 3*v >> 8 -#define COMPUTE_VS(v) (v + (v >> 1)) >> 7 - - const int tid = threadIdx.x; - const int il = tid/4; // 0...7 - const int ib = tid%4; // 0...3 - dst_t * y = yy + ii*QK_K + 64*ib + 8*il; - const int i16 = il/2; - int64_t i = QK_K/QK_IQ1BN * (ii - (row*n_per_row)/QK_K) + ib; - uint8_t q = x[i].ql[3*i16+2*(il%2)]; - for (int j = 0; j < 5; ++j) { - uint8_t v = k_mult[j]*q; - int8_t vs = COMPUTE_VS(v); - y[2*(il%2)+j] = scale*(vs - 1); - } - q = x[i].ql[3*i16+1]; - for (int j = 0; j < 2; ++j) { - uint8_t v = k_mult[3*(il%2)+j]*q; - int8_t vs = COMPUTE_VS(v); - y[5*(1-(il%2))+j] = scale*(vs-1); - } - uint8_t v = (il%2) ? k_mult[i16]*x[i].extra : k_mult[2]*q; - int8_t vs = COMPUTE_VS(v); - y[7] = scale*(vs - 1); - -#undef COMPUTE_VS -} - template static __global__ void dequantize_block_iq1_bn(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size, int64_t nrows) { @@ -867,14 +803,6 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t n dequantize_block_q2_K<<>>(vx, y); } -template -static void dequantize_row_iq2_tn_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { - const int64_t k = nrows * n_per_row; - const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_TN, n_per_row); - const int nb = (k + 255) / 256; - dequantize_block_iq2_tn<<>>(vx, y, n_per_row, row_size); -} - template static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { const int64_t k = nrows * n_per_row; @@ -991,14 +919,6 @@ static void dequantize_row_iq1_bn_cuda(const void * vx, dst_t * y, const int64_t dequantize_block_iq1_bn<<>>(vx, y, n_per_row, row_size, nrows); } -template -static void dequantize_row_iq1_tn_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { - const int64_t k = nrows * n_per_row; - const int64_t row_size = ggml_row_size(GGML_TYPE_IQ1_TN, n_per_row); - const int nb = (k + 255) / 256; - dequantize_block_iq1_tn<<>>(vx, y, n_per_row, row_size); -} - template static void dequantize_row_iq2_bn_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { const int64_t k = nrows * n_per_row; @@ -1168,8 +1088,6 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_block_cuda; case GGML_TYPE_Q2_K: return dequantize_row_q2_K_cuda; - case GGML_TYPE_IQ2_TN: - return dequantize_row_iq2_tn_cuda; case GGML_TYPE_Q3_K: return dequantize_row_q3_K_cuda; case GGML_TYPE_Q4_K: @@ -1192,8 +1110,6 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq1_m_cuda; case GGML_TYPE_IQ1_BN: return dequantize_row_iq1_bn_cuda; - case GGML_TYPE_IQ1_TN: - return dequantize_row_iq1_tn_cuda; case GGML_TYPE_IQ2_BN: return dequantize_row_iq2_bn_cuda; case GGML_TYPE_IQ4_NL: @@ -1243,8 +1159,6 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_block_cuda; case GGML_TYPE_Q2_K: return dequantize_row_q2_K_cuda; - case GGML_TYPE_IQ2_TN: - return dequantize_row_iq2_tn_cuda; case GGML_TYPE_Q3_K: return dequantize_row_q3_K_cuda; case GGML_TYPE_Q4_K: @@ -1267,8 +1181,6 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq1_m_cuda; case GGML_TYPE_IQ1_BN: return dequantize_row_iq1_bn_cuda; - case GGML_TYPE_IQ1_TN: - return dequantize_row_iq1_tn_cuda; case GGML_TYPE_IQ2_BN: return dequantize_row_iq2_bn_cuda; case GGML_TYPE_IQ4_NL: diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu index 3b329303..795243e7 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq.cu +++ b/ggml/src/ggml-cuda/iqk_mmvq.cu @@ -626,82 +626,6 @@ __device__ __forceinline__ float vec_dot_iq3_k_q8_1( } -#define VDR_IQ2_TN_Q8_1_MMVQ 1 -#define VDR_IQ2_TN_Q8_1_MMQ 4 - -static __device__ __forceinline__ float vec_dot_iq2_tn_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - - float scale = *(const float *)vbq; - const block_iq2_tn * bq2 = (const block_iq2_tn *)((const char *)vbq + sizeof(float)) + kbx; - - const int bq8_offset = QR2_K * (iqs / QI8_1); - - const uint16_t * q16 = (const uint16_t *)bq2->qs + 2*iqs; - int v = q16[0] | (q16[1] << 16); - - float sumf = 0; - for (int i = 0; i < QR2_K; ++ i) { - int u = *((const int *)bq8_1[bq8_offset + i].qs + iqs % QI8_1); - float d8 = __low2float(bq8_1[bq8_offset + i].ds); - sumf += d8 * (ggml_cuda_dp4a(v & 0x03030303, u, 0) - ggml_cuda_dp4a(0x01010101, u, 0)); - v >>= 2; - } - return scale * sumf; -} - -static __device__ __forceinline__ float vec_dot_iq1_tn_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - - float scale = *(const half *)vbq; - const block_iq1_bn * bq1 = (const block_iq1_bn *)((const char *)vbq + sizeof(half)) + kbx; - - static const uint8_t k_mult[5] = {81, 27, 9, 3, 1}; - - // iqs is 0 or 1 - - int sumi = 0; -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const int * q8 = (const int *)bq8_1[iqs].qs; - int val[4]; - for (int l = 0; l < 2; ++l) { - int8_t * a = (int8_t *)val; - const int i16 = 2*iqs + l; - for (int k = 0; k < 3; ++k) { - uint8_t q = bq1->ql[3*i16+k]; - for (int j = 0; j < 5; ++j) { - uint8_t v = k_mult[j]*q; - int8_t vs = 3*v >> 8; //(v + (v >> 1)) >> 7; - *a++ = vs-1; - } - } - uint8_t v = k_mult[i16]*bq1->extra; - int8_t vs = 3*v >> 8; //(v + (v >> 1)) >> 7; - *a++ = vs-1; - sumi = __dp4a(val[0], q8[4*l+0], __dp4a(val[1], q8[4*l+1], __dp4a(val[2], q8[4*l+2], __dp4a(val[3], q8[4*l+3], sumi)))); - } -#else - const int8_t * q8 = bq8_1[iqs].qs; - for (int l = 0; l < 2; ++l) { - const int i16 = 2*iqs + l; - for (int k = 0; k < 3; ++k) { - uint8_t q = bq1->ql[3*i16+k]; - for (int j = 0; j < 5; ++j) { - uint8_t v = k_mult[j]*q; - int8_t vs = (v + (v >> 1)) >> 7; - sumi += q8[j]*(vs - 1); - } - q8 += 5; - } - uint8_t v = k_mult[i16]*bq1->extra; - int8_t vs = (v + (v >> 1)) >> 7; - sumi += q8[0]*(vs - 1); - q8++; - } -#endif - return __low2float(bq8_1[iqs].ds) * scale * sumi; -} - __device__ __forceinline__ float vec_dot_iq1_bn_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { @@ -854,13 +778,6 @@ void mul_mat_vec_iq6_k_q8_1_cuda( iqk_mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } -void mul_mat_vec_iq2_tn_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - iqk_mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - void mul_mat_vec_iq1_bn_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { @@ -872,10 +789,3 @@ void mul_mat_vec_iq2_bn_q8_1_cuda( const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { iqk_mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } - -void mul_mat_vec_iq1_tn_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - iqk_mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cuh b/ggml/src/ggml-cuda/iqk_mmvq.cuh index 372632da..1693a73a 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq.cuh +++ b/ggml/src/ggml-cuda/iqk_mmvq.cuh @@ -20,14 +20,6 @@ void mul_mat_vec_iq6_k_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream); -void mul_mat_vec_iq2_tn_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream); - -void mul_mat_vec_iq1_tn_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream); - void mul_mat_vec_iq4_ks_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 70ff47a6..cdf13533 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -422,12 +422,6 @@ void ggml_cuda_op_mul_mat_vec_q( case GGML_TYPE_IQ2_BN: mul_mat_vec_iq2_bn_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; - case GGML_TYPE_IQ2_TN: - mul_mat_vec_iq2_tn_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ1_TN: - mul_mat_vec_iq1_tn_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; case GGML_TYPE_IQ4_NL: mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index 2701643c..d7682e54 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -1244,7 +1244,7 @@ struct DequantizerIQ4KSS final : public BaseDequantizer { Q4Bits bits; Scales8KBase s8k; const __m512i values; - const __m512i mask15 = _mm512_set1_epi16(0xfffe); + const __m512i mask15 = _mm512_set1_epi16(-2); // value is 0xfffe, but to shut up the stupid compiler warning we use the signed value const __m512i mask1 = _mm512_set1_epi16(1); const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 6d014e06..b9d48237 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -2803,7 +2803,6 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy, continue; } float best = 0; - bool is_shifted = false; float d = -max/iq4k_values[0]; std::memset(vs, 0, block_size); for (int itry = -ntry; itry <= ntry; ++itry) { @@ -2836,10 +2835,10 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy, } bool copy_p = false, copy_m = false; if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { - d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = false; copy_p = true; + d = sumqx_p/sumq2_p; best = d * sumqx_p; copy_p = true; } if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { - d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = false; copy_m = true; + d = sumqx_m/sumq2_m; best = d * sumqx_m; copy_m = true; } if (copy_m) { std::memcpy(vs, vms, block_size); @@ -2876,10 +2875,10 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy, } copy_p = copy_m = false; if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { - d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = true; copy_p = true; + d = sumqx_p/sumq2_p; best = d * sumqx_p; copy_p = true; } if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { - d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true; copy_m = true; + d = sumqx_m/sumq2_m; best = d * sumqx_m; copy_m = true; } if (copy_m) { std::memcpy(vs, vms, block_size);