diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 9179f562..cf82a3c2 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2357,7 +2357,7 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

     CUDA_CHECK(cudaMemsetAsync((char *)dst->data, 0, ggml_nbytes(dst), ctx.stream()));

-    if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
+    if (src1->ne[1] <= MMVQ_MAX_BATCH_SIZE && src1->ne[2] == 1 && src1->ne[3] == 1 &&
         ggml_is_quantized(src0->type) &&
         ggml_backend_buffer_is_cuda(src0->buffer) &&
         ggml_backend_buffer_is_cuda(src1->buffer) &&
@@ -2381,18 +2381,19 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         local_dst.ne[1] = local_dst.ne[3] = 1;
         local_dst.nb[2] = local_dst.nb[1];

-        auto local_src1 = *src1;
-        local_src1.nb[2] = local_src1.nb[3] = 0;
-
         const int64_t src1_padded_col_size = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);

-        ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool());
         auto src_1_ddq_size = src1_padded_col_size*sizeof(block_q8_1)/QK8_1;
-        local_src1.data = src1_quantized.alloc(src_1_ddq_size);
-        quantize_row_q8_1_cuda((const float *)src1->data, (void *)src1_quantized.get(), src1->ne[0], 1, 1, src1_padded_col_size,
-                               src0->type, stream);
-        CUDA_CHECK(cudaGetLastError());
-
+        auto local_src1 = *src1;
+        local_src1.ne[1] = 1; local_src1.nb[1] = src_1_ddq_size;
+        local_src1.nb[2] = src1->ne[1] > 1 ? src_1_ddq_size : 0;
+        local_src1.nb[3] = local_src1.nb[2];
+
+        ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool());
+        local_src1.data = src1_quantized.alloc(src_1_ddq_size*src1->ne[1]);
+        quantize_row_q8_1_cuda((const float *)src1->data, (void *)src1_quantized.get(), src1->ne[0], src1->ne[1], 1,
+                               src1_padded_col_size, src0->type, stream);
+        CUDA_CHECK(cudaGetLastError());
         ggml_cuda_op_mul_mat_vec_q_id(ctx, src0, &local_src1, ids, &local_dst, nullptr,
                 (const char *)src0->data, nullptr, src1_quantized.get(), (float *)dst->data,
@@ -2460,92 +2461,95 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     dst_row.nb[2] = nb1;
     dst_row.nb[3] = nb1;

-    if (false && ne12 == 1) {
-        std::vector<char> ids_host(ggml_nbytes(ids));
-        const char * ids_dev = (const char *) ids->data;
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
-        CUDA_CHECK(cudaStreamSynchronize(stream));
-        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-            for (int64_t id = 0; id < n_ids; id++) {
-                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
-                if (i02 < 0 || i02 >= n_as) continue;
+    ggml_cuda_pool_alloc dev_row_mapping(ctx.pool());
+    std::vector moe_counts, cum_moe_counts;
+    bool is_ser = prepare_row_mappigs(ctx, n_as, n_ids, ids, moe_counts, cum_moe_counts, dev_row_mapping);
+    if (is_ser) {
+        CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));
+    }

-                const int64_t i11 = id % ne11;
-                const int64_t i12 = iid1;
+    ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+    ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));

-                const int64_t i1 = id;
-                const int64_t i2 = i12;
+    src1_row.data = src1_contiguous.get();
+    dst_row.data = dst_contiguous.get();

-                src0_row.data = src0_original + i02*nb02;
-                src1_row.data = src1_original + i11*nb11 + i12*nb12;
-                dst_row.data = dst_original + i1*nb1 + i2*nb2;
+    for (int64_t i02 = 0; i02 < n_as; i02++) {

-                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row, nullptr, 0);
-            }
-        }
-    } else {
+        int64_t num_src1_rows = moe_counts[i02];

-        ggml_cuda_pool_alloc dev_row_mapping(ctx.pool());
-        std::vector moe_counts, cum_moe_counts;
-        bool is_ser = prepare_row_mappigs(ctx, n_as, n_ids, ids, moe_counts, cum_moe_counts, dev_row_mapping);
-        if (is_ser) {
-            CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));
+        if (num_src1_rows == 0) {
+            continue;
         }

-        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
-        ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
+        size_t mapping_offset = cum_moe_counts[i02];

-        src1_row.data = src1_contiguous.get();
-        dst_row.data = dst_contiguous.get();
+        {
+            dim3 block_dims(std::min((unsigned int)ne10, 768u));
+            dim3 grid_dims(num_src1_rows);
+            k_copy_src_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                src1_original, src1_contiguous.get(), dev_row_mapping.get() + mapping_offset, ne10, ne11, nb11, nb12);
+            CUDA_CHECK(cudaGetLastError());
+        }

-        for (int64_t i02 = 0; i02 < n_as; i02++) {
+        src0_row.data = src0_original + i02*nb02;

-            int64_t num_src1_rows = moe_counts[i02];
+        GGML_ASSERT(nb11 == sizeof(float)*ne10);
+        GGML_ASSERT(nb1 == sizeof(float)*ne0);

-            if (num_src1_rows == 0) {
-                continue;
-            }
+        src1_row.ne[1] = num_src1_rows;
+        src1_row.nb[1] = nb11;
+        src1_row.nb[2] = num_src1_rows*nb11;
+        src1_row.nb[3] = num_src1_rows*nb11;

-            size_t mapping_offset = cum_moe_counts[i02];
+        dst_row.ne[1] = num_src1_rows;
+        dst_row.nb[1] = nb1;
+        dst_row.nb[2] = num_src1_rows*nb1;
+        dst_row.nb[3] = num_src1_rows*nb1;

-            {
-                dim3 block_dims(std::min((unsigned int)ne10, 768u));
-                dim3 grid_dims(num_src1_rows);
-                k_copy_src_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                    src1_original, src1_contiguous.get(), dev_row_mapping.get() + mapping_offset, ne10, ne11, nb11, nb12);
+        if (ggml_is_quantized(src0->type) &&
+            ggml_cuda_should_use_mmq(src0->type, ggml_cuda_info().devices[ctx.device].cc, num_src1_rows)) {
+            auto src1_padded_num_cols = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
+            auto src1_padded_row_size = src1_padded_num_cols/ggml_blck_size(GGML_TYPE_Q8_1)*ggml_type_size(GGML_TYPE_Q8_1);
+            auto src1_quantized_size = src1_padded_row_size*num_src1_rows;
+            if (true || num_src1_rows > MMVQ_MAX_BATCH_SIZE) {
+                src1_quantized_size += get_mmq_x_max_host(ggml_cuda_info().devices[ctx.device].cc)*sizeof(block_q8_1_mmq);
+                ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool(), src1_quantized_size);
+                quantize_mmq_q8_1_cuda((const float *)src1_contiguous.get(), src1_quantized.get(), ne00, num_src1_rows, 1,
+                        src1_padded_num_cols, src0->type, stream);
+                src1_row.nb[1] = src1_padded_row_size;
+                src1_row.nb[2] = src1_row.nb[3] = src1_row.nb[1]*num_src1_rows;
+                ggml_cuda_mul_mat_q_id(ctx, &src0_row, &src1_row, nullptr, &dst_row, nullptr, src1_quantized.get());
+
+                CUDA_CHECK(cudaGetLastError());
+            } else {
+                ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool(), src1_quantized_size);
+                quantize_row_q8_1_cuda((const float *)src1_contiguous.get(), src1_quantized.get(), ne00, num_src1_rows, 1,
+                        src1_padded_num_cols, src0->type, stream);
+                src1_row.nb[1] = src1_padded_row_size;
+                src1_row.nb[2] = src1_row.nb[3] = src1_row.nb[1]*num_src1_rows;
+                ggml_cuda_op_mul_mat_vec_q(ctx, &src0_row, &src1_row, &dst_row, (const char *)src0_row.data, nullptr,
+                        src1_quantized.get(), (float *)dst_row.data,
+                        0, src0_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
-
-            src0_row.data = src0_original + i02*nb02;
-
-            GGML_ASSERT(nb11 == sizeof(float)*ne10);
-            GGML_ASSERT(nb1 == sizeof(float)*ne0);
-
-            src1_row.ne[1] = num_src1_rows;
-            src1_row.nb[1] = nb11;
-            src1_row.nb[2] = num_src1_rows*nb11;
-            src1_row.nb[3] = num_src1_rows*nb11;
-
-            dst_row.ne[1] = num_src1_rows;
-            dst_row.nb[1] = nb1;
-            dst_row.nb[2] = num_src1_rows*nb1;
-            dst_row.nb[3] = num_src1_rows*nb1;
-
+        } else {
             ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row, nullptr, 0);
+        }

-            {
-                dim3 block_dims(std::min((unsigned int)ne0, 768u));
-                dim3 grid_dims(num_src1_rows);
-                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                    dst_original, dst_contiguous.get(),
-                    dev_row_mapping.get() + mapping_offset,
-                    ne0,
-                    nb1, nb2);
-                CUDA_CHECK(cudaGetLastError());
-            }
+        {
+            dim3 block_dims(std::min((unsigned int)ne0, 768u));
+            dim3 grid_dims(num_src1_rows);
+            k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                dst_original, dst_contiguous.get(),
+                dev_row_mapping.get() + mapping_offset,
+                ne0,
+                nb1, nb2);
+            CUDA_CHECK(cudaGetLastError());
         }
     }
+
     return false;
 }
@@ -2791,15 +2795,11 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten

     bool fuse_down = false;
     if (next && next->op == GGML_OP_MUL_MAT_ID) {
-        //printf("Fusing MoE down gemm\n");
         fuse_down = true;
         final_dst = *next;
         final_dst.ne[1] = final_dst.ne[2] = final_dst.ne[3] = 1;
         final_dst.nb[2] = final_dst.nb[3] = final_dst.nb[1];
         final_src = *next->src[0];
-        //printf("next->src[0]: %s, %d x %d x %d x %d and %d x %d x %d x %d\n", ggml_type_name(next->src[0]->type),
-        //        (int)next->src[0]->ne[0], (int)next->src[0]->ne[1], (int)next->src[0]->ne[2], (int)next->src[0]->ne[3],
-        //        (int)next->src[0]->nb[0], (int)next->src[0]->nb[1], (int)next->src[0]->nb[2], (int)next->src[0]->nb[3]);
         final_src.ne[2] = final_src.ne[3] = 1;
         final_src.nb[3] = final_src.nb[2];
     }
@@ -2830,8 +2830,6 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten

     src1_row.data = src1_contiguous.get();

-    bool first = false; //true;
-
     ggml_cuda_pool_alloc dev_row_mapping(ctx.pool());
     std::vector moe_counts, cum_moe_counts;

@@ -2883,8 +2881,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
         dst_row.data = dst_up_contiguous.get();

         if (use_quantized_src1) {
-            ggml_cuda_op_mul_mat_q(ctx, &src0_1_row, &src1_row, &dst_row, (const char *)src0_1_row.data, nullptr, src1_quantized.get(), (float *)dst_row.data,
-                    0, src0_1_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
+            ggml_cuda_mul_mat_q_id(ctx, &src0_1_row, &src1_row, nullptr, &dst_row, nullptr, src1_quantized.get());
         } else {
             ggml_cuda_mul_mat(ctx, &src0_1_row, &src1_row, &dst_row, nullptr, 0);
         }
@@ -2900,8 +2897,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
         dst_row.data = dst_gate_contiguous.get();

         if (use_quantized_src1) {
-            ggml_cuda_op_mul_mat_q(ctx, &src0_2_row, &src1_row, &dst_row, (const char *)src0_2_row.data, nullptr, src1_quantized.get(), (float *)dst_row.data,
-                    0, src0_2_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
+            ggml_cuda_mul_mat_q_id(ctx, &src0_2_row, &src1_row, nullptr, &dst_row, nullptr, src1_quantized.get());
         } else {
             ggml_cuda_mul_mat(ctx, &src0_2_row, &src1_row, &dst_row, nullptr, 0);
         }
@@ -2933,18 +2929,12 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
             final_dst.nb[1] = final_dst.ne[0]*sizeof(float);
             final_dst.nb[2] = final_dst.nb[3] = num_src1_rows*final_dst.nb[1];
             final_src.data = (char *)next->src[0]->data + i02*next->src[0]->nb[2];
-            if (first) {
-                printf("Fusing down for %d rows: (%d x %d x %d x %d) = (%d x %d x %d x %d) * (%d x %d x %d x %d)\n", (int)num_src1_rows,
-                        (int)next->ne[0], (int)next->ne[1], (int)next->ne[2], (int)next->ne[3],
-                        (int)next->src[0]->ne[0], (int)next->src[0]->ne[1], (int)next->src[0]->ne[2], (int)next->src[0]->ne[3],
-                        (int)next->src[1]->ne[0], (int)next->src[1]->ne[1], (int)next->src[1]->ne[2], (int)next->src[1]->ne[3]);
-                printf(" using (%d x %d x %d x %d) = (%d x %d x %d x %d) * (%d x %d x %d x %d)\n",
-                        (int)final_dst.ne[0], (int)final_dst.ne[1], (int)final_dst.ne[2], (int)final_dst.ne[3],
-                        (int)final_src.ne[0], (int)final_src.ne[1], (int)final_src.ne[2], (int)final_src.ne[3],
-                        (int)dst_row.ne[0], (int)dst_row.ne[1], (int)dst_row.ne[2], (int)dst_row.ne[3]);
-                first = false;
+            if (ggml_is_quantized(next->src[0]->type) &&
+                ggml_cuda_should_use_mmq(final_src.type, ggml_cuda_info().devices[ctx.device].cc, dst_row.ne[1])) {
+                ggml_cuda_mul_mat_q_id(ctx, &final_src, &dst_row, nullptr, &final_dst, nullptr, nullptr);
+            } else {
+                ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst, nullptr, 0);
             }
-            ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst, nullptr, 0);
             CUDA_CHECK(cudaGetLastError());

             dim3 block_dims(std::min((unsigned int)next->ne[0], 768u));
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 82514c8d..fde49334 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -178,17 +178,13 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
     bool mmq_supported;

     switch (type) {
-        case GGML_TYPE_Q2_K: mmq_supported = ne11 < 384; break;
+        case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ2_S:
-            mmq_supported = ne11 < 1536;
-            break;
         case GGML_TYPE_IQ2_K:
         case GGML_TYPE_IQ2_K_R4:
-            mmq_supported = ne11 <= 3072;
-            break;
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
@@ -196,8 +192,6 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_IQ3_K_R4:
         case GGML_TYPE_IQ4_K_R4:
         case GGML_TYPE_IQ5_K_R4:
-            mmq_supported = ne11 < 1024;
-            break;
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
diff --git a/ggml/src/ggml-cuda/mmq_id.cu b/ggml/src/ggml-cuda/mmq_id.cu
index 230715c0..a9b64ff0 100644
--- a/ggml/src/ggml-cuda/mmq_id.cu
+++ b/ggml/src/ggml-cuda/mmq_id.cu
@@ -1,6 +1,7 @@
 #include "mmq_id_common.cuh"
 #include "mmq_id.cuh"
 #include "quantize_id.cuh"
+#include "quantize.cuh"

 #include
 #include
@@ -317,7 +318,7 @@ void ggml_cuda_mul_mat_q_id(ggml_backend_cuda_context & ctx, const ggml_tensor *
         const ggml_tensor * ids_tensor, ggml_tensor * dst, char * ids_data, char * src1_quantized_data) {
     GGML_ASSERT( src1->type == GGML_TYPE_F32);
     GGML_ASSERT(  dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ids_tensor->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.
+    GGML_ASSERT(!ids_tensor || ids_tensor->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.

     GGML_TENSOR_BINARY_OP_LOCALS;

@@ -331,7 +332,7 @@ void ggml_cuda_mul_mat_q_id(ggml_backend_cuda_context & ctx, const ggml_tensor *
     //GGML_ASSERT( nb00 == ts_src0);
     GGML_ASSERT( nb10 == ts_src1);
     GGML_ASSERT( nb0  == ts_dst);
-    GGML_ASSERT(ids_tensor->nb[0] == ggml_type_size(ids_tensor->type));
+    GGML_ASSERT(!ids_tensor || ids_tensor->nb[0] == ggml_type_size(ids_tensor->type));
     GGML_ASSERT(ne13 == 1);
     GGML_ASSERT(nb12 % nb11 == 0);

@@ -364,6 +365,32 @@ void ggml_cuda_mul_mat_q_id(ggml_backend_cuda_context & ctx, const ggml_tensor *

     const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_CDNA(cc);

+    if (!ids_tensor) {
+
+        ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool());
+        if (!src1_quantized_data) {
+            const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
+                get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
+            src1_q8_1.alloc(nbytes_src1_q8_1);
+            quantize_mmq_q8_1_cuda(src1_d, src1_q8_1.get(), ne10, ne11, 1, ne10_padded, src0->type, stream);
+            CUDA_CHECK(cudaGetLastError());
+            src1_quantized_data = src1_q8_1.get();
+        }
+
+        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
+        const int64_t s13 = ne12*s12;
+
+        const mmq_args_id args = {
+            src0_d, src0->type, (const int *)src1_quantized_data, nullptr, nullptr, dst_d,
+            ne00, ne01, ne1, s01, ne11, s1,
+            ne02, ne12, s02, s12, s2,
+            ne03, ne13, s03, s13, s3,
+            use_stream_k, ne1};
+
+        ggml_cuda_mul_mat_q_switch_type_id(ctx, args, stream);
+        return;
+    }
+
     const int64_t n_expert_used = ids_tensor->ne[0];
     const int64_t ne_get_rows = ne12 * n_expert_used;
     GGML_ASSERT(ne1 == n_expert_used);
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index f70b60ab..3aa9b210 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -225,7 +225,7 @@ void ggml_cuda_op_mul_mat_vec_q_id(
     const int64_t ne10 = src1->ne[0];
     GGML_ASSERT(ne10 % QK8_1 == 0);
     GGML_ASSERT(src0->ne[3] == 1 && src1->ne[3] == 1 && dst->ne[3] == 1);
-    GGML_ASSERT(src1->ne[1] == 1 && src1->ne[2] == 1);
+    GGML_ASSERT(src1->ne[1] <= MMVQ_MAX_BATCH_SIZE && src1->ne[2] == 1);
     GGML_ASSERT(ids->ne[0] == dst->ne[2]);

     const int64_t ne0 = dst->ne[0];
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index 52d8787d..2bef2315 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -6,6 +6,8 @@
 //

 #include "quantize.cuh"
+#include "mmq.cuh"
+
 #include <cstdint>

 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
diff --git a/ggml/src/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh
index e1106164..8f23f91d 100644
--- a/ggml/src/ggml-cuda/quantize.cuh
+++ b/ggml/src/ggml-cuda/quantize.cuh
@@ -8,7 +8,6 @@
 #pragma once

 #include "common.cuh"
-#include "mmq.cuh"

 #include <cstdint>