Use mmq_id in mul_mat_id
@@ -2460,92 +2460,98 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
    dst_row.nb[2] = nb1;
    dst_row.nb[3] = nb1;

    if (false && ne12 == 1) {
        std::vector<char> ids_host(ggml_nbytes(ids));
        const char * ids_dev = (const char *) ids->data;
        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
            for (int64_t id = 0; id < n_ids; id++) {
                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);

                if (i02 < 0 || i02 >= n_as) continue;

                const int64_t i11 = id % ne11;
                const int64_t i12 = iid1;

                const int64_t i1 = id;
                const int64_t i2 = i12;

                src0_row.data = src0_original + i02*nb02;
                src1_row.data = src1_original + i11*nb11 + i12*nb12;
                dst_row.data  =  dst_original + i1*nb1 + i2*nb2;

                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row, nullptr, 0);
            }
        }
    } else {
        ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool());
        std::vector<int> moe_counts, cum_moe_counts;
        bool is_ser = prepare_row_mappigs(ctx, n_as, n_ids, ids, moe_counts, cum_moe_counts, dev_row_mapping);
        if (is_ser) {
            CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));
        }

        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
        ggml_cuda_pool_alloc<char>  dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));

        src1_row.data = src1_contiguous.get();
        dst_row.data  =  dst_contiguous.get();

        for (int64_t i02 = 0; i02 < n_as; i02++) {
            int64_t num_src1_rows = moe_counts[i02];

            if (num_src1_rows == 0) {
                continue;
            }

            size_t mapping_offset = cum_moe_counts[i02];

            {
                dim3 block_dims(std::min((unsigned int)ne10, 768u));
                dim3 grid_dims(num_src1_rows);
                k_copy_src_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
                        src1_original, src1_contiguous.get(), dev_row_mapping.get() + mapping_offset, ne10, ne11, nb11, nb12);
                CUDA_CHECK(cudaGetLastError());
            }

            src0_row.data = src0_original + i02*nb02;

            GGML_ASSERT(nb11 == sizeof(float)*ne10);
            GGML_ASSERT(nb1 == sizeof(float)*ne0);

            src1_row.ne[1] = num_src1_rows;
            src1_row.nb[1] = nb11;
            src1_row.nb[2] = num_src1_rows*nb11;
            src1_row.nb[3] = num_src1_rows*nb11;

            dst_row.ne[1] = num_src1_rows;
            dst_row.nb[1] = nb1;
            dst_row.nb[2] = num_src1_rows*nb1;
            dst_row.nb[3] = num_src1_rows*nb1;

            if (ggml_is_quantized(src0->type) &&
                ggml_cuda_should_use_mmq(src0->type, ggml_cuda_info().devices[ctx.device].cc, num_src1_rows)) {
                auto src1_padded_num_cols = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
                auto src1_padded_row_size = src1_padded_num_cols/ggml_blck_size(GGML_TYPE_Q8_1)*ggml_type_size(GGML_TYPE_Q8_1);
                auto src1_quantized_size = src1_padded_row_size*num_src1_rows;
                if (true || num_src1_rows > MMVQ_MAX_BATCH_SIZE) {
                    src1_quantized_size += get_mmq_x_max_host(ggml_cuda_info().devices[ctx.device].cc)*sizeof(block_q8_1_mmq);
                    ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool(), src1_quantized_size);
                    quantize_mmq_q8_1_cuda((const float *)src1_contiguous.get(), src1_quantized.get(), ne00, num_src1_rows, 1,
                            src1_padded_num_cols, src0->type, stream);
                    src1_row.nb[1] = src1_padded_row_size;
                    src1_row.nb[2] = src1_row.nb[3] = src1_row.nb[1]*num_src1_rows;
                    //ggml_cuda_op_mul_mat_q(ctx, &src0_row, &src1_row, &dst_row, (const char *)src0_row.data, nullptr,
                    //        src1_quantized.get(), (float *)dst_row.data,
                    //        0, src0_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
                    ggml_cuda_mul_mat_q_id(ctx, &src0_row, &src1_row, nullptr, &dst_row, nullptr, src1_quantized.get());
                    CUDA_CHECK(cudaGetLastError());
                } else {
                    ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool(), src1_quantized_size);
                    quantize_row_q8_1_cuda((const float *)src1_contiguous.get(), src1_quantized.get(), ne00, num_src1_rows, 1,
                            src1_padded_num_cols, src0->type, stream);
                    src1_row.nb[1] = src1_padded_row_size;
                    src1_row.nb[2] = src1_row.nb[3] = src1_row.nb[1]*num_src1_rows;
                    ggml_cuda_op_mul_mat_vec_q(ctx, &src0_row, &src1_row, &dst_row, (const char *)src0_row.data, nullptr,
                            src1_quantized.get(), (float *)dst_row.data,
                            0, src0_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
                    CUDA_CHECK(cudaGetLastError());
                }
            } else {
                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row, nullptr, 0);
            }

            {
                dim3 block_dims(std::min((unsigned int)ne0, 768u));
                dim3 grid_dims(num_src1_rows);
                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
                        dst_original, dst_contiguous.get(),
                        dev_row_mapping.get() + mapping_offset,
                        ne0, nb1, nb2);
                CUDA_CHECK(cudaGetLastError());
            }
        }
    }

    return false;
}
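The restructured indirect path above works in three steps: count how many src1 rows each expert receives (prepare_row_mappigs), gather those rows into a contiguous buffer (k_copy_src_to_contiguous), run one matrix multiplication per expert, then scatter the results back (k_copy_dst_from_contiguous). A minimal host-side sketch of the counting and mapping step — RowMapping and build_row_mappings are illustrative names, not the repo's API:

#include <cstdint>
#include <vector>

// Illustrative stand-in for the device-side mmid_row_mapping.
struct RowMapping {
    int32_t i1;   // expert slot within the token's n_ids choices
    int32_t i2;   // token index
};

// Hypothetical host-side counterpart of prepare_row_mappigs: count the rows
// assigned to each expert, build cumulative offsets, and record which
// (slot, token) pairs belong to each expert, grouped by expert.
// ids is row-major [n_tokens][n_ids] and holds expert indices; out-of-range
// entries are skipped — the same reason the is_ser case above zeroes dst,
// since some output rows may never be written.
static void build_row_mappings(const std::vector<int32_t> & ids,
                               int32_t n_tokens, int32_t n_ids, int32_t n_as,
                               std::vector<int> & counts, std::vector<int> & cum_counts,
                               std::vector<RowMapping> & mapping) {
    counts.assign(n_as, 0);
    for (int32_t t = 0; t < n_tokens; ++t) {
        for (int32_t s = 0; s < n_ids; ++s) {
            const int32_t e = ids[t*n_ids + s];
            if (e >= 0 && e < n_as) ++counts[e];
        }
    }
    cum_counts.assign(n_as + 1, 0);
    for (int32_t e = 0; e < n_as; ++e) cum_counts[e + 1] = cum_counts[e] + counts[e];

    mapping.resize(cum_counts[n_as]);
    std::vector<int> next(cum_counts.begin(), cum_counts.end() - 1); // per-expert fill cursor
    for (int32_t t = 0; t < n_tokens; ++t) {
        for (int32_t s = 0; s < n_ids; ++s) {
            const int32_t e = ids[t*n_ids + s];
            if (e >= 0 && e < n_as) mapping[next[e]++] = {s, t};
        }
    }
}

With this layout, the gather kernel reads exactly counts[i02] mappings starting at offset cum_counts[i02] for expert i02 — which is why a zero count lets the loop continue without launching anything.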
@@ -178,17 +178,13 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
    bool mmq_supported;

    switch (type) {
-        case GGML_TYPE_Q2_K: mmq_supported = ne11 < 384; break;
+        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
            mmq_supported = ne11 < 1536;
            break;
        case GGML_TYPE_IQ2_K:
        case GGML_TYPE_IQ2_K_R4:
            mmq_supported = ne11 <= 3072;
            break;
        case GGML_TYPE_IQ3_K:
        case GGML_TYPE_IQ4_K:
        case GGML_TYPE_IQ5_K:
@@ -196,8 +192,6 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
        case GGML_TYPE_IQ3_K_R4:
        case GGML_TYPE_IQ4_K_R4:
        case GGML_TYPE_IQ5_K_R4:
            mmq_supported = ne11 < 1024;
            break;
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
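This hunk folds Q2_K into the same ne11 < 1536 cutoff shared by Q3_K, Q6_K, IQ2_XS, and IQ2_S; previously Q2_K stopped using MMQ already at 384 rows. A self-contained sketch of how such per-type batch-size cutoffs behave (illustrative enum and function names standing in for ggml_cuda_should_use_mmq, which additionally considers the device compute capability cc):

#include <cassert>
#include <cstdint>

// Mirrors a few of the cutoffs visible in this hunk, for illustration only.
enum class Quant { Q2_K, Q3_K, IQ2_K, IQ3_K_R4 };

static bool mmq_ok(Quant q, int64_t ne11) {
    switch (q) {
        case Quant::Q2_K:     // now falls through to the shared 1536-row cutoff
        case Quant::Q3_K:     return ne11 < 1536;
        case Quant::IQ2_K:    return ne11 <= 3072;
        case Quant::IQ3_K_R4: return ne11 < 1024;
    }
    return false;
}

int main() {
    assert( mmq_ok(Quant::Q2_K, 512));   // before this change Q2_K required ne11 < 384
    assert(!mmq_ok(Quant::Q3_K, 2048));  // large batches fall back to the default path
    assert( mmq_ok(Quant::IQ2_K, 3072)); // note the inclusive bound for IQ2_K
    return 0;
}

Note that in the mul_mat_id path above, the predicate is called with num_src1_rows — the per-expert row count — in the role of ne11, so for MoE workloads these cutoffs apply per expert rather than to the whole batch.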
@@ -1,6 +1,7 @@
 #include "mmq_id_common.cuh"
 #include "mmq_id.cuh"
 #include "quantize_id.cuh"
+#include "quantize.cuh"

 #include <vector>
 #include <climits>
@@ -317,7 +318,7 @@ void ggml_cuda_mul_mat_q_id(ggml_backend_cuda_context & ctx, const ggml_tensor *
        const ggml_tensor * ids_tensor, ggml_tensor * dst, char * ids_data, char * src1_quantized_data) {
    GGML_ASSERT( src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ids_tensor->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.
+    GGML_ASSERT(!ids_tensor || ids_tensor->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.

    GGML_TENSOR_BINARY_OP_LOCALS;
@@ -331,7 +332,7 @@ void ggml_cuda_mul_mat_q_id(ggml_backend_cuda_context & ctx, const ggml_tensor *
    //GGML_ASSERT( nb00 == ts_src0);
    GGML_ASSERT( nb10 == ts_src1);
    GGML_ASSERT( nb0  == ts_dst);
-    GGML_ASSERT(ids_tensor->nb[0] == ggml_type_size(ids_tensor->type));
+    GGML_ASSERT(!ids_tensor || ids_tensor->nb[0] == ggml_type_size(ids_tensor->type));

    GGML_ASSERT(ne13 == 1);
    GGML_ASSERT(nb12 % nb11 == 0);
@@ -364,6 +365,30 @@ void ggml_cuda_mul_mat_q_id(ggml_backend_cuda_context & ctx, const ggml_tensor *
    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || GGML_CUDA_CC_IS_CDNA(cc);

+    if (!ids_tensor) {
+        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
+            get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
+        ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);
+
+        {
+            quantize_mmq_q8_1_cuda(src1_d, src1_q8_1.get(), ne10, ne11, 1, ne10_padded, src0->type, stream);
+            CUDA_CHECK(cudaGetLastError());
+        }
+
+        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
+        const int64_t s13 = ne12*s12;
+
+        const mmq_args_id args = {
+            src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
+            ne00, ne01, ne1, s01, ne11, s1,
+            ne02, ne12, s02, s12, s2,
+            ne03, ne13, s03, s13, s3,
+            use_stream_k, ne1};
+
+        ggml_cuda_mul_mat_q_switch_type_id(ctx, args, stream);
+        return;
+    }
+
    const int64_t n_expert_used = ids_tensor->ne[0];
    const int64_t ne_get_rows = ne12 * n_expert_used;
    GGML_ASSERT(ne1 == n_expert_used);
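The new !ids_tensor fast path makes ggml_cuda_mul_mat_q_id usable for plain (non-indirect) matrix multiplications: src1 is quantized to Q8_1 once and handed straight to the type-switched MMQ kernel. A sketch of the buffer-size arithmetic, with the ggml constants spelled out as assumptions of the sketch (QK8_1 = 32 values per block, 36-byte block_q8_1, MATRIX_ROW_PADDING = 512):

#include <cstddef>
#include <cstdint>

// Assumed constants for the sketch: Q8_1 packs 32 values per block;
// block_q8_1 is 32 int8 quants + fp16 scale + fp16 sum = 36 bytes;
// rows are padded to a multiple of 512 columns before quantization.
constexpr int64_t QK8_1              = 32;
constexpr size_t  BLOCK_Q8_1_BYTES   = 32 + 2 + 2;
constexpr int64_t MATRIX_ROW_PADDING = 512;

// Sketch of nbytes_src1_q8_1 in the new path: pad the row length (GGML_PAD),
// convert to Q8_1 blocks, then add slack for one extra MMQ tile
// (get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq) in the real code).
static size_t src1_q8_1_bytes(int64_t ne10, int64_t ne11, int64_t ne12, int64_t ne13,
                              size_t mmq_tile_slack) {
    const int64_t ne10_padded = (ne10 + MATRIX_ROW_PADDING - 1)/MATRIX_ROW_PADDING*MATRIX_ROW_PADDING;
    const size_t  row_bytes   = ne10_padded/QK8_1 * BLOCK_Q8_1_BYTES;
    return ne13*ne12*ne11*row_bytes + mmq_tile_slack;
}

Note that the strides s12 and s13 passed in mmq_args_id are measured in ints rather than bytes — the quantized buffer is handed over as const int * — hence the extra division by sizeof(int) in the diff.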
@@ -6,6 +6,8 @@
 //

 #include "quantize.cuh"
+#include "mmq.cuh"

 #include <cstdint>

 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
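For reference, the Q8_1 format that quantize_q8_1 and its MMQ variant produce is simple enough to sketch on the CPU: each block of 32 floats stores one scale plus a cached sum term used when dotting against quant types that carry a per-block offset. A scalar sketch (illustrative struct — the real block_q8_1 packs d and s as fp16, and the CUDA kernel computes one value per thread):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar stand-in for block_q8_1, with fp32 fields for clarity.
struct BlockQ8_1 {
    float  d;        // scale: amax / 127
    float  s;        // d * sum(qs), the cached offset term
    int8_t qs[32];   // quantized values
};

static BlockQ8_1 quantize_block_q8_1(const float * x) {
    float amax = 0.0f;
    for (int i = 0; i < 32; ++i) amax = std::max(amax, std::fabs(x[i]));
    BlockQ8_1 b{};
    b.d = amax/127.0f;
    const float id = b.d != 0.0f ? 1.0f/b.d : 0.0f;
    int sum = 0;
    for (int i = 0; i < 32; ++i) {
        b.qs[i] = (int8_t) std::nearbyint(x[i]*id);
        sum += b.qs[i];
    }
    b.s = b.d*sum;
    return b;
}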
@@ -8,7 +8,6 @@
 #pragma once

 #include "common.cuh"
-#include "mmq.cuh"

 #include <cstdint>