Merge remote-tracking branch 'origin/main' into ik/sm_graph_cuda_graphs

This commit is contained in:
Kawrakow
2026-01-22 10:34:11 +00:00
9 changed files with 350 additions and 149 deletions

View File

@@ -786,6 +786,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.max_extra_alloc_MiB = std::stoi(argv[i]);
return true;
}
if (arg == "-nrep" || arg == "--n-repetitions") {
CHECK_ARG
params.nrep = std::stoi(argv[i]);
return true;
}
if (arg == "--samplers") {
CHECK_ARG
const auto sampler_names = string_split(argv[i], ";");

View File

@@ -145,38 +145,39 @@ struct gpt_params {
int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_ctx_draft = 0; // context size for draft model
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_draft_min = 1; // minimum number of tokens to draft during speculative decoding
float p_draft_min = 0.8f; // minimum speculative decoding probability (greedy)
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph"
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_ctx_draft = 0; // context size for draft model
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_draft_min = 1; // minimum number of tokens to draft during speculative decoding
float p_draft_min = 0.8f; // minimum speculative decoding probability (greedy)
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph"
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = -1.0f; // YaRN low correction dim
float yarn_beta_fast = -1.0f; // YaRN low correction dim
float yarn_beta_slow = -1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
int32_t max_extra_alloc_MiB = 256; // additional VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
int32_t max_extra_alloc_MiB = 256; // extra VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation
int32_t nrep = 1; // number of repetitions used in sweep bench
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;

View File

@@ -31,6 +31,7 @@ int main(int argc, char ** argv) {
print_usage(argc, argv);
return 1;
}
if (params.nrep < 1) params.nrep = 1;
// init LLM
@@ -135,49 +136,63 @@ int main(int argc, char ** argv) {
common_batch_clear(batch);
llama_kv_cache_clear(ctx);
int i_loop = 0;
for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
// clean up KV cache before generation
llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
//llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
int nrep = i_loop < 1 ? params.nrep : 1;
// first measure token generation performance at this context size
const auto t_tg_start = ggml_time_us();
for (unsigned int i = 0; i < tg; ++i) {
for (int irep = 0; irep < nrep; ++irep) {
llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
for (unsigned int i = 0; i < tg; ++i) {
common_batch_clear(batch);
common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
}
}
const auto t_tg_end = ggml_time_us();
// measure prompt processing performance
const auto t_pp_start = ggml_time_us();
for (int irep = 0; irep < nrep; ++irep) {
// clean up KV cache after generation
llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
// prepare batch of pp size for prompt processing performance measurement
common_batch_clear(batch);
common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);
for (unsigned int i = 0; i < pp; ++i) {
common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
}
batch.logits[batch.n_tokens - 1] = true;
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
}
const auto t_tg_end = ggml_time_us();
// clean up KV cache after generation
llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
// prepare batch of pp size for prompt processing performance measurement
common_batch_clear(batch);
for (unsigned int i = 0; i < pp; ++i) {
common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
}
batch.logits[batch.n_tokens - 1] = true;
// measure prompt processing performance
const auto t_pp_start = ggml_time_us();
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
const auto t_pp_end = ggml_time_us();
// calculate and print metrics
const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f / nrep;
const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f / nrep;
const float speed_pp = pp / t_pp;
const float speed_tg = tg / t_tg;
@@ -192,6 +207,8 @@ int main(int argc, char ** argv) {
} else {
LOG_TEE("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
}
++i_loop;
}
llama_batch_free(batch);

View File

@@ -2165,6 +2165,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
for (auto & s : sched->statuses) s = GGML_STATUS_SUCCESS;
int first_reduce = -1;
bool work_done = false;
#ifdef GGML_USE_OPENMP
//This may not be available in old OpenMP versions
@@ -2185,7 +2186,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
}
int first_reduce = -1;
for (int i = 0; i < sched->n_splits; i++) {
auto split = &sched->splits[i];
if (split->graph.n_nodes == 1 && split->graph.nodes[0]->op == GGML_OP_REDUCE) {

View File

@@ -1842,7 +1842,7 @@ static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml
}
static __global__ void k_compute_batched_ptrs(
const half * src0_as_f16, const half * src1_as_f16, char * dst,
const void * src0_as_f16, const void * src1_as_f16, char * dst,
const void ** ptrs_src, void ** ptrs_dst,
int64_t ne12, int64_t ne13,
int64_t ne23,
@@ -1850,86 +1850,155 @@ static __global__ void k_compute_batched_ptrs(
size_t nb12, size_t nb13,
size_t nbd2, size_t nbd3,
int64_t r2, int64_t r3) {
int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
if (i13 >= ne13 || i12 >= ne12) {
return;
}
int64_t i03 = i13 / r3;
int64_t i02 = i12 / r2;
const int64_t i03 = i13 / r3;
const int64_t i02 = i12 / r2;
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
}
static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// Type traits for mapping ggml types to CUDA/cuBLAS types
template<ggml_type T>
struct batched_mul_mat_traits;
template<>
struct batched_mul_mat_traits<GGML_TYPE_F32> {
using cuda_type = float;
static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
static inline const cudaDataType_t data_type = CUDA_R_32F;
static inline const ggml_type ggml_type_val = GGML_TYPE_F32;
static inline const float alpha = 1.0f;
static inline const float beta = 0.0f;
static inline const void* get_alpha() { static const float val = alpha; return &val; }
static inline const void* get_beta() { static const float val = beta; return &val; }
static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp32_nc_cuda(src_type); }
};
template<>
struct batched_mul_mat_traits<GGML_TYPE_BF16> {
using cuda_type = nv_bfloat16;
static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
static inline const cudaDataType_t data_type = CUDA_R_16BF;
static inline const ggml_type ggml_type_val = GGML_TYPE_BF16;
static inline const float alpha = 1.0f;
static inline const float beta = 0.0f;
static inline const void* get_alpha() { static const float val = alpha; return &val; }
static inline const void* get_beta() { static const float val = beta; return &val; }
static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_bf16_nc_cuda(src_type); }
};
template<>
struct batched_mul_mat_traits<GGML_TYPE_F16> {
using cuda_type = half;
static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
static inline const cudaDataType_t data_type = CUDA_R_16F;
static inline const ggml_type ggml_type_val = GGML_TYPE_F16;
static inline const half alpha = 1.0;
static inline const half beta = 0.0;
static inline const void* get_alpha() { static const half val = alpha; return &val; }
static inline const void* get_beta() { static const half val = beta; return &val; }
static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp16_nc_cuda(src_type); }
};
template<ggml_type src0_type>
static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
using traits = batched_mul_mat_traits<src0_type>;
using cuda_t = typename traits::cuda_type;
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src0->type == src0_type);
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16);
// Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
// As long as dst is contiguous this does not matter though.
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t ne_dst = ggml_nelements(dst);
cudaStream_t main_stream = ctx.stream();
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
void * src0_ddq = src0->data;
half * src0_f16 = (half *) src0_ddq;
float * src1_ddf = (float *) src1->data;
float * dst_ddf = (float *) dst->data;
float * dst_ddf = (float *) dst->data;
const size_t ts_src1 = ggml_type_size(src1->type);
GGML_ASSERT(nb10 == ts_src1);
int64_t s11 = nb11 / ts_src1;
int64_t s12 = nb12 / ts_src1;
int64_t s13 = nb13 / ts_src1;
// convert src1 to fp16
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
if (src1->type != GGML_TYPE_F16) {
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
const cuda_t * src0_ptr = nullptr;
const cuda_t * src1_ptr = nullptr;
ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
// Handle src0
src0_ptr = (const cuda_t *) src0->data;
// Handle src1 - convert if necessary
if (src1->type == src0_type) {
src1_ptr = (const cuda_t *) src1->data;
} else {
// Convert src1 to target type using traits conversion functions
const int64_t ne_src1 = ggml_nelements(src1);
src1_f16_alloc.alloc(ne_src1);
GGML_ASSERT(to_fp16_cuda != nullptr);
to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ggml_nrows(src1), src1->ne[0], main_stream);
src1_alloc.alloc(ne_src1);
const auto convert_func = traits::get_nc_converter(src1->type);
GGML_ASSERT(convert_func != nullptr);
convert_func(src1->data, src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
src1_ptr = src1_alloc.get();
s11 = ne10;
s12 = ne11*s11;
s13 = ne12*s12;
is_src1_cont_2 = true;
}
half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
// Setup destination buffer
ggml_cuda_pool_alloc<cuda_t> dst_temp(ctx.pool());
char * dst_t;
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
cudaDataType_t cu_data_type = CUDA_R_16F;
// dst strides
size_t nbd2 = dst->nb[2];
size_t nbd3 = dst->nb[3];
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
cublasComputeType_t cu_compute_type = traits::compute_type;
cudaDataType_t cu_data_type = traits::data_type;
cudaDataType_t cu_data_type_a = traits::data_type;
cudaDataType_t cu_data_type_b = traits::data_type;
const void * alpha = traits::get_alpha();
const void * beta = traits::get_beta();
const float alpha_f32 = 1.0f;
const float beta_f32 = 0.0f;
const void * alpha = &alpha_f16;
const void * beta = &beta_f16;
const float beta_f32 = 0.0f;
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
dst_t = (char *) dst_f16.alloc(ne_dst);
nbd2 /= sizeof(float) / sizeof(half);
nbd3 /= sizeof(float) / sizeof(half);
if constexpr (src0_type == GGML_TYPE_F32) {
dst_t = (char *) dst_ddf; // Direct F32 output
} else {
dst_t = (char *) dst_temp.alloc(ne_dst);
nbd2 /= sizeof(float) / sizeof(cuda_t);
nbd3 /= sizeof(float) / sizeof(cuda_t);
}
} else {
dst_t = (char *) dst_ddf;
cu_compute_type = CUBLAS_COMPUTE_32F;
cu_data_type = CUDA_R_32F;
cu_data_type = CUDA_R_32F;
alpha = &alpha_f32;
beta = &beta_f32;
beta = &beta_f32;
}
int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
@@ -1937,77 +2006,85 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
#if 0
// use cublasGemmEx
{
for (int i13 = 0; i13 < ne13; ++i13) {
for (int i12 = 0; i12 < ne12; ++i12) {
int i03 = i13 / r3;
int i02 = i12 / r2;
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
const int64_t smb = ne12 == 1 ? s13 : s12;
CUBLAS_CHECK(
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
}
}
#else
#ifdef GGML_USE_MUSA
GGML_ASSERT(false);
#else // !GGML_USE_MUSA
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
CUBLAS_CHECK(
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
(const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma, // strideA
src1_ptr, cu_data_type_b, s11, smb, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
ne12*ne13,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// use cublasGemmBatchedEx
const int ne23 = ne12*ne13;
const int64_t ne23 = ne12*ne13;
ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
src0_f16, src1_f16, dst_t,
size_t src1_stride_size = sizeof(cuda_t);
const int threads_x = 16;
const int threads_y = 16;
dim3 block_dims(threads_x, threads_y);
dim3 grid_dims(
(ne13 + threads_x - 1) / threads_x,
(ne12 + threads_y - 1) / threads_y
);
k_compute_batched_ptrs<<<grid_dims, block_dims, 0, main_stream>>>(
src0_ptr, src1_ptr, dst_t,
ptrs_src.get(), ptrs_dst.get(),
ne12, ne13,
ne23,
nb02, nb03,
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
(src1->type == src0_type) ? nb12 : s12*src1_stride_size,
(src1->type == src0_type) ? nb13 : s13*src1_stride_size,
nbd2, nbd3,
r2, r3);
CUDA_CHECK(cudaGetLastError());
CUBLAS_CHECK(
cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
alpha, (const void **) (ptrs_src.get() + 0*ne23), cu_data_type_a, nb01/nb00,
(const void **) (ptrs_src.get() + 1*ne23), cu_data_type_b, s11,
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
ne23,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
#endif // GGML_USE_MUSA
#endif
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16.get(), dst_ddf, ggml_nrows(dst), dst->ne[0], main_stream);
// Convert output back to F32 if needed
if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type != CUDA_R_32F) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(traits::ggml_type_val);
to_fp32_cuda(dst_temp.get(), dst_ddf, ne_dst, 1, main_stream);
}
}
static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32);
switch (src0->type) {
case GGML_TYPE_F32:
ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F32>(ctx, src0, src1, dst);
break;
case GGML_TYPE_BF16:
ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_BF16>(ctx, src0, src1, dst);
break;
case GGML_TYPE_F16:
ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F16>(ctx, src0, src1, dst);
break;
default:
GGML_ABORT("Unsupported type");
}
}
@@ -3939,11 +4016,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
// Launch graph
CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream()));
//printf("Launched graph with %d nodes on device %d\n", cgraph->n_nodes, cuda_ctx->device);
//if (!cuda_ctx->compute_event) {
// CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx->compute_event, cudaEventDisableTiming));
//}
//CUDA_CHECK(cudaEventRecord(cuda_ctx->compute_event, cuda_ctx->stream()));
#else
graph_evaluated_or_captured = true;
#endif // USE_CUDA_GRAPH
@@ -4006,6 +4078,8 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (graph->number_consecutive_updates >= 4) {
graph->disable_due_to_too_many_updates = true;
use_cuda_graph = false;
cuda_ctx->cur_graph = nullptr;
#ifndef NDEBUG
GGML_CUDA_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif

View File

@@ -2122,3 +2122,69 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return nullptr;
}
}
// non-contuigous conversions
template <typename src_t, typename dst_t>
static __global__ void convert_unary(
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t s01, const int64_t s02, const int64_t s03) {
const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i00 >= ne00) {
return;
}
const int64_t i01 = blockIdx.y;
const int64_t i02 = blockIdx.z % ne02;
const int64_t i03 = blockIdx.z / ne02;
const src_t * x = (const src_t *) vx;
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
}
template <typename src_t, typename dst_t>
static void convert_unary_cuda(const void * vx, dst_t * y,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
(vx, y, ne00, ne01, ne02, s01, s02, s03);
}
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cuda<float>;
case GGML_TYPE_BF16:
return convert_unary_cuda<nv_bfloat16>;
default:
return nullptr;
}
}
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cuda<float, nv_bfloat16>;
case GGML_TYPE_F16:
return convert_unary_cuda<half, nv_bfloat16>;
default:
return nullptr;
}
}
to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F16:
return convert_unary_cuda<half, float>;
case GGML_TYPE_BF16:
return convert_unary_cuda<nv_bfloat16, float>;
default:
return nullptr;
}
}

View File

@@ -22,6 +22,19 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
template<typename T>
using to_t_nc_cuda_t = void (*)(const void * x, T * y,
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
typedef to_t_nc_cuda_t<float> to_fp32_nc_cuda_t;
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;
to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
template<typename dst_t, typename src_t>
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
if constexpr (std::is_same_v<dst_t, src_t>) {
@@ -30,6 +43,15 @@ template<typename dst_t, typename src_t>
return __float2bfloat16(float(x));
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
return __bfloat162float(x);
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, half2>) {
return __float22half2_rn(x);
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, nv_bfloat162>) {
// bypass compile error on cuda 12.0.1
#ifdef GGML_USE_HIPBLAS
return __float22bfloat162_rn(x);
#else
return {x.x, x.y};
#endif // GGML_USE_HIP
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
return int32_t(x);
} else {

View File

@@ -940,6 +940,5 @@ bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
src0_type == GGML_TYPE_IQ2_KT || src0_type == GGML_TYPE_IQ3_KT || src0_type == GGML_TYPE_IQ4_KT ||
src0_type == GGML_TYPE_F16;
src0_type == GGML_TYPE_IQ2_KT || src0_type == GGML_TYPE_IQ3_KT || src0_type == GGML_TYPE_IQ4_KT;
}

View File

@@ -27,6 +27,14 @@ const char * llama_hparams::rope_scaling_type_name(llama_rope_scaling_type type)
return LLAMA_ROPE_SCALING_TYPES.at(type);
}
static inline const char * llm_expert_gating_func_name(llm_expert_gating_func_type type) {
switch (type) {
case LLM_EXPERT_GATING_FUNC_SOFTMAX: return "softmax";
case LLM_EXPERT_GATING_FUNC_SIGMOID: return "sigmoid";
case LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT: return "weight";
default: return "none";
}
}
void llm_load_hparams(
@@ -778,11 +786,20 @@ void llm_load_hparams(
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_TYPE_NONE;
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
if (hparams.expert_gating_func == 0) {
// for compatibility with existing DeepSeek V2 and V2.5 GGUFs
// that have no expert_gating_func model parameter set
hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SOFTMAX;
if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
// Older DeepSeek models from the 2.0/2.5 series may not have the experts gating function recorded in the GGUF.
// Such models use SOFTMAX as the experts gating function.
// The new (new as of this commit) GLM-4.7-Flash may also be missing the experts gating function.
// GLM-4.7-Flash uses SIGMOID as the experts gating function.
// Hence, we make the LLM_KV_EXPERT_GATING_FUNC entry optional, and set here if missing.
// We distinguish between GLM-4.7-Flash and DeepSeek-2/2.5 models by the number of layers.
// GLM-4.7-Flash has 47 layers (or 48, if an MTP layer is included in the GGUF).
hparams.expert_gating_func = hparams.n_layer == 47 || hparams.n_layer == 48 ?
LLM_EXPERT_GATING_FUNC_SIGMOID : LLM_EXPERT_GATING_FUNC_SOFTMAX;
printf("================= Missing experts gating function -> set to %s\n",
llm_expert_gating_func_name(llm_expert_gating_func_type(hparams.expert_gating_func)));
}
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);