Also add 192x128 cases to the flash-attention MMA F16 kernels

This commit is contained in:
Kawrakow
2026-01-27 11:17:05 +00:00
parent 08d42ac69e
commit 1bff295e79
6 changed files with 19 additions and 0 deletions

View File

@@ -61,6 +61,11 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2, 64, 64, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 128, 2, 64, 64, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 8, 128, 2, 64, 96, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16, 128, 2, 64, 96, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 128, 2, 64, 96, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 128, 2, 64, 96, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 8, 64, 4, 64, 128, 128, 128, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 64, 4, 32, 128, 128, 128, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 2, true);
@@ -1735,6 +1740,7 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 80, 64)
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 96, 64)
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 64)
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 64)
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(192, 128, 64)
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 64)
// The number of viable configurations for Deepseek is very limited:

View File

@@ -154,6 +154,10 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
GGML_ASSERT(V->ne[0] == 256);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
break;
case 192:
GGML_ASSERT(V->ne[0] == 128);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<192, 128>(ctx, dst);
break;
case 576: {
// For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
GGML_ASSERT(V->ne[0] == 512);
@@ -330,6 +334,11 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
return BEST_FATTN_KERNEL_NONE;
}
break;
case 192:
if (V->ne[0] != 128) {
return BEST_FATTN_KERNEL_NONE;
}
break;
case 576:
if (V->ne[0] != 512) {
return BEST_FATTN_KERNEL_NONE;

View File

@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 16, 4);
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4);
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4);
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4);
DECL_FATTN_MMA_F16_CASE(192, 128, 16, 4);
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4);
DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);

View File

@@ -7,4 +7,5 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 32, 2);
DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
DECL_FATTN_MMA_F16_CASE(192, 128, 32, 2);
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);

View File

@@ -7,4 +7,5 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 64, 1);
DECL_FATTN_MMA_F16_CASE(96, 96, 64, 1);
DECL_FATTN_MMA_F16_CASE(112, 112, 64, 1);
DECL_FATTN_MMA_F16_CASE(128, 128, 64, 1);
DECL_FATTN_MMA_F16_CASE(192, 128, 64, 1);
DECL_FATTN_MMA_F16_CASE(256, 256, 64, 1);

View File

@@ -7,4 +7,5 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
DECL_FATTN_MMA_F16_CASE(192, 128, 8, 8);
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);