[CK_TILE] Enable canonical-NaN BF16 conversion for FMHA on RDNA (#6253)

## Motivation - On gfx11/gfx12, the existing float -> bf16 conversion path in FMHA forward adds noticeable overhead and causes a meaningful performance gap versus fp16. The asm-based path (mode 3) does not improve this on RDNA and can perform even worse. - In particular, on gfx12, bf16 FMHA forward can be up to ~20% slower than the corresponding fp16 path. - This PR reduces that gap by switching FMHA forward to a different BF16 conversion strategy based on Triton’s canonical-NaN round-to-nearest-even behavior. ## Technical Details - Add a new `standard_cnan` BF16 conversion mode to CK Tile. - Implement a canonical-NaN RTN `float -> bf16` conversion path based on the Triton implementation. - Enable this conversion mode by default for FMHA forward builds targeting gfx11/gfx12. - Retune gfx11/gfx12 FMHA forward kernel selection thresholds for some `hdim=128` cases to keep kernel selection aligned with the updated conversion behavior. ## Test Plan ./build/bin/tile_example_fmha_fwd -prec=bf16 -mode={0/1} -b=1 -h=16 -d={hdim} -s={seqlen} -s_k={seqlen} -lse=0 -iperm={0/1} -operm={0/1} ## Test Result - all tests passed when running `test_ck_tile_fmha` - BF16 FMHA forward performance improves by up to ~5% on gfx11. - BF16 FMHA forward performance improves by up to ~10% on gfx12. ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
2026-05-18 03:49:41 +00:00 · 2026-04-20 14:52:24 -04:00
parent 3dbc77a678
commit fd1060f6fe
4 changed files with 69 additions and 4 deletions
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -180,6 +180,34 @@ if(CK_USE_OCP_FP8)
  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
 endif()

+set(FMHA_HAS_RDNA_TARGET OFF)
+set(FMHA_HAS_NON_RDNA_TARGET OFF)
+foreach(inst_target ${INST_TARGETS})
+  if(inst_target MATCHES "^(gfx11|gfx12)")
+    set(FMHA_HAS_RDNA_TARGET ON)
+  else()
+    set(FMHA_HAS_NON_RDNA_TARGET ON)
+  endif()
+endforeach()
+
+if(FMHA_HAS_RDNA_TARGET)
+  set(FMHA_FWD_RDNA_GEN_BLOBS)
+  foreach(fwd_blob ${FMHA_FWD_GEN_BLOBS})
+    if(fwd_blob MATCHES "_gfx1[12][^/]*\\.cpp$")
+      list(APPEND FMHA_FWD_RDNA_GEN_BLOBS ${fwd_blob})
+    endif()
+  endforeach()
+
+  if(FMHA_FWD_RDNA_GEN_BLOBS)
+    set_property(SOURCE ${FMHA_FWD_RDNA_GEN_BLOBS}
+      APPEND PROPERTY COMPILE_DEFINITIONS CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=5)
+  endif()
+
+  if(NOT FMHA_HAS_NON_RDNA_TARGET)
+    list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=5)
+  endif()
+endif()
+
 # use RTN_ASM on float to bfloat16 conversion by default, align with FA upstream
 list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3)
 list(APPEND FMHA_BWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3)
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -1183,8 +1183,6 @@ class KernelComponentFactoryGfx11(CompatibilityRuleFactory):
    def get_rules(cls) -> List[CompatibilityRule]:
        rules = super().get_rules()

-        # For gfx11 fp16/bf16 d128, use dpad=dvpad=t for the 64x32 tile:
-        # the exact-hdim variant (dpad=dvpad=f) is much slower here.
        def check_d128_tile_pipeline(
            problem_ctx: ProblemContext, kernel_ctx: KernelContext
        ) -> bool:
@@ -1215,6 +1213,7 @@ class KernelComponentFactoryGfx11(CompatibilityRuleFactory):
                ( 32,  32) : [FmhaFwdTileSize( 64,  64,  16,  32,  32,   32,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
                ( 64,  64) : [FmhaFwdTileSize( 64,  64,  32,  64,  32,   64,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint("a.max_seqlen_q < 4096")),
                              FmhaFwdTileSize(128,  64,  32,  64,  32,   64,  8, 1, 1,  8, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
+                # max_seqlen_q cutoff retuned after the bf16 standard_cnan change.
                (128, 128) : [FmhaFwdTileSize( 64,  64,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint("a.max_seqlen_q < 2048")),
                              FmhaFwdTileSize(128,  64,  32, 128,  32,  128,  8, 1, 1,  8, 1, 1,  16, 16, 16,  16, 16, 16,   6)],
                (192, 128) : [FmhaFwdTileSize( 64,  64,  32, 128,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
@@ -1278,7 +1277,8 @@ class KernelComponentFactoryGfx12(CompatibilityRuleFactory):
                #                             bm0, bn0, bk0, bn1, bk1,
                ( 32,  32) : [FmhaFwdTileSize( 64,  64,  16,  32,  32,   32,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
                ( 64,  64) : [FmhaFwdTileSize( 64,  64,  32,  64,  32,   64,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
-                (128, 128) : [FmhaFwdTileSize( 64,  64,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint("a.max_seqlen_q <= 8192")),
+                # max_seqlen_q cutoff retuned after the bf16 standard_cnan change.
+                (128, 128) : [FmhaFwdTileSize( 64,  64,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint("a.max_seqlen_q <= 4096")),
                              FmhaFwdTileSize(128,  64,  32, 128,  32,  128,  8, 1, 1,  8, 1, 1,  16, 16, 16,  16, 16, 16,   6)],
                (192, 128) : [FmhaFwdTileSize( 64,  64,  32, 128,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
                (256, 256) : [FmhaFwdTileSize( 64,  64,  32, 256,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],