[CK_TILE] Skip padded k/n fragment work in qr_hpad FMHA fwd (#6450)

## Motivation

`qr_hpad` currently executes work for padded head-dim fragments even when only a subset of the values is valid. This adds unnecessary computation for head dimensions that require padding, such as `hdim=72` and `hdim=80`, and hurts FMHA forward performance.

The goal of this PR is to make the padded-head-dim path skip invalid work based on the actual valid fragment count, while preserving the existing behavior for the non-padded path.

## Technical Details

This PR improves the `qr_hpad` FMHA forward path in three parts:

- Skip padded `k`/`n` fragments in the GEMM/pipeline path when only part of the fragment is valid.
- Add partial GEMM0 tail handling for `qr_hpad` so the kernel uses the valid fragment range instead of always computing over the padded extent.
- Retune the gfx11 `qr_hpad` kernel configuration after enabling the partial-fragment path.

To keep the existing path stable, the implementation adds overloads for the updated GEMM/pipeline interfaces. This allows existing full-tile callers to keep using the previous form, while the `qr_hpad` path can pass valid fragment counts when needed.

## Test Plan

```
./build/bin/tile_example_fmha_fwd -prec=bf16 -mode={0/1} -b=1 -h=16 -d={72/80} -s={seqlen} -s_k={seqlen} -lse=0 -iperm={0/1} -operm={0/1}
```

## Test Result

- On gfx11 and gfx12, for head dimensions that require padding, `tile_example_fmha_fwd` shows a performance improvement of about 20-30% at `hdim=72/80`.

## Submission Checklist

- [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
2026-05-18 20:09:25 +00:00 · 2026-04-18 02:44:46 -04:00
parent 907c6e94ae
commit f5e00ec904
4 changed files with 478 additions and 128 deletions
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -1194,18 +1194,15 @@ class KernelComponentFactoryGfx11(CompatibilityRuleFactory):
            if (problem_ctx.hdim, problem_ctx.hdim_v) != (128, 128):
                return True

-            is_64x32_tile = kernel_ctx.tile.F_bm0 == 64 and kernel_ctx.tile.F_bn0 == 32
-            pads_hdim = (
-                kernel_ctx.pipeline.F_dpad == "t" and kernel_ctx.pipeline.F_dvpad == "t"
-            )
-            exact_hdim = (
-                kernel_ctx.pipeline.F_dpad == "f" and kernel_ctx.pipeline.F_dvpad == "f"
-            )
+            # For (128, 128) head dims, partial-fragment support in qr_hpad removes the need
+            # for the previous qr_hpad-specific handling that was added to avoid register spill.
+            # qr_hpad now reuses the regular 128x64 tile choice.
+            # The 64x64 tile remains disabled for qr_hpad because it is consistently slower
+            # in our measurements.
+            if kernel_ctx.tile.F_bm0 == 64 and kernel_ctx.tile.F_bn0 == 64:
+                return kernel_ctx.pipeline.tag != "qr_hpad"

-            if is_64x32_tile:
-                return pads_hdim
-
-            return exact_hdim
+            return True

        rules.append(check_d128_tile_pipeline)
        return rules
@@ -1218,8 +1215,7 @@ class KernelComponentFactoryGfx11(CompatibilityRuleFactory):
                ( 32,  32) : [FmhaFwdTileSize( 64,  64,  16,  32,  32,   32,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
                ( 64,  64) : [FmhaFwdTileSize( 64,  64,  32,  64,  32,   64,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint("a.max_seqlen_q < 4096")),
                              FmhaFwdTileSize(128,  64,  32,  64,  32,   64,  8, 1, 1,  8, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
-                (128, 128) : [FmhaFwdTileSize( 64,  32,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,   6, CppConstraint("a.hdim_q != 128 || a.hdim_v != 128")),
-                              FmhaFwdTileSize( 64,  64,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint("a.max_seqlen_q < 4096")),
+                (128, 128) : [FmhaFwdTileSize( 64,  64,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint("a.max_seqlen_q < 2048")),
                              FmhaFwdTileSize(128,  64,  32, 128,  32,  128,  8, 1, 1,  8, 1, 1,  16, 16, 16,  16, 16, 16,   6)],
                (192, 128) : [FmhaFwdTileSize( 64,  64,  32, 128,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1)],
                (256, 256) : [FmhaFwdTileSize(128,  64,  32, 256,  32,  256,  8, 1, 1,  8, 1, 1,  16, 16, 16,  16, 16, 16,   6)]