From 5f9b03746d65942cbbfbbd9a2ca8b59b49ad7adc Mon Sep 17 00:00:00 2001
From: Amir Ghamarian <amir.ghamarian@amd.com>
Date: Sat, 28 Mar 2026 11:28:21 +0000
Subject: [PATCH] Expand medium tier dispatch threshold for multi-seq prefill

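Raise the medium tier boundary from avg_q <= kBlockQ_medium * 2 to
avg_q <= kBlockQ_medium * 8, where kBlockQ_medium = 128 /
num_queries_per_kv. With num_queries_per_kv = 8 this moves the cutoff
from avg_q <= 32 to avg_q <= 128. Prefill shapes with 76-253 sequences
(avg query length 32-108) are now routed to the 4-warp kBlockM=128
kernel instead of the 8-warp kBlockM=256 kernel.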

Results on affected shapes (via aiter), given as the ratio vs Triton,
before -> after this change:
  76-seq prefill:  1.44x -> 1.03x (near-tied)
  157-seq prefill: 1.61x -> 1.02x
  181-seq prefill: 1.74x -> 1.00x (tied)
  229-seq prefill: 1.77x -> 1.05x

Made-with: Cursor
---
 example/ck_tile/42_unified_attention/unified_attention.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/ck_tile/42_unified_attention/unified_attention.cpp b/example/ck_tile/42_unified_attention/unified_attention.cpp
index 5f9fb42c13..e642f3a60f 100644
--- a/example/ck_tile/42_unified_attention/unified_attention.cpp
+++ b/example/ck_tile/42_unified_attention/unified_attention.cpp
@@ -50,7 +50,7 @@ static tile_tier select_tile_tier(const unified_attention_args& args)
         return tile_tier::small;   // pure decode: 2 warps, kBlockM=64
 
     const index_t kBlockQ_medium = 128 / args.num_queries_per_kv; // kBlockQ for 4-warp kernel
-    if(avg_q <= kBlockQ_medium * 2)
+    if(avg_q <= kBlockQ_medium * 8)
         return tile_tier::medium;  // many short seqs: 4 warps, kBlockM=128
 
     return tile_tier::large;       // long prefill: 8 warps, kBlockM=256