[Performance] Use N0Sub=16 for trload with softmax pipeline to reduce vgpr spilling

This commit is contained in:
Qianfeng Zhang
2026-02-02 15:59:38 +00:00
parent c360e0cbc4
commit 0a8c5f523a

View File

@@ -260,7 +260,7 @@ struct HstuAttentionWithSoftmaxFwdBlockTile<96>
// Block-tile configuration of HstuAttentionWithSoftmaxFwdBlockTile for the
// template argument 128 (presumably the head dimension; confirm against the
// primary template, which is not visible here).
template <>
struct HstuAttentionWithSoftmaxFwdBlockTile<128>
{
    // Tile-size sequence. A class may declare the member alias `type` only once,
    // so only the post-change value is kept here.
    // Previous value: ck_tile::sequence<128, 64, 32, 128, 32, 128>.
    // The third entry (N0Sub according to the commit title; TODO confirm the
    // meaning of the remaining entries) was lowered from 32 to 16 to reduce
    // VGPR spilling for the trload + softmax pipeline.
    using type = ck_tile::sequence<128, 64, 16, 128, 32, 128>;

    // Warp arrangement sequences for the two GEMM stages (names suggest
    // gemm0 and gemm1 of the attention pipeline; verify with the consumer).
    using gemm0_warps = ck_tile::sequence<4, 1, 1>;
    using gemm1_warps = ck_tile::sequence<4, 1, 1>;
};
@@ -268,7 +268,7 @@ struct HstuAttentionWithSoftmaxFwdBlockTile<128>
// Block-tile configuration of HstuAttentionWithSoftmaxFwdBlockTile for the
// template argument 256 (presumably the head dimension; confirm against the
// primary template, which is not visible here).
template <>
struct HstuAttentionWithSoftmaxFwdBlockTile<256>
{
    // Tile-size sequence. A class may declare the member alias `type` only once,
    // so only the post-change value is kept here.
    // Previous value: ck_tile::sequence<128, 64, 32, 256, 32, 256>.
    // The third entry (N0Sub according to the commit title; TODO confirm the
    // meaning of the remaining entries) was lowered from 32 to 16 to reduce
    // VGPR spilling for the trload + softmax pipeline.
    using type = ck_tile::sequence<128, 64, 16, 256, 32, 256>;

    // Warp arrangement sequences for the two GEMM stages (names suggest
    // gemm0 and gemm1 of the attention pipeline; verify with the consumer).
    using gemm0_warps = ck_tile::sequence<4, 1, 1>;
    using gemm1_warps = ck_tile::sequence<4, 1, 1>;
};