mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 11:30:02 +00:00
[Performance] Use N0Sub=16 for trload with softmax pipeline to reduce vgpr spilling
This commit is contained in:
@@ -260,7 +260,7 @@ struct HstuAttentionWithSoftmaxFwdBlockTile<96>
|
||||
template <>
|
||||
struct HstuAttentionWithSoftmaxFwdBlockTile<128>
|
||||
{
|
||||
-    using type = ck_tile::sequence<128, 64, 32, 128, 32, 128>;
|
||||
+    using type = ck_tile::sequence<128, 64, 16, 128, 32, 128>;
|
||||
using gemm0_warps = ck_tile::sequence<4, 1, 1>;
|
||||
using gemm1_warps = ck_tile::sequence<4, 1, 1>;
|
||||
};
|
||||
@@ -268,7 +268,7 @@ struct HstuAttentionWithSoftmaxFwdBlockTile<128>
|
||||
template <>
|
||||
struct HstuAttentionWithSoftmaxFwdBlockTile<256>
|
||||
{
|
||||
-    using type = ck_tile::sequence<128, 64, 32, 256, 32, 256>;
|
||||
+    using type = ck_tile::sequence<128, 64, 16, 256, 32, 256>;
|
||||
using gemm0_warps = ck_tile::sequence<4, 1, 1>;
|
||||
using gemm1_warps = ck_tile::sequence<4, 1, 1>;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user