[rocm-libraries] ROCm/rocm-libraries#4302 (commit e62bd8a)

[CK_TILE] add tf32 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Proposed changes

TF32 is added in CK on gfx942 and gfx950. This PR is to initiate tf32 in
CK_TILE on gfx942 and gfx950.

## Checklist

Please put an into the boxes that apply. You can also fill these out
after creating the PR. If you're not sure, please don't hesitate to ask.

- [ ] I have added tests relevant to the introduced functionality, and
the unit tests are passing locally
- [ ] I have added the test to REGRESSION_TESTS list defined at the top
of CMakeLists.txt in tests/CMakeLists.txt, **IF** the test takes more
than 30 seconds to run.
- [ ] I have added inline documentation which enables the maintainers
with understanding the motivation
- [ ] I have removed the stale documentation which is no longer relevant
after this pull request
- [ ] (If this change is user-facing) I have added release notes which
provide the end users with a brief summary of the improvement from this
pull request
- [x] I have run  on all changed files
- [ ] Any dependent changes have been merged

## Discussion
This commit is contained in:
yinglu
2026-03-19 09:19:06 +00:00
committed by assistant-librarian[bot]
parent 652d3456ca
commit d460ab35b6
30 changed files with 1164 additions and 260 deletions

View File

@@ -89,19 +89,32 @@ struct CShuffleEpilogue
remove_cvref_t<BsDataType>,
remove_cvref_t<tuple<BsDataType>>>;
using ADataType = remove_cvref_t<std::tuple_element_t<number<0>{}, AsDataTypeTuple>>;
using BDataType = remove_cvref_t<std::tuple_element_t<number<0>{}, BsDataTypeTuple>>;
// ADataTypeCompute: compute type from Problem (may be tf32_t for TF32 mode)
using ADataTypeCompute = remove_cvref_t<std::tuple_element_t<number<0>{}, AsDataTypeTuple>>;
using BDataTypeCompute = remove_cvref_t<std::tuple_element_t<number<0>{}, BsDataTypeTuple>>;
using ATypeToUse = std::conditional_t<std::is_same_v<ADataType, pk_int4_t> ||
std::is_same_v<ADataType, pk_fp4_t>,
BDataType,
ADataType>;
// ADataTypeBuf: buffer/storage type (fp32 when tf32)
using ADataTypeBuf = if_select_t<ADataTypeCompute, tf32_t, float, ADataTypeCompute>;
using BDataTypeBuf = if_select_t<BDataTypeCompute, tf32_t, float, BDataTypeCompute>;
// For warp gemm selection: use tf32_t if compute type was tf32_t
// For pk_int4/pk_fp4: use the other data type
using ATypeToUse =
std::conditional_t<std::is_same_v<ADataTypeCompute, tf32_t>,
tf32_t,
std::conditional_t<std::is_same_v<ADataTypeBuf, pk_int4_t> ||
std::is_same_v<ADataTypeBuf, pk_fp4_t>,
BDataTypeBuf,
ADataTypeBuf>>;
// Used for weight-only quantization kernel, B would be dequantized to the same data type as A
using BTypeToUse = std::conditional_t<std::is_same_v<BDataType, pk_int4_t> ||
std::is_same_v<BDataType, pk_fp4_t> ||
sizeof(BDataType) < sizeof(ADataType),
ADataType,
BDataType>;
using BTypeToUse =
std::conditional_t<std::is_same_v<BDataTypeCompute, tf32_t>,
tf32_t,
std::conditional_t<std::is_same_v<BDataTypeBuf, pk_int4_t> ||
std::is_same_v<BDataTypeBuf, pk_fp4_t> ||
sizeof(BDataTypeBuf) < sizeof(ADataTypeBuf),
ADataTypeBuf,
BDataTypeBuf>>;
using ELayout = remove_cvref_t<typename Problem::ELayout>;
using CDElementwise = remove_cvref_t<typename Problem::CDElementwise>;
@@ -137,7 +150,7 @@ struct CShuffleEpilogue
[[nodiscard]] CK_TILE_HOST static const std::string GetName()
{
// clang-format off
return concat('_', "CShuffleEpilogue",
return concat('_', "CShuffleEpilogue",
concat('x', MWave, NWave),
concat('x', MPerXdl, NPerXdl, KPerXdl),
VectorSizeC,
@@ -440,8 +453,8 @@ struct CShuffleEpilogue
constexpr int RakedXDLN_PerWarp = NumNXdlPerWavePerShuffle / BlockedXDLN_PerWarp;
// BlockedLayout
// this branch is for original a16w4
if constexpr(is_950 || is_any_of<ADataType, pk_int4_t, pk_fp4_t>::value ||
is_any_of<BDataType, pk_int4_t, pk_fp4_t>::value)
if constexpr(is_950 || is_any_of<ADataTypeBuf, pk_int4_t, pk_fp4_t>::value ||
is_any_of<BDataTypeBuf, pk_int4_t, pk_fp4_t>::value)
{
if constexpr(EightWave)
{