mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 05:01:25 +00:00
* Support 16x16 (MFMA, WMMA) and 32x32 (MFMA) tiles in fwd and bwd BlockDropout
Add comments with dropout implementation details
Fix performance regression of fwd+dropout
* Remove some usage of type punning (reinterpret_cast with ref or ptr) in Philox;
* "scalarize" seed and offset, they may come either from kernel args or from device memory
(presumably loaded with vector loads).
These changes help the compiler produce more optimal code and reduce register spilling.
Use WarpGemmDispatcher instead of explicit WarpGemmMfma... to get CWarpDstrEncoding
Use code based on BlockDropout in BlockDropoutBwd
Refactor BlockDropout (fwd)
Implement BlockDropout (fwd) for WMMA
Originally BlockDropout only supported 32x32 tiles (IsWG32 = true),
this version supports 16x16 tiles.
If MPerBlock > MWarp * 16, it can generate numbers for two 16x16 tiles, similarly
to BlockDropoutBwd.
Implement BlockDropoutBwd for WMMA
Remove MakeRandValLds* functions unused in BlockDropoutBwd
Remove unused Run overload from BlockDropoutBwd
* Fix regression with philox seed and offset when they exceed 32-bit int
__builtin_amdgcn_readfirstlane works with 32-bit values, seed and offset
are 64-bit so they get truncated.
* Add F32 MFMA warp gemms
* Support f32 in fwd FMHA
* Implement transpose_vectors for 4-byte types (float)
* Fix unexpected implicit f32->uint32 cast in buffer_store<4>
__builtin_amdgcn_raw_buffer_store_b32 expects unsigned int but float was passed (implicitly cast to uint).
mbuf_t types in other buffer_store<> are changed for consistency.
* Support F32 in bwd FMHA
hdim = 256 is disabled for now because it uses too much memory on gfx90a
* Support Headdim = 48 (divisible by 16) in fwd
* Add fp32-specific receipts (800 and 801)
* Tune fwd tiles
* Tune bwd tiles
* Use small tiles only for small seqlen_q
* Fix after rebasing
* Fix selection of a fallback tile based on bm0
The assumption that the largest bm0 == 128 is not always true for
current fp32 tiles.
* Remove constraints and adjust filtering for fp32
Custom constraints are no longer needed because now the smallest tile
is selected automatically based on seqlen_q.
Filters related to qr_async_trload disabled valid fp32 tiles.
* Add fp32 tests
* Make splitkv and appendkv compile for fp32 only
There are no instances yet, but API still must compile when only fp32 is
requested.
* Remove unimportant f32 instances
* Add test_ck_tile_fmha_*_fp32 to REGRESSION_TESTS
* Replace magic numbers with a constant, improve comments for dropout
* Update changelog
* Fix condition that dq_acc must be set to zero when mask is used
The change was introduced in #2799
* Replace warp_uniform with recently added amd_wave_read_first_lane
* Add hdim = 96 and 192 to fwd
405 lines
18 KiB
C++
405 lines
18 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck_tile/core.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp"
|
|
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp"
|
|
|
|
namespace ck_tile {
|
|
|
|
// fp32
|
|
|
|
using WarpGemmMfmaF32F32F32M16N16K4 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF32F32F32M16N16K4<WGAttrCtlEnum::Default_>>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF32F32F32M16N16K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImplF32F32F32M16N16K4<WGAttrCtlEnum::Default_>,
|
|
4,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF32F32F32M16N16K16TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplF32F32F32M16N16K4<WGAttrCtlEnum::Default_>,
|
|
4,
|
|
AttrNumAccess>>;
|
|
|
|
// fp16
|
|
|
|
using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfmaF16F16F32M16N16K16 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
|
|
|
|
#if defined(__gfx950__)
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
#else
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
2,
|
|
AttrNumAccess>>;
|
|
#endif
|
|
|
|
#if defined(__gfx950__)
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
#else
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
|
|
2,
|
|
AttrNumAccess>>;
|
|
#endif
|
|
|
|
using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA<
|
|
WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
1>>;
|
|
|
|
using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA<
|
|
WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
2>>;
|
|
|
|
using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
|
|
|
|
#if defined(__gfx950__)
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
#else
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
2,
|
|
AttrNumAccess>>;
|
|
#endif
|
|
|
|
#if defined(__gfx950__)
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
#else
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
|
|
2,
|
|
AttrNumAccess>>;
|
|
#endif
|
|
|
|
#if defined(__gfx950__)
|
|
using WarpGemmMfmaF16F16F32M16N16K32SwizzleBTransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
|
|
WarpGemmAttributeMfmaImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>,
|
|
1>>;
|
|
|
|
using WarpGemmMfmaBf16Bf16F32M16N16K32SwizzleBTransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32<WGAttrCtlEnum::Default_>,
|
|
1>>;
|
|
#endif
|
|
|
|
using WarpGemmMfmaF16F16F32M32N32K8SwizzleBTransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
|
|
WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
#if defined(__gfx950__)
|
|
using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
|
|
WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
|
|
#else
|
|
using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
|
|
WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
2>>;
|
|
#endif
|
|
|
|
using WarpGemmMfmaF16F16F32M4N64K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImplF16F16F32M4N64K4<WGAttrCtlEnum::Default_>,
|
|
4>>;
|
|
|
|
using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImplF16F16F32M64N4K4<WGAttrCtlEnum::Default_>,
|
|
4>>;
|
|
|
|
// fp16 2:4 structured sparsity
|
|
using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmSmfmacImpl<WarpGemmAttributeSmfmac<
|
|
WarpGemmAttributeSmfmacImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmSmfmacImpl<WarpGemmAttributeSmfmac<
|
|
WarpGemmAttributeSmfmacImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>>>;
|
|
|
|
// bf16
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfmaBf16Bf16F32M16N16K16 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
|
|
|
|
#if defined(__gfx950__)
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
#else
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
2,
|
|
AttrNumAccess>>;
|
|
#endif
|
|
|
|
#if defined(__gfx950__)
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
#else
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
|
|
2,
|
|
AttrNumAccess>>;
|
|
#endif
|
|
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
1>>;
|
|
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
2>>;
|
|
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
|
|
|
|
#if defined(__gfx950__)
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
#else
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
2,
|
|
AttrNumAccess>>;
|
|
#endif
|
|
|
|
#if defined(__gfx950__)
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
#else
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
|
|
2,
|
|
AttrNumAccess>>;
|
|
#endif
|
|
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleBTransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
#if defined(__gfx950__)
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
|
|
#else
|
|
using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
|
|
2>>;
|
|
#endif
|
|
|
|
using WarpGemmMfmaBf16Bf16F32M4N64K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4<WGAttrCtlEnum::Default_>,
|
|
4>>;
|
|
|
|
using WarpGemmMfmaBf16Bf16F32M64N4K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4<WGAttrCtlEnum::Default_>,
|
|
4>>;
|
|
|
|
// fp8
|
|
|
|
using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_32x32x16_bf8_bf8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_32x32x32_fp8_fp8 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>,
|
|
2>>;
|
|
|
|
using WarpGemmMfma_f32_32x32x32_bf8_bf8 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>,
|
|
2>>;
|
|
|
|
using WarpGemmMfma_f32_16x16x32_fp8_fp8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_16x16x32_fp8_fp8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_16x16x32_bf8_bf8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_16x16x32_bf8_bf8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_16x16x64_fp8_fp8 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>,
|
|
2>>;
|
|
|
|
using WarpGemmMfma_f32_16x16x64_bf8_bf8 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
|
WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_>,
|
|
2>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_16x16x128_fp8_fp8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_16x16x128_fp8_bf8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_16x16x128_bf8_fp8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_16x16x128_bf8_bf8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_16x16x128_fp8_fp8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_16x16x128_fp8_bf8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_16x16x128_bf8_fp8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_16x16x128_bf8_bf8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_32x32x64_fp8_fp8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_32x32x64_fp8_bf8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_32x32x64_bf8_fp8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
|
|
using WarpGemmMfma_f32_32x32x64_bf8_bf8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8<WGAttrCtlEnum::Default_>,
|
|
AttrNumAccess>>;
|
|
|
|
using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
template <index_t swizzle_factor = 2>
|
|
using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
|
|
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, WGAttrCtlEnum::Default_>,
|
|
2,
|
|
swizzle_factor>>;
|
|
|
|
// int8
|
|
using WarpGemmMfma_i32_32x32x16_i8_i8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_i32_16x16x32_i8_i8 = WarpGemmImpl<
|
|
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
using WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed =
|
|
WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
|
|
WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_>>>;
|
|
|
|
} // namespace ck_tile
|