Use new mfma instructions for FP8 on gfx950 (#2202)

* Add logic to use the new mfma instructions for fp8 and bf8

* Fix example_gemm_xdl_fp8_pk_i4_bpreshuffle_v3 on gfx950 and run clang-format

* Update include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp

Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com>

* Fix intrin_mfma f8 calls that were broken by a merge mistake

---------

Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com>
This commit is contained in:
jefyang1
2025-05-19 17:29:51 -07:00
committed by GitHub
parent 57e0f5df29
commit f18170064d
34 changed files with 548 additions and 180 deletions

View File

@@ -124,7 +124,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3<BlockGemmPipelineSch
using Base::I1;
using Base::I2;
using Base::KRepeat;
using Base::xdlops_gemm;
using typename Base::HotLoopInstList;
using Base::a_block_desc_m0_m1_m2_k;
@@ -145,6 +144,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3<BlockGemmPipelineSch
using Base::MWaves;
static constexpr auto xdlops_gemm =
XdlopsGemm<ComputeDataType, MPerXDL, NPerXDL, KPack, BDataType>{};
static constexpr index_t PrefetchStages = 2;
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = 1;