Use new mfma instructions for FP8 on gfx950 (#2202)

* Add logic to use new mfma instructions for fp8 bf8
* Fix example_gemm_xdl_fp8_pk_i4_bpreshuffle_v3 on gfx950 and run clang format
* Update include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
  Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com>
* Fix intrin_mfma f8 calls due to merge mistake

---------

Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com>
2026-04-20 06:49:15 +00:00 · 2025-05-19 17:29:51 -07:00
parent 57e0f5df29
commit f18170064d
34 changed files with 548 additions and 180 deletions
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp
@@ -124,7 +124,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3<BlockGemmPipelineSch
    using Base::I1;
    using Base::I2;
    using Base::KRepeat;
-    using Base::xdlops_gemm;
    using typename Base::HotLoopInstList;

    using Base::a_block_desc_m0_m1_m2_k;
@@ -145,6 +144,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3<BlockGemmPipelineSch

    using Base::MWaves;

+    static constexpr auto xdlops_gemm =
+        XdlopsGemm<ComputeDataType, MPerXDL, NPerXDL, KPack, BDataType>{};
+
    static constexpr index_t PrefetchStages        = 2;
    static constexpr index_t PrefillStages         = 1;
    static constexpr index_t GlobalBufferNum       = 1;