MX GEMM - FP6 Example (#2419)

Adds support for the MX FP6 data type in version v1 of the MX GEMM block pipeline, and provides an example of the MX FP6 GEMM algorithm. --------- Co-authored-by: OscarXu <huaiguxu@amd.com> Co-authored-by: aska-0096 <haocwang@amd.com> Co-authored-by: mtgu0705 <mtgu@amd.com> Co-authored-by: Your Name <you@example.com> Co-authored-by: lalala-sh <Jiaxing.Wen@amd.com> Co-authored-by: valarLip <340077269@qq.com> Co-authored-by: Ding, Yi <yi.ding@amd.com> Co-authored-by: feifei14119 <feiw@amd.com> Co-authored-by: Lin, Qun <qlin@amd.com> Co-authored-by: joye <joye@amd.com>
2026-04-20 14:59:17 +00:00 · 2025-07-07 10:33:26 -06:00
parent bfe573d3ba
commit 054f85ab7c
18 changed files with 578 additions and 95 deletions
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp
@@ -66,9 +66,12 @@ struct BlockwiseGemmXdlops_mx_pipeline_base
    static constexpr index_t AMmaKStride = KPack;
    static constexpr index_t BMmaKStride = KPack;

-    //> store rows/cols into thread registers in chunks of 16
-    //> e.g. [k0,...,k15,k64,...,k79] or [k0,...,k15,k32,...,k47]
-    static constexpr index_t KThreadChunk = 16 / sizeof(ComputeTypeA);
+    // Size of the K chunks in which rows/cols are stored into thread registers:
+    // 16 when APackedSize == 1 (FP8), e.g. [k0,...,k15,k64,...,k79] or [k0,...,k15,k32,...,k47];
+    // otherwise 32 / APackedSize (FP6/FP4). Only APackedSize is consulted (see the assert below).
+    static constexpr index_t KThreadChunk = (APackedSize == 1) ? 16 : 32 / APackedSize;
+
+    static_assert(APackedSize == BPackedSize, "APackedSize must be equal to BPackedSize for now");

    static constexpr index_t KPerThread    = KPerBlock / xdlops_gemm.K0PerXdlops;
    static constexpr index_t KRepeat       = KPerThread / KPack;