MX GEMM - FP6 Example (#2419)

Adds support for the MX FP6 data type in the MX GEMM block pipeline v1.
Provides an example of the MX FP6 GEMM algorithm.

---------

Co-authored-by: OscarXu <huaiguxu@amd.com>
Co-authored-by: aska-0096 <haocwang@amd.com>
Co-authored-by: mtgu0705 <mtgu@amd.com>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: lalala-sh <Jiaxing.Wen@amd.com>
Co-authored-by: valarLip <340077269@qq.com>
Co-authored-by: Ding, Yi <yi.ding@amd.com>
Co-authored-by: feifei14119 <feiw@amd.com>
Co-authored-by: Lin, Qun <qlin@amd.com>
Co-authored-by: joye <joye@amd.com>
This commit is contained in:
Andriy Roshchenko
2025-07-07 10:33:26 -06:00
committed by GitHub
parent bfe573d3ba
commit 054f85ab7c
18 changed files with 578 additions and 95 deletions

View File

@@ -66,9 +66,12 @@ struct BlockwiseGemmXdlops_mx_pipeline_base
static constexpr index_t AMmaKStride = KPack;
static constexpr index_t BMmaKStride = KPack;
//> store rows/cols into thread registers in chunks of 16
//> e.g. [k0,...,k15,k64,...,k79] or [k0,...,k15,k32,...,k47]
static constexpr index_t KThreadChunk = 16 / sizeof(ComputeTypeA);
// Store rows/cols into thread registers in chunks of 16 for FP8
// (e.g. [k0,...,k15,k64,...,k79] or [k0,...,k15,k32,...,k47]),
// or in chunks of 32 / APackedSize for FP6/FP4.
static constexpr index_t KThreadChunk = (APackedSize == 1) ? 16 : 32 / APackedSize;
static_assert(APackedSize == BPackedSize, "APackedSize must be equal to BPackedSize for now");
static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
static constexpr index_t KRepeat = KPerThread / KPack;