Mx fp6 flatmm (#3601)

* add fp6 data-type and support sync/async dwordx3 load/store

* clang-format

* pre-commit

* 1st commit

* default mnk pass ut

* fix a distribution

* fix

* fix bdram distr

* update

* pass ut

* improve perf

* update

* clean code

* resolve copilot comment

* resolve comment

* clang-format

---------

Co-authored-by: ZheWang <zhewan@amd.com>
This commit is contained in:
ZheWang
2026-02-02 16:04:40 +08:00
committed by GitHub
parent 1ae83137eb
commit e6bcd192d4
21 changed files with 761 additions and 136 deletions

View File

@@ -179,10 +179,11 @@ auto preShuffleWeight(ck_tile::HostTensor<dtype>& src)
const int K = src_lengths[0];
const int N = src_lengths[1];
constexpr int packed_size = ck_tile::numeric_traits<dtype>::PackedSize;
int KPack = 16 * packed_size; // fp4:32 or fp8:16
int NLane = N_Warp_Tile;
int KLane = 64 / NLane;
int K0 = K / (KLane * KPack);
int KPack =
std::is_same_v<dtype, ck_tile::pk_fp6x16_t> ? 32 : 16 * packed_size; // fp4/fp6:32 or fp8:16
int NLane = N_Warp_Tile;
int KLane = 64 / NLane;
int K0 = K / (KLane * KPack);
ck_tile::HostTensor<dtype> shuffled(ck_tile::HostTensorDescriptor({N * K}, {1}));
@@ -295,7 +296,14 @@ int run_mx_flatmm_example(int argc, char* argv[])
}
else if(mx_prec == "fp6" || mx_prec == "fp6xfp6")
{
throw std::runtime_error("fp6xfp6 is not supported.");
if(persistent_opt == 0)
return run_mx_flatmm_with_layouts<ck_tile::pk_fp6x16_t,
ck_tile::pk_fp6x16_t,
ck_tile::fp16_t,
MXfp6_FlatmmConfig16,
false>(argc, argv, Row{}, Col{}, Row{});
else
throw std::runtime_error("Only support non-persistent kernel now!");
}
else if(mx_prec == "fp8" || mx_prec == "fp8xfp8")
{