moe fp8 blockscale: use non-temporal (nt) load for B (#3524)

* nt on fp8 blockscale

* some improvements; tests need to be fixed

* update

* fix format

* revert useless change

* revert any change in amd_buffer_coherence
This commit is contained in:
yadaish
2026-01-12 10:48:10 +08:00
committed by GitHub
parent 4216d43da8
commit 32408c8bc0
3 changed files with 63 additions and 34 deletions

View File

@@ -80,7 +80,8 @@ template <typename ALayout,
typename ComputeTypeA = CDataType,
typename ComputeTypeB = ComputeTypeA,
typename LDSTypeA = ComputeTypeA,
typename LDSTypeB = ComputeTypeB>
typename LDSTypeB = ComputeTypeB,
bool NonTemporalLoadB = false>
struct DeviceMoeGemmBlockScale
: public DeviceGemmMultipleD_BlockScale_BPreshuffle<ALayout,
BLayout,
@@ -163,7 +164,8 @@ struct DeviceMoeGemmBlockScale
ComputeTypeA,
ComputeTypeB,
LDSTypeA,
LDSTypeB>;
LDSTypeB,
NonTemporalLoadB>;
// Two instantiations of GridwiseGemmBase that differ only in the template argument.
// The 64 variant passes math::max(NXdlPerWave64, 1), so its argument is never below 1
// (assuming math::max is the usual maximum; it is defined outside this view).
using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
// The 32 variant passes NXdlPerWave32 through unmodified.
using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;