mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
moe fp8 blockscale use nt (#3524)
* nt (non-temporal load) on fp8 blockscale
* some improvements; tests need to be fixed
* update
* fix format
* revert useless change
* revert any change in amd_buffer_coherence
This commit is contained in:
@@ -80,7 +80,8 @@ template <typename ALayout,
|
||||
typename ComputeTypeA = CDataType,
|
||||
typename ComputeTypeB = ComputeTypeA,
|
||||
typename LDSTypeA = ComputeTypeA,
|
||||
-typename LDSTypeB = ComputeTypeB>
|
||||
+typename LDSTypeB = ComputeTypeB,
|
||||
+bool NonTemporalLoadB = false>
|
||||
struct DeviceMoeGemmBlockScale
|
||||
: public DeviceGemmMultipleD_BlockScale_BPreshuffle<ALayout,
|
||||
BLayout,
|
||||
@@ -163,7 +164,8 @@ struct DeviceMoeGemmBlockScale
|
||||
ComputeTypeA,
|
||||
ComputeTypeB,
|
||||
LDSTypeA,
|
||||
-LDSTypeB>;
|
||||
+LDSTypeB,
|
||||
+NonTemporalLoadB>;
|
||||
using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
|
||||
using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user