MoE FP8 blockscale: use non-temporal (nt) load for B (#3524)

* nt on fp8 blockscale * some improvements; tests need to be fixed * update * fix format * revert useless change * revert any change in amd_buffer_coherence
2026-04-19 22:39:03 +00:00 · 2026-01-12 10:48:10 +08:00
parent 4216d43da8
commit 32408c8bc0
3 changed files with 63 additions and 34 deletions
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
@@ -80,7 +80,8 @@ template <typename ALayout,
          typename ComputeTypeA                       = CDataType,
          typename ComputeTypeB                       = ComputeTypeA,
          typename LDSTypeA                           = ComputeTypeA,
-          typename LDSTypeB                           = ComputeTypeB>
+          typename LDSTypeB                           = ComputeTypeB,
+          bool NonTemporalLoadB                       = false>
 struct DeviceMoeGemmBlockScale
    : public DeviceGemmMultipleD_BlockScale_BPreshuffle<ALayout,
                                                        BLayout,
@@ -163,7 +164,8 @@ struct DeviceMoeGemmBlockScale
        ComputeTypeA,
        ComputeTypeB,
        LDSTypeA,
-        LDSTypeB>;
+        LDSTypeB,
+        NonTemporalLoadB>;
    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;