From 2d518fbb1552ffe74800bc80a5b3fad3a8fecc28 Mon Sep 17 00:00:00 2001 From: yadaish Date: Thu, 18 Dec 2025 17:32:42 +0000 Subject: [PATCH] fix accuracy issue --- .../moe_gemm1_xdl_fp8_blockscale_splitk.cpp | 2 +- ...ise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale_splitk.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale_splitk.cpp index ad17d18824..08a1d6614e 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale_splitk.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale_splitk.cpp @@ -128,7 +128,7 @@ static constexpr ck::index_t CShuffleMXDLPerWave = MXDLPerWave; static constexpr ck::index_t CShuffleNXDLPerWave = NXDLPerWave; static constexpr ck::index_t BLOCKSIZE = 256; -static constexpr ck::index_t KPerBlock = 256 / sizeof(A0DataType); +static constexpr ck::index_t KPerBlock = 128 / sizeof(A0DataType); static constexpr ck::index_t AK1 = 16 / sizeof(A0DataType); static constexpr ck::index_t BK1 = 16 / sizeof(B0DataType); static constexpr ck::index_t EVec = 16 / sizeof(EDataType); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp index 59265502e8..a76be40753 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp @@ -360,6 +360,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1< }); }); + __builtin_amdgcn_sched_barrier(0); // Local prefill A1 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0); @@ -550,6 +551,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1< }); }); + 
__builtin_amdgcn_sched_barrier(0); a_scale_thread_copy.Run(a_scale_grid_desc, a_scale_grid_buf, a_scale_thread_desc, @@ -677,6 +679,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1< }); block_sync_lds(); + __builtin_amdgcn_sched_barrier(0); static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) {