diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp index 56d3b48547..12d28f572c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp @@ -253,12 +253,12 @@ struct DeviceMoeGemmBlockScale // rotating mem rotating_mem.Next(); // clear c mem - // if(arg_.KBatch > 1) - // hipGetErrorString(hipMemsetAsync(arg_.p_c_grid, - // 0, - // arg_.M * arg_.N * sizeof(CDataType) - // * (IsInputGemm && IsSplitK ? 2 : 1), - // stream_config.stream_id_)); + if(arg_.KBatch > 1) + hipGetErrorString(hipMemsetAsync(arg_.p_c_grid, + 0, + arg_.M * arg_.N * sizeof(CDataType) * + (IsInputGemm && IsSplitK ? 2 : 1), + stream_config.stream_id_)); }; ave_time = ck::utility::launch_and_time_kernel_with_preprocess( @@ -272,12 +272,12 @@ struct DeviceMoeGemmBlockScale } else { - // if(arg.KBatch > 1) - // hipGetErrorString(hipMemsetAsync(arg.p_c_grid, - // 0, - // arg.M * arg.N * sizeof(CDataType) * - // (IsInputGemm && IsSplitK ? 2 : 1), - // stream_config.stream_id_)); + if(arg.KBatch > 1) + hipGetErrorString(hipMemsetAsync(arg.p_c_grid, + 0, + arg.M * arg.N * sizeof(CDataType) * + (IsInputGemm && IsSplitK ? 2 : 1), + stream_config.stream_id_)); ave_time = launch_and_time_kernel( stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);