From d3fb5a9b8d02490cf16938644335df2a949d0ff3 Mon Sep 17 00:00:00 2001 From: illsilin Date: Fri, 28 Mar 2025 11:52:40 -0700 Subject: [PATCH] fix clang format --- .../moe_gemm2_xdl_fp8.cpp | 29 ++--- ...dlops_b_preshuffle_gufusion_dequant_v1.hpp | 123 +++++++++--------- ...peline_xdlops_b_preshuffle_gufusion_v1.hpp | 80 ++++++------ ..._pipeline_xdlops_b_preshuffle_selector.hpp | 92 +++++++------ .../blockwise_gemm_pipeline_xdlops_base.hpp | 4 +- ...ise_tensor_slice_transfer_v7r3_scatter.hpp | 4 +- include/ck/utility/dynamic_buffer.hpp | 21 +-- .../cpu/reference_moe_gemm.hpp | 20 +-- 8 files changed, 191 insertions(+), 182 deletions(-) diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp index 002497e9d2..299ccb6a3e 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp @@ -255,7 +255,7 @@ int main(int argc, char* argv[]) // max_token_id.mData[0] = valid_size; max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13}; int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3}; - //max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + // max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8}; // int eids[] = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} for(int i = 0; i < sorted_tile_num; i++) { @@ -419,20 +419,19 @@ int main(int argc, char* argv[]) Tensor c_t_n({tokens, N}); - using ReferenceGemmInstance = - ck::tensor_operation::host::ReferenceMoeGemm2; - auto ref_moe_gemm = ReferenceGemmInstance{}; - auto ref_invoker = ref_moe_gemm.MakeInvoker(); - auto ref_argument = ref_moe_gemm.MakeArgument(sorted_token_ids, + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm2; + auto ref_moe_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_moe_gemm.MakeInvoker(); + auto ref_argument = ref_moe_gemm.MakeArgument(sorted_token_ids, expert_ids, max_token_id, MPerBlock, diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp index 3df1a3350d..ce102ff1ad 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp @@ -58,45 +58,45 @@ template -struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1 - : BlockwiseGemmXdlops_pipeline_base +struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< + BlockGemmPipelineScheduler::Intrawave, + BlockSize, + ADataType, + BDataType, + ComputeDataType, + AccDataType, + ATileDesc, + BTileDesc, + AMmaTileDesc, + BMmaTileDesc, + ABlockTransferSrcScalarPerVector, + BBlockTransferSrcScalarPerVector, + MPerBlock, + NPerBlock, + KPerBlock, + MPerXDL, + NPerXDL, + MRepeat, + NRepeat, + KPack> : BlockwiseGemmXdlops_pipeline_base { using Base = BlockwiseGemmXdlops_pipeline_base{}> b_thread_dequant_bufs; - StaticallyIndexedArray{}> b_thread_dequant_bufs_up; + StaticallyIndexedArray{}> + b_thread_dequant_bufs_up; // Global prefetch A1 B1 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); @@ -268,10 +269,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1VGPR dequant b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1, - b_block_origin_idx, - b_thread_bufs(I0), - b_thread_desc_, - make_tuple(I0, I0, I0, I0), - b_thread_dequant_bufs(I0)); + b_block_origin_idx, + b_thread_bufs(I0), + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_dequant_bufs(I0)); b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1, b_block_origin_idx, b_thread_bufs_up(I0), @@ -330,10 +331,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1{}]; b_thread_vec_up.template AsType()(ik) = - b_thread_dequant_bufs_up[mfma_reg_buf] - [Number{}]; + b_thread_dequant_bufs_up + [mfma_reg_buf][Number{}]; }); using mfma_input_type = typename vector_type struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1 + BlockSize, + ADataType, + BDataType, + ComputeDataType, + AccDataType, + ATileDesc, + BTileDesc, + AMmaTileDesc, + BMmaTileDesc, + ABlockTransferSrcScalarPerVector, + BBlockTransferSrcScalarPerVector, + MPerBlock, + NPerBlock, + KPerBlock, + MPerXDL, + NPerXDL, + MRepeat, + NRepeat, + KPack> : BlockwiseGemmXdlops_pipeline_base{}([&](auto i) { ignore = i; @@ -276,10 +277,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1{}]; b_thread_vec_up.template AsType()(ik) = b_thread_bufs_up[mfma_reg_buf] - [Number{}]; + [Number{}]; }); using mfma_input_type = typename vector_type(), b_thread_vec.template AsType(), c_thread_buf.GetVectorTypeReference(Number{})); - + xdlops_gemm.Run( a_thread_vec.template AsType(), b_thread_vec_up.template AsType(), @@ -410,10 +411,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1(), b_thread_vec.template AsType(), c_thread_buf.GetVectorTypeReference(Number{})); - + xdlops_gemm.Run(a_thread_vec.template AsType(), b_thread_vec_up.template AsType(), c_thread_buf_up.GetVectorTypeReference(Number{})); @@ -537,7 +538,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1(), c_thread_buf.GetVectorTypeReference(Number{})); xdlops_gemm.Run(a_thread_vec.template AsType(), - b_thread_vec.template AsType(), + b_thread_vec.template AsType(), c_thread_buf_up.GetVectorTypeReference(Number{})); }); }); @@ -567,7 +568,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1{}, I1, Number{}, Number{})); static constexpr BTileDesc b_block_desc_n0_n1_k0_k1; - }; } // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp index 190ea1ce27..074b5873ee 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp @@ -43,53 +43,58 @@ constexpr auto BlockGemmBPreshufflePipeline_Selector() { if constexpr(std::is_same::value) { - if constexpr(GUFusion) { - return BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1{}; - } else { + if constexpr(GUFusion) + { + return BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1< + BlkGemmPipeSche, + BlockSize, + ADataType, + BDataType, + ComputeDataType, + AccDataType, + ATileDesc, + BTileDesc, + AMmaTileDesc, + BMmaTileDesc, + ABlockTransferSrcScalarPerVector, + BBlockTransferSrcScalarPerVector, + MPerBlock, + NPerBlock, + KPerBlock, + MPerXDL, + NPerXDL, + MRepeat, + NRepeat, + KPack>{}; + } + else + { return BlockwiseGemmXdlops_pipeline_bpreshuffle_v1{}; + BlockSize, + ADataType, + BDataType, + ComputeDataType, + AccDataType, + ATileDesc, + BTileDesc, + AMmaTileDesc, + BMmaTileDesc, + ABlockTransferSrcScalarPerVector, + BBlockTransferSrcScalarPerVector, + MPerBlock, + NPerBlock, + KPerBlock, + MPerXDL, + NPerXDL, + MRepeat, + NRepeat, + KPack>{}; } } else { - if constexpr(GUFusion) { + if constexpr(GUFusion) + { return BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlkGemmPipeSche, BlockSize, @@ -112,7 +117,8 @@ constexpr auto BlockGemmBPreshufflePipeline_Selector() NRepeat, KPack>{}; } - else { + else + { return BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1< BlkGemmPipeSche, BlockSize, diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp index 426e7a6a8c..d086201dd7 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp @@ -335,9 +335,7 @@ struct BlockwiseGemmXdlops_pipeline_base return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( c_grid_desc_g_m0_n0_m1_n1_m2_n2); } - __host__ __device__ static constexpr auto GetCThreadDesc() { - return c_thread_desc_; - } + __host__ __device__ static constexpr auto GetCThreadDesc() { return c_thread_desc_; } static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k; static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp index 1a3b4517a0..7b912ef362 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp @@ -397,7 +397,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter // loop over space-filling curve static_for<0, dst_num_access, 1>{}([&](auto iAccess) { - auto dst_vectors = dst_vectors_tuple_[thread_scratch_id][iAccess]; + auto dst_vectors = dst_vectors_tuple_[thread_scratch_id][iAccess]; IndexType scatter_offset = 0; if constexpr(OutputScatter) { @@ -408,7 +408,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter // copy data from buf_vectors into dst_bufs static_for<0, nDst, 1>{}([&](auto i) { using dst_vector_t = typename remove_cvref_t::type; - IndexType dst_offset = scatter_offset + (dst_coords_[i].GetOffset()); + IndexType dst_offset = scatter_offset + (dst_coords_[i].GetOffset()); const bool is_dst_valid = dst_offset < dst_descs[i].GetElementSpaceSize(); // coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_descs[i], // dst_coords_[i]); diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 04d3a9790a..1d80f196b5 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -25,7 +25,7 @@ template + typename IndexType = index_t> struct DynamicBuffer { using type = T; @@ -380,13 +380,14 @@ struct DynamicBuffer (is_same_v, half_t> && scalar_per_x_vector % 2 == 0) || (is_same_v, bhalf_t> && scalar_per_x_vector % 2 == 0); #elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT) - bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t) && is_same_v, int32_t>; + bool constexpr use_amd_buffer_addressing = + sizeof(IndexType) <= sizeof(int32_t) && is_same_v, int32_t>; #elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT bool constexpr use_amd_buffer_addressing = - sizeof(IndexType) <= sizeof(int32_t) && ( - is_same_v, float> || - (is_same_v, half_t> && scalar_per_x_vector % 2 == 0) || - (is_same_v, bhalf_t> && scalar_per_x_vector % 2 == 0)); + sizeof(IndexType) <= sizeof(int32_t) && + (is_same_v, float> || + (is_same_v, half_t> && scalar_per_x_vector % 2 == 0) || + (is_same_v, bhalf_t> && scalar_per_x_vector % 2 == 0)); #else bool constexpr use_amd_buffer_addressing = false; #endif @@ -424,8 +425,9 @@ struct DynamicBuffer static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem"); #if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 - using scalar_t = typename scalar_type>::type; - bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t) && is_same_v, double>; + using scalar_t = typename scalar_type>::type; + bool constexpr use_amd_buffer_addressing = + sizeof(IndexType) <= sizeof(int32_t) && is_same_v, double>; #else bool constexpr use_amd_buffer_addressing = false; #endif @@ -462,7 +464,8 @@ template -__host__ __device__ constexpr auto make_long_dynamic_buffer(T* p, ElementSpaceSize element_space_size) +__host__ __device__ constexpr auto make_long_dynamic_buffer(T* p, + ElementSpaceSize element_space_size) { return DynamicBuffer{ p, element_space_size}; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp index 8c50923912..a50307e86f 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp @@ -113,7 +113,7 @@ struct ReferenceMoeGemm : public device::BaseOperator #if CK_USE_PK4_LAYOUT_SHUFFLE v_a = i4_to_f32_gfx9(i4); #else - v_a = i4 - 8; + v_a = i4 - 8; #endif } else @@ -123,23 +123,25 @@ struct ReferenceMoeGemm : public device::BaseOperator // same for B matrix if constexpr(is_same_v) { - uint8_t i4x2 = arg.b_e_n_k_(e, k, n).data; + uint8_t i4x2 = arg.b_e_n_k_(e, k, n).data; uint8_t i4x2_up = arg.b_e_n_k_(e, k, n + full_n).data; - uint8_t i4 = 0; + uint8_t i4 = 0; uint8_t i4_up = 0; - if(k % 2 == 1) { - i4 = (i4x2 >> 0) & 0xf; + if(k % 2 == 1) + { + i4 = (i4x2 >> 0) & 0xf; i4_up = (i4x2_up >> 0) & 0xf; } - else { - i4 = (i4x2 >> 4) & 0xf; + else + { + i4 = (i4x2 >> 4) & 0xf; i4_up = (i4x2_up >> 4) & 0xf; } #if CK_USE_PK4_LAYOUT_SHUFFLE - v_b = i4_to_f32_gfx9(i4); + v_b = i4_to_f32_gfx9(i4); v_b_up = i4_to_f32_gfx9(i4_up); #else - v_b = i4 - 8; + v_b = i4 - 8; v_b_up = i4_up - 8; #endif }