From 2ff7ac5abc566cd0c43853a57dbeaa1f3325fded Mon Sep 17 00:00:00 2001 From: Aviral Goel Date: Fri, 10 Apr 2026 11:17:11 -0400 Subject: [PATCH] CK: Remove 41 commented-out dead code blocks (~200 lines) (#6302) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Depends on #6300 ## Summary Remove 41 commented-out code blocks across 33 files in Composable Kernel, totaling ~200 lines. Identified using an automated dead code scanning skill (`ck-dead-code`) with a calibrated two-stage pipeline: 1. **Pre-filter**: Keyword-based scan found 1,338 `//`-commented blocks. Calibrated heuristics (trained on 50-sample expert classification) reduced to 89 high-confidence candidates — 93% noise reduction. 2. **Expert triage**: LLM expert classified each block in context as CODE_REMOVE, CODE_KEEP, or NOT_CODE. | Classification | Count | |---------------|-------| | Removed (this PR) | 41 | | Kept (debug helpers, alt configs, reference impls) | 32 | | Not code (false positives) | 16 | Removed blocks include: superseded implementations, old test data, abandoned stubs, unreachable code, and buggy dead code. --- .../test/grouped_conv_fwd_multiple_d_v1.cpp | 4 - .../test/grouped_conv_fwd_multiple_d_v2.cpp | 4 - .../test/grouped_conv_fwd_multiple_d_v3.cpp | 4 - .../test/grouped_conv_fwd_multiple_d_v4.cpp | 4 - ...ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp | 10 - .../moe_gemm2_xdl_fp8.cpp | 3 - .../moe_gemm2_xdl_fp8_blockscale.cpp | 10 - ...norm2d_rdquant_fwd_bf16_n1024_instance.cpp | 8 - ...norm2d_rdquant_fwd_fp16_n1024_instance.cpp | 8 - .../smoothquant_bf16_n1024_instance.cpp | 8 - .../smoothquant_fp16_n1024_instance.cpp | 8 - .../moe_smoothquant_bf16_n1024_instance.cpp | 8 - .../moe_smoothquant_fp16_n1024_instance.cpp | 8 - include/ck/host_utility/flush_cache.hpp | 10 - ...n3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp | 7 - ...dlops_blockscale_b_preshuffle_selector.hpp | 26 - ...s_moe_blockscale_b_preshuffle_selector.hpp | 26 - ...roup_tensor_slice_transfer_direct_load.hpp | 6 - ...nsor_slice_transfer_gather_direct_load.hpp | 6 - ...d_contraction_multiple_d_wmma_cshuffle.hpp | 5 - ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 141 --- ...e_grouped_query_attention_forward_wmma.hpp | 141 --- ...ice_multi_query_attention_forward_wmma.hpp | 141 --- .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp | 6 - .../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 175 ---- .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 39 - ...wise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp | 39 - .../gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp | 39 - ...ridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp | 76 -- .../gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp | 39 - ..._gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp | 39 - ...m_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp | 39 - .../grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp | 39 - ...se_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp | 39 - .../gpu/grid/gridwise_moe_mx_gemm.hpp | 806 ------------------ .../gpu/grid/gridwise_moe_mx_gemm_bns.hpp | 44 - .../tensor_operation/gpu/warp/wmma_gemm.hpp | 12 +- ...ransform_contraction_to_gemm_arraybase.hpp | 5 - .../ck/utility/container_element_picker.hpp | 6 - include/ck/utility/dynamic_buffer.hpp | 6 - include/ck/utility/transpose_vectors.hpp | 17 - include/ck/utility/workgroup_barrier.hpp | 14 - .../core/arch/amd_buffer_addressing.hpp | 16 - .../arch/amd_buffer_addressing_builtins.hpp | 16 - include/ck_tile/core/container/array.hpp | 20 - include/ck_tile/core/container/sequence.hpp | 26 - .../container/statically_indexed_array.hpp | 14 - .../ck_tile/core/container/thread_buffer.hpp | 31 - include/ck_tile/core/container/tuple.hpp | 10 - include/ck_tile/core/numeric/half.hpp | 87 -- include/ck_tile/core/numeric/int8.hpp | 21 - include/ck_tile/core/tensor/sweep_tile.hpp | 4 - .../ck_tile/core/tensor/tile_distribution.hpp | 39 - .../unary_element_wise_operation.hpp | 63 -- .../flatmm/kernel/grouped_flatmm_kernel.hpp | 10 - .../ops/flatmm/kernel/moe_flatmm_kernel.hpp | 7 - ...mm_pipeline_agmem_bgmem_creg_v1_policy.hpp | 4 - ...ec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 9 - .../moe_flatmm_pipeline_agmem_bgmem_creg.hpp | 13 +- ...mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 4 - ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 3 - .../fused_moe/kernel/moe_sorting_kernel.hpp | 138 +-- .../pipeline/moe_sorting_pipeline.hpp | 8 - ...block_gemm_areg_bsmem_creg_one_warp_v1.hpp | 3 - ...gemm_areg_bsmem_creg_v1_default_policy.hpp | 23 - ...gemm_areg_bsmem_creg_v2_default_policy.hpp | 23 - ...gemm_asmem_breg_creg_v1_default_policy.hpp | 23 - .../norm_reduce/block/block_norm_reduce.hpp | 15 - .../ops/reduce/block/block_reduce2d.hpp | 26 - .../gpu/gemm_streamk.hpp | 60 -- ...e_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp | 6 - ..._streamk_f16_f16_f16_mk_kn_mn_instance.cpp | 3 - ...mm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp | 19 - ...gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp | 19 - profiler/src/profile_softmax.cpp | 7 - .../block_swizzle_test/block_swizzle_test.cpp | 12 +- ...norm2d_rdquant_fwd_bf16_n1024_instance.cpp | 8 - ...norm2d_rdquant_fwd_fp16_n1024_instance.cpp | 8 - .../moe_smoothquant_bf16_n1024_instance.cpp | 8 - .../moe_smoothquant_fp16_n1024_instance.cpp | 8 - .../smoothquant_bf16_n1024_instance.cpp | 8 - .../smoothquant_fp16_n1024_instance.cpp | 8 - 82 files changed, 22 insertions(+), 2883 deletions(-) diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp index 863501cd0a..9895ed7e54 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp @@ -198,10 +198,6 @@ struct Epilogue input_left_pads, input_right_pads); - // auto res = rtc::from_gpu(out_dev); - // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); - // assert(pass); - // Simple check: this checks that the output from each instance matches the output from the // first instance CHECK(report(solution, check(rtc::from_gpu(out_dev)))); diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp index e748a29743..617c2318d5 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp @@ -198,10 +198,6 @@ struct Epilogue input_left_pads, input_right_pads); - // auto res = rtc::from_gpu(out_dev); - // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); - // assert(pass); - // Simple check: this checks that the output from each instance matches the output from the // first instance CHECK(report(solution, check(rtc::from_gpu(out_dev)))); diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp index a68fb53cba..84516b2577 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp @@ -198,10 +198,6 @@ struct Epilogue input_left_pads, input_right_pads); - // auto res = rtc::from_gpu(out_dev); - // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); - // assert(pass); - // Simple check: this checks that the output from each instance matches the output from the // first instance CHECK(report(solution, check(rtc::from_gpu(out_dev)))); diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp index 0262319c39..3490c38f6a 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp @@ -198,10 +198,6 @@ struct Epilogue input_left_pads, input_right_pads); - // auto res = rtc::from_gpu(out_dev); - // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); - // assert(pass); - // Simple check: this checks that the output from each instance matches the output from the // first instance CHECK(report(solution, check(rtc::from_gpu(out_dev)))); diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp index b0b2d29d98..2ceca3c877 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp @@ -238,16 +238,6 @@ int main(int argc, char* argv[]) Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); -#if 0 - for(int n = 0; n < N; ++n) - { - for(int k = 0; k < K; ++k) - { - b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n)); - } - } -#endif - using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm expert_ids(HostTensorDescriptor({sorted_tile_num}, {1})); Tensor sorted_token_ids(HostTensorDescriptor({sorted_size}, {1})); Tensor max_token_id(HostTensorDescriptor({1})); - // max_token_id.mData[0] = valid_size; - // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13}; - // int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3}; max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8}; // int eids[] = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} for(int i = 0; i < sorted_tile_num; i++) diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp index 552d3cd7b5..8ae97ef1c2 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp @@ -261,16 +261,6 @@ int main(int argc, char* argv[]) Tensor max_token_id(HostTensorDescriptor({1})); max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8}; - // int eids[] = {0, 1, 3, 3, 3}; - // int eids[] = {0, 1, 2, 3, 4, 5, 6, 7}; //, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} - // int eids[] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 3, 3, 3}; - // int eids[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - // 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - // 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, - // 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, - // 7, 7, - // 3, 3, 3}; for(int i = 0; i < sorted_tile_num; i++) { expert_ids.mData[i] = i / ck::math::integer_divide_ceil(valid_tile_num, experts); diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp index 8f4813a47e..ca49114844 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd x 3p -#if 0 -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); - -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -#endif template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp index e357d7e3ac..f754d8e959 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd x 3p -#if 0 -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); - -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -#endif template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp index 8a5e0c74a0..66f427247a 100644 --- a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd 2p -#if 0 -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); - -template float smoothquant_>(const S&, A); -#endif template float smoothquant_>(const S&, A); template float smoothquant_>(const S&, A); diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp index 9c08cf64f0..103f7281b0 100644 --- a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd 2p -#if 0 -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); - -template float smoothquant_>(const S&, A); -#endif template float smoothquant_>(const S&, A); template float smoothquant_>(const S&, A); diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp index 8c72b81dc1..56fcca3beb 100644 --- a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd 2p -#if 0 -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); - -template float moe_smoothquant_>(const S&, A); -#endif template float moe_smoothquant_>(const S&, A); template float moe_smoothquant_>(const S&, A); diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp index 6d7a5e7c1f..2462cd218e 100644 --- a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd 2p -#if 0 -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); - -template float moe_smoothquant_>(const S&, A); -#endif template float moe_smoothquant_>(const S&, A); template float moe_smoothquant_>(const S&, A); diff --git a/include/ck/host_utility/flush_cache.hpp b/include/ck/host_utility/flush_cache.hpp index 2d051233e4..25084bae85 100644 --- a/include/ck/host_utility/flush_cache.hpp +++ b/include/ck/host_utility/flush_cache.hpp @@ -476,16 +476,6 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, hip_check_error(hipGetLastError()); // end real kernel - // hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); - // hip_check_error(hipEventSynchronize(stop)); - // float cur_time = 0; - // hip_check_error(hipEventElapsedTime(&cur_time, start, stop)); - // #if MEDIAN - // times.insert(cur_time); - // #else - // total_time += cur_time; - // #endif - #if !defined(CK_USE_WMMA) if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { diff --git a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp index fd50e61f32..7ccebaf35a 100644 --- a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp @@ -137,13 +137,6 @@ transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad( make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - // const auto out_grid_desc_gemmm_gemmn = transform_tensor_descriptor( - // out_n_do_ho_wo_k_grid_desc, - // make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)), - // make_pass_through_transform(K)), - // make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<3>{}), - // make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(in_grid_desc_gemmk0_gemmm_gemmk1, wei_grid_desc_gemmk0_gemmn_gemmk1, out_grid_desc_gemmm_gemmn); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_selector.hpp index 8df23454a2..41ca5916cb 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_selector.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_selector.hpp @@ -60,32 +60,6 @@ constexpr auto BlockGemmBlockScaleBPreshufflePipeline_Selector() NRepeat, KPack>{}; } -#if 0 - else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) - { - return BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v2< - BlkGemmPipeSche, - BlockSize, - ADataType, - BDataType, - ComputeDataType, - AccDataType, - ATileDesc, - BTileDesc, - AMmaTileDesc, - BMmaTileDesc, - ABlockTransferSrcScalarPerVector, - BBlockTransferSrcScalarPerVector, - MPerBlock, - NPerBlock, - KPerBlock, - MPerXDL, - NPerXDL, - MRepeat, - NRepeat, - KPack>{}; - } -#endif else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) { static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3"); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp index 199c729f53..96bf5e81b7 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp @@ -93,32 +93,6 @@ constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector() KPack>{}; } } -#if 0 - else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) - { - return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v2< - BlkGemmPipeSche, - BlockSize, - ADataType, - BDataType, - ComputeDataType, - AccDataType, - ATileDesc, - BTileDesc, - AMmaTileDesc, - BMmaTileDesc, - ABlockTransferSrcScalarPerVector, - BBlockTransferSrcScalarPerVector, - MPerBlock, - NPerBlock, - KPerBlock, - MPerXDL, - NPerXDL, - MRepeat, - NRepeat, - KPack>{}; - } -#endif else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) { static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3"); diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp index a31c9101a1..ade2839950 100644 --- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp @@ -144,12 +144,6 @@ struct ThreadGroupTensorSliceTransfer_DirectLoad "When loading more than one element per thread at once, the contiguous " "dimension must be the same between source and destination."); - // constexpr auto dword_bytes = 4; - // constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData); - // static_assert(bytes_per_thread_load == dword_bytes, - // "Direct load transfer requires each thread to load exactly a single " - // "DWORD of data."); - static_assert(nDim == remove_cvref_t::GetNumOfDimension() && nDim == remove_cvref_t::GetNumOfDimension() && nDim == ThreadClusterLengths::Size(), diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_gather_direct_load.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_gather_direct_load.hpp index 11043281ec..8c6e77bccd 100644 --- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_gather_direct_load.hpp +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_gather_direct_load.hpp @@ -152,12 +152,6 @@ struct ThreadGroupTensorSliceTransfer_Gather_DirectLoad "When loading more than one element per thread at once, the contiguous " "dimension must be the same between source and destination."); - // constexpr auto dword_bytes = 4; - // constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData); - // static_assert(bytes_per_thread_load == dword_bytes, - // "Direct load transfer requires each thread to load exactly a single " - // "DWORD of data."); - static_assert(nDim == remove_cvref_t::GetNumOfDimension() && nDim == remove_cvref_t::GetNumOfDimension() && nDim == ThreadClusterLengths::Size(), diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index c64f2c42f3..69d8eef80a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -737,11 +737,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle // Batch Offset ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; - - // for checking vector load/store - // index_t MRaw_; - // index_t NRaw_; - // index_t KRaw_; }; // Invoker diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 6b595c4dce..043adf5fc0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -1433,147 +1433,6 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle // TODO: properly implement this check return true; } -#if 0 - static bool IsSupportedArgument(const Argument& arg) - { - if(ck::is_gfx11_supported()) - { - if constexpr(!(is_same_v || is_same_v)) - { - printf("DeviceOp: Acc0 Type err"); - return false; - } - - if constexpr(!(is_same_v || is_same_v)) - { - printf("DeviceOp: Acc1 Type err"); - return false; - } - } - else - { - printf("DeviceOp: Arch err"); - return false; - } - - if(!GridwiseOp::CheckValidity(arg.a_grid_desc, - arg.b0_grid_desc, - arg.b1_grid_desc, - arg.c_grid_desc_m_n_, - arg.block_2_ctile_map_)) - { - return false; - } - - // Check if C permute dimension matches GEMM + GEMM shape - const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded - - if(!(c_g == arg.batch_count_)) - { - printf("DeviceOp: BatchCount err"); - return false; - } - - // Note: we need raw lengths since threadwise copy can not handle vector load when part of - // vector is out of bounds - // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O - const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0]; - const auto LzRaw = arg.raw_lengths_mz_lz_kz_nz_[1]; - const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2]; - const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3]; - - // Check scalar per vector requirement - const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; - const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; - const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? LzRaw : NzRaw; - const auto c_extent_lowest = NzRaw; - - if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && - b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && - b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && - c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) - { - printf("DeviceOp: Data Transfer Vector scalar err"); - return false; - } - - // Check vector load/store requirement - const auto a_stride_lowest = - ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0]; - const auto b0_stride_lowest = - B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0]; - const auto b1_stride_lowest = - B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0]; - const auto c_stride_lowest = arg.c_mz_nz_strides_[1]; - - if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || - c_stride_lowest == 1)) - { - printf("DeviceOp: Data Vectorize transfer err"); - return false; - } - - return true; - } - - // polymorphic - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - return IsSupportedArgument(*dynamic_cast(p_arg)); - } - - static auto MakeArgument( - const ADataType* p_a, - const B0DataType* p_b0, - const B1DataType* p_b1, - CDataType* p_c, - const std::array p_acc0_biases, - const std::array p_acc1_biases, - const std::array& a_gs_ms_ks_lengths, - const std::array& a_gs_ms_ks_strides, - const std::array& b0_gs_ls_ks_lengths, - const std::array& b0_gs_ls_ks_strides, - const std::array& b1_gs_ns_ls_lengths, - const std::array& b1_gs_ns_ls_strides, - const std::array& c_gs_ms_ns_lengths, - const std::array& c_gs_ms_ns_strides, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, - AElementwiseOperation a_element_op, - B0ElementwiseOperation b0_element_op, - AccElementwiseOperation acc_element_op, - B1ElementwiseOperation b1_element_op, - CElementwiseOperation c_element_op) - { - return Argument{p_a, - p_b0, - p_b1, - p_c, - p_acc0_biases, - p_acc1_biases, - a_gs_ms_ks_lengths, - a_gs_ms_ks_strides, - b0_gs_ls_ks_lengths, - b0_gs_ls_ks_strides, - b1_gs_ns_ls_lengths, - b1_gs_ns_ls_strides, - c_gs_ms_ns_lengths, - c_gs_ms_ns_strides, - acc0_biases_gs_ms_ls_lengths, - acc0_biases_gs_ms_ls_strides, - acc1_biases_gs_ms_ns_lengths, - acc1_biases_gs_ms_ns_strides, - 1, - 1, - a_element_op, - b0_element_op, - acc_element_op, - b1_element_op, - c_element_op}; - } -#endif // polymorphic std::unique_ptr MakeArgumentPointer( diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp index 6aa766ab5c..d1269c6d9a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp @@ -956,147 +956,6 @@ struct DeviceGroupedQueryAttentionForward_Wmma // TODO: properly implement this check return true; } -#if 0 - static bool IsSupportedArgument(const Argument& arg) - { - if(ck::is_gfx11_supported()) - { - if constexpr(!(is_same_v || is_same_v)) - { - printf("DeviceOp: Acc0 Type err"); - return false; - } - - if constexpr(!(is_same_v || is_same_v)) - { - printf("DeviceOp: Acc1 Type err"); - return false; - } - } - else - { - printf("DeviceOp: Arch err"); - return false; - } - - if(!GridwiseOp::CheckValidity(arg.a_grid_desc, - arg.b0_grid_desc, - arg.b1_grid_desc, - arg.c_grid_desc_m_n_, - arg.block_2_ctile_map_)) - { - return false; - } - - // Check if C permute dimension matches GEMM + GEMM shape - const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded - - if(!(c_g == arg.batch_count_)) - { - printf("DeviceOp: BatchCount err"); - return false; - } - - // Note: we need raw lengths since threadwise copy can not handle vector load when part of - // vector is out of bounds - // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O - const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0]; - const auto LzRaw = arg.raw_lengths_mz_lz_kz_nz_[1]; - const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2]; - const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3]; - - // Check scalar per vector requirement - const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; - const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; - const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? LzRaw : NzRaw; - const auto c_extent_lowest = NzRaw; - - if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && - b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && - b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && - c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) - { - printf("DeviceOp: Data Transfer Vector scalar err"); - return false; - } - - // Check vector load/store requirement - const auto a_stride_lowest = - ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0]; - const auto b0_stride_lowest = - B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0]; - const auto b1_stride_lowest = - B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0]; - const auto c_stride_lowest = arg.c_mz_nz_strides_[1]; - - if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || - c_stride_lowest == 1)) - { - printf("DeviceOp: Data Vectorize transfer err"); - return false; - } - - return true; - } - - // polymorphic - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - return IsSupportedArgument(*dynamic_cast(p_arg)); - } - - static auto MakeArgument( - const ADataType* p_a, - const B0DataType* p_b0, - const B1DataType* p_b1, - CDataType* p_c, - const std::array p_acc0_biases, - const std::array p_acc1_biases, - const std::array& a_gs_ms_ks_lengths, - const std::array& a_gs_ms_ks_strides, - const std::array& b0_gs_ls_ks_lengths, - const std::array& b0_gs_ls_ks_strides, - const std::array& b1_gs_ns_ls_lengths, - const std::array& b1_gs_ns_ls_strides, - const std::array& c_gs_ms_ns_lengths, - const std::array& c_gs_ms_ns_strides, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, - AElementwiseOperation a_element_op, - B0ElementwiseOperation b0_element_op, - AccElementwiseOperation acc_element_op, - B1ElementwiseOperation b1_element_op, - CElementwiseOperation c_element_op) - { - return Argument{p_a, - p_b0, - p_b1, - p_c, - p_acc0_biases, - p_acc1_biases, - a_gs_ms_ks_lengths, - a_gs_ms_ks_strides, - b0_gs_ls_ks_lengths, - b0_gs_ls_ks_strides, - b1_gs_ns_ls_lengths, - b1_gs_ns_ls_strides, - c_gs_ms_ns_lengths, - c_gs_ms_ns_strides, - acc0_biases_gs_ms_ls_lengths, - acc0_biases_gs_ms_ls_strides, - acc1_biases_gs_ms_ns_lengths, - acc1_biases_gs_ms_ns_strides, - 1, - 1, - a_element_op, - b0_element_op, - acc_element_op, - b1_element_op, - c_element_op}; - } -#endif // polymorphic std::unique_ptr MakeArgumentPointer( diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp index a303b6f808..a9d916c6a0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp @@ -948,147 +948,6 @@ struct DeviceMultiQueryAttentionForward_Wmma // TODO: properly implement this check return true; } -#if 0 - static bool IsSupportedArgument(const Argument& arg) - { - if(ck::is_gfx11_supported()) - { - if constexpr(!(is_same_v || is_same_v)) - { - printf("DeviceOp: Acc0 Type err"); - return false; - } - - if constexpr(!(is_same_v || is_same_v)) - { - printf("DeviceOp: Acc1 Type err"); - return false; - } - } - else - { - printf("DeviceOp: Arch err"); - return false; - } - - if(!GridwiseOp::CheckValidity(arg.a_grid_desc, - arg.b0_grid_desc, - arg.b1_grid_desc, - arg.c_grid_desc_m_n_, - arg.block_2_ctile_map_)) - { - return false; - } - - // Check if C permute dimension matches GEMM + GEMM shape - const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded - - if(!(c_g == arg.batch_count_)) - { - printf("DeviceOp: BatchCount err"); - return false; - } - - // Note: we need raw lengths since threadwise copy can not handle vector load when part of - // vector is out of bounds - // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O - const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0]; - const auto LzRaw = arg.raw_lengths_mz_lz_kz_nz_[1]; - const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2]; - const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3]; - - // Check scalar per vector requirement - const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; - const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; - const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? LzRaw : NzRaw; - const auto c_extent_lowest = NzRaw; - - if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && - b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && - b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && - c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) - { - printf("DeviceOp: Data Transfer Vector scalar err"); - return false; - } - - // Check vector load/store requirement - const auto a_stride_lowest = - ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0]; - const auto b0_stride_lowest = - B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0]; - const auto b1_stride_lowest = - B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0]; - const auto c_stride_lowest = arg.c_mz_nz_strides_[1]; - - if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || - c_stride_lowest == 1)) - { - printf("DeviceOp: Data Vectorize transfer err"); - return false; - } - - return true; - } - - // polymorphic - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - return IsSupportedArgument(*dynamic_cast(p_arg)); - } - - static auto MakeArgument( - const ADataType* p_a, - const B0DataType* p_b0, - const B1DataType* p_b1, - CDataType* p_c, - const std::array p_acc0_biases, - const std::array p_acc1_biases, - const std::array& a_gs_ms_ks_lengths, - const std::array& a_gs_ms_ks_strides, - const std::array& b0_gs_ls_ks_lengths, - const std::array& b0_gs_ls_ks_strides, - const std::array& b1_gs_ns_ls_lengths, - const std::array& b1_gs_ns_ls_strides, - const std::array& c_gs_ms_ns_lengths, - const std::array& c_gs_ms_ns_strides, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, - AElementwiseOperation a_element_op, - B0ElementwiseOperation b0_element_op, - AccElementwiseOperation acc_element_op, - B1ElementwiseOperation b1_element_op, - CElementwiseOperation c_element_op) - { - return Argument{p_a, - p_b0, - p_b1, - p_c, - p_acc0_biases, - p_acc1_biases, - a_gs_ms_ks_lengths, - a_gs_ms_ks_strides, - b0_gs_ls_ks_lengths, - b0_gs_ls_ks_strides, - b1_gs_ns_ls_lengths, - b1_gs_ns_ls_strides, - c_gs_ms_ns_lengths, - c_gs_ms_ns_strides, - acc0_biases_gs_ms_ls_lengths, - acc0_biases_gs_ms_ls_strides, - acc1_biases_gs_ms_ns_lengths, - acc1_biases_gs_ms_ns_strides, - 1, - 1, - a_element_op, - b0_element_op, - acc_element_op, - b1_element_op, - c_element_op}; - } -#endif // polymorphic std::unique_ptr MakeArgumentPointer( diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp index d66679a318..76f0b5a893 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp @@ -464,12 +464,6 @@ struct GridwiseGemmMultipleD_xdl_cshuffle return false; } - // check block-to-E-tile - // if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n)) - //{ - // return false; - //} - // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) // check tensor size: cannot be larger than 2GB each constexpr long_index_t TwoGB = (long_index_t{1} << 31); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp index 4b679adc8d..2252ebf980 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp @@ -351,74 +351,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 make_tuple(Sequence<0, 2>{}, Sequence<1>{})); return a_grid_desc_ak0_m_ak1; -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MKPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad both M and K - const auto a_grid_desc_m_k = - transform_tensor_descriptor(a_grid_desc_mraw_kraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(K, KPad - K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( - a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), - make_pass_through_transform(MPad)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - - return a_grid_desc_ak0_m_ak1; - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MNPadding) - { - // pad M, but not K - const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( - a_grid_desc_mraw_kraw, - make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), - make_right_pad_transform(M, MPad - M)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - - return a_grid_desc_ak0_m_ak1; - } - else if constexpr(GemmSpec == GemmSpecialization::KPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad K, but not M - const auto a_grid_desc_m_k = transform_tensor_descriptor( - a_grid_desc_mraw_kraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( - a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), - make_pass_through_transform(M)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - - return a_grid_desc_ak0_m_ak1; - } - else - { - // not pad M or K - const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( - a_grid_desc_mraw_kraw, - make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), - make_pass_through_transform(M)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - - return a_grid_desc_ak0_m_ak1; - } -#endif } __device__ static auto MakeBGridDescriptor_BK0_N_BK1( @@ -451,74 +383,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 make_tuple(Sequence<0, 2>{}, Sequence<1>{})); return b_grid_desc_bk0_n_bk1; -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::NKPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad both N and K - const auto b_grid_desc_n_k = - transform_tensor_descriptor(b_grid_desc_nraw_kraw, - make_tuple(make_right_pad_transform(N, NPad - N), - make_right_pad_transform(K, KPad - K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( - b_grid_desc_n_k, - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), - make_pass_through_transform(NPad)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - - return b_grid_desc_bk0_n_bk1; - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::MNPadding) - { - // pad N, but not K - const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( - b_grid_desc_nraw_kraw, - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - - return b_grid_desc_bk0_n_bk1; - } - else if constexpr(GemmSpec == GemmSpecialization::KPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad K, but not N - const auto b_grid_desc_n_k = transform_tensor_descriptor( - b_grid_desc_nraw_kraw, - make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( - b_grid_desc_n_k, - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), - make_pass_through_transform(N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - - return b_grid_desc_bk0_n_bk1; - } - else - { - // not pad N or K - const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( - b_grid_desc_nraw_kraw, - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), - make_pass_through_transform(N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - - return b_grid_desc_bk0_n_bk1; - } -#endif } template @@ -559,45 +423,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 make_right_pad_transform(N, NPad - N)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MNPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad M and N - return transform_tensor_descriptor(c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad M, but not N - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad N, but not M - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - // not pad M or N - return c_grid_desc_mraw_nraw; - } -#endif } struct Problem diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 5c5eb9405f..d926efab84 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -682,45 +682,6 @@ struct GridwiseGemm_xdl_cshuffle_v3 make_right_pad_transform(N, NPad - N)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MNPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad M and N - return transform_tensor_descriptor(c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad M, but not N - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad N, but not M - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - // not pad M or N - return c_grid_desc_mraw_nraw; - } -#endif } struct Problem diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp index 7f1a42fb26..a81679ea78 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp @@ -613,45 +613,6 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle make_right_pad_transform(N, NPad - N)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MNPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad M and N - return transform_tensor_descriptor(c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad M, but not N - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad N, but not M - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - // not pad M or N - return c_grid_desc_mraw_nraw; - } -#endif } struct Problem diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp index daa4fd2e8a..f9be9e494b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp @@ -568,45 +568,6 @@ struct GridwiseGemm_xdl_cshuffle_v3 make_right_pad_transform(N, NPad - N)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MNPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad M and N - return transform_tensor_descriptor(c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad M, but not N - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad N, but not M - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - // not pad M or N - return c_grid_desc_mraw_nraw; - } -#endif } struct Problem diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp index f018730300..529248093b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp @@ -806,58 +806,6 @@ struct GridwiseGemm_xdl_cshuffle_v3 index_t b_k_split_offset; }; -#if 0 - struct SplitKBatchOffsetMultiABD - { - __device__ SplitKBatchOffsetMultiABD(AsGridPointer& p_as_grid, - BsGridPointer& p_bs_grid, - Argument& karg) - { - static_for<0, NumATensor, 1>{}([&](auto i) { - using ALayout_ = remove_cvref_t>; - if constexpr(is_same_v) - { - as_k_split_offset[i] = blockIdx.z * karg.KRead; - } - else if constexpr(is_same_v) - { - as_k_split_offset[i] = blockIdx.z * karg.KRead * karg.StrideAs[i]; - } - - p_as_grid_(i) = p_as_grid[i] + as_k_split_offset[i]; - }); - - static_for<0, NumBTensor, 1>{}([&](auto i) { - using BLayout_ = remove_cvref_t>; - if constexpr(is_same_v) - { - bs_k_split_offset[i] = blockIdx.z * karg.KRead * karg.StrideBs[i]; - } - else if constexpr(is_same_v) - { - bs_k_split_offset[i] = blockIdx.z * karg.KRead; - } - - p_bs_grid_(i) = p_bs_grid[i] + bs_k_split_offset[i]; - }); - - if(blockIdx.z < static_cast(karg.KBatch - 1)) - { - karg.K = karg.KRead; - } - else - { - karg.K = karg.K - karg.KRead * (karg.KBatch - 1); - } - } - - AsGridPointer p_as_grid_; - BsGridPointer p_bs_grid_; - std::array as_k_split_offset; - std::array bs_k_split_offset; - }; -#endif - using BlockwiseGemmPipe = remove_cvref_t< decltype(BlockGemmPipeline_Selector< BlkGemmPipelineVer, @@ -1129,10 +1077,6 @@ struct GridwiseGemm_xdl_cshuffle_v3 // BsGridPointer p_bs_grid; // DsGridPointer p_ds_grid; - // const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1( - // problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0); - // const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1( - // problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0); const auto as_grid_desc_ak0_m_ak1 = MakeAsGridDescriptor_AK0_M_AK1( problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideAs, problem.AK0); const auto bs_grid_desc_bk0_n_bk1 = MakeBsGridDescriptor_BK0_N_BK1( @@ -1147,22 +1091,10 @@ struct GridwiseGemm_xdl_cshuffle_v3 const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N( problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs); -#if 0 - static_for<0, NumDTensor, 1>{}([&](auto j) { - ds_grid_desc_m_n(j) = MakeCGridDescriptor_M_N( - problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs[j]); - }); -#endif - const auto ds_grid_desc_mblock_mperblock_nblock_nperblock = MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( ds_grid_desc_m_n, problem.MBlock, problem.NBlock); - // const auto a_grid_buf = make_dynamic_buffer( - // p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - // const auto b_grid_buf = make_dynamic_buffer( - // p_bs_grid[I0], b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - const auto as_grid_buf = generate_tuple( [&](auto i) { return make_dynamic_buffer( @@ -1406,10 +1338,6 @@ struct GridwiseGemm_xdl_cshuffle_v3 const BElementwiseOperation& b_element_op, const CElementwiseOperation& c_element_op) { - // const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1( - // problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0); - // const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1( - // problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0); const auto as_grid_desc_ak0_m_ak1 = MakeAsGridDescriptor_AK0_M_AK1( problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideAs, problem.AK0); const auto bs_grid_desc_bk0_n_bk1 = MakeBsGridDescriptor_BK0_N_BK1( @@ -1428,10 +1356,6 @@ struct GridwiseGemm_xdl_cshuffle_v3 MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( ds_grid_desc_m_n, problem.MBlock, problem.NBlock); - // const auto a_grid_buf = make_dynamic_buffer( - // p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - // const auto b_grid_buf = make_dynamic_buffer( - // p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); const auto as_grid_buf = generate_tuple( [&](auto i) { return make_dynamic_buffer( diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp index a3dffed09d..671cfe4967 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp @@ -642,45 +642,6 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3 make_right_pad_transform(N, NPad - N)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MNPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad M and N - return transform_tensor_descriptor(c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad M, but not N - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad N, but not M - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - // not pad M or N - return c_grid_desc_mraw_nraw; - } -#endif } __host__ __device__ static auto MakeDsGridDescriptor_M_N( diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp index 36895f55ea..54260d4386 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp @@ -558,45 +558,6 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3 make_right_pad_transform(N, NPad - N)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MNPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad M and N - return transform_tensor_descriptor(c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad M, but not N - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad N, but not M - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - // not pad M or N - return c_grid_desc_mraw_nraw; - } -#endif } __host__ __device__ static auto MakeDsGridDescriptor_M_N( diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp index e810a467e7..28bcf14cd0 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp @@ -609,45 +609,6 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle make_right_pad_transform(N, NPad - N)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MNPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad M and N - return transform_tensor_descriptor(c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad M, but not N - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad N, but not M - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - // not pad M or N - return c_grid_desc_mraw_nraw; - } -#endif } __host__ __device__ static auto MakeDsGridDescriptor_M_N( diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp index d2dd1d243c..fa231c9b02 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp @@ -669,45 +669,6 @@ struct GridwiseGemmMX_xdl_cshuffle_v3 make_right_pad_transform(N, NPad - N)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MNPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad M and N - return transform_tensor_descriptor(c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad M, but not N - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad N, but not M - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - // not pad M or N - return c_grid_desc_mraw_nraw; - } -#endif } struct Problem diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp index 88f5dd44f3..43a46d6ff4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp @@ -696,45 +696,6 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle make_right_pad_transform(N, NPad - N)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); -#if 0 - using GemmSpecialization = tensor_operation::device::GemmSpecialization; - - if constexpr(GemmSpec == GemmSpecialization::MNPadding || - GemmSpec == GemmSpecialization::MNKPadding) - { - // pad M and N - return transform_tensor_descriptor(c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), - make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::MPadding || - GemmSpec == GemmSpecialization::MKPadding) - { - // pad M, but not N - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else if constexpr(GemmSpec == GemmSpecialization::NPadding || - GemmSpec == GemmSpecialization::NKPadding) - { - // pad N, but not M - return transform_tensor_descriptor( - c_grid_desc_mraw_nraw, - make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - // not pad M or N - return c_grid_desc_mraw_nraw; - } -#endif } struct Problem diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp index 79e3a44660..d1d136bcc8 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp @@ -30,48 +30,6 @@ namespace ck { // 2. Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds // buffer when we declare __shared__ inside blkgemmpipe -#if 0 -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS -__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy) -#endif - // __attribute__((amdgpu_waves_per_eu(1, 1))) - kernel_moe_mxgemm(typename GridwiseGemm::Argument karg) -{ -#if defined(__gfx9__) - if constexpr(GridwiseGemm::template IsValidCompilationParameter()) - { - __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - - auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z); - - GridwiseGemm::template Run( - karg.p_sorted_token_ids, - karg.p_sorted_expert_ids, - karg.p_max_token_id, - karg.p_a_grid + splitk_batch_offset.a_k_split_offset, - karg.p_a_scale_grid + splitk_batch_offset.a_k_split_offset, - karg.p_b_grid + splitk_batch_offset.b_k_split_offset, - karg.p_b_scale_grid + splitk_batch_offset.b_k_split_offset, - karg.p_ds_grid, - karg.p_c_grid, - p_shared, - karg, - karg.a_element_op, - karg.b_element_op, - karg.c_element_op); - } -#else - ignore = karg; -#endif // end of if (defined(__gfx9__)) -} -#endif - template , "A/B ElementwiseOperation should be PassThrough as load_to_lds is used!"); -#if 0 - template - __device__ static void Run(const index_t* p_sorted_token_ids, - const index_t* p_sorted_expert_ids, - const index_t* p_max_token_id, - const ADataType* p_a_grid, - const AScaleDataType* p_a_scale_grid, - const BDataType* p_b_grid, - const BScaleDataType* p_b_scale_grid, - DsGridPointer& p_ds_grid, - CDataType* p_c_grid, - void* p_shared, - const Problem& problem, - AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op) - { - ignore = a_element_op; - ignore = b_element_op; - const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1( - IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK, - problem.MPadded, - problem.K, - problem.KPadded, - problem.StrideA, - problem.AK0); - const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1( - problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0); - const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( - IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens, - problem.MPadded, - problem.N, - problem.NPadded, - problem.StrideC); - - const auto a_scale_grid_desc_am_ak = make_naive_tensor_descriptor_packed( - make_tuple(problem.M / (MXdlPack * MPerXdl), - math::integer_divide_ceil(problem.K, (ScaleBlockSize / APackedSize)) / - (KXdlPack * 64 / MPerXdl), - 64 * KXdlPack * MXdlPack / scale_pack_size_a)); - - const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor_packed( - make_tuple(problem.N / (NXdlPack * NPerXdl), - math::integer_divide_ceil(problem.K, (ScaleBlockSize / BPackedSize)) / - (KXdlPack * 64 / NPerXdl), - 64 * KXdlPack * NXdlPack / scale_pack_size_b)); - - const auto c_grid_desc_mblock_mperblock_nblock_nperblock = - MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - c_grid_desc_m_n, problem.MBlock, problem.NBlock); - - const index_t max_token_id = __builtin_amdgcn_readfirstlane(p_max_token_id[0]); - const index_t expert_block_id = NSwizzle ? blockIdx.x / problem.NBlock : blockIdx.y; - if(expert_block_id * MPerBlock >= max_token_id) - return; - const index_t expert_id = - __builtin_amdgcn_readfirstlane(p_sorted_expert_ids[expert_block_id]); - - const auto block_mn = [&]() -> std::pair { - if constexpr(NSwizzle) - { - const index_t ecnt_prefix = p_max_token_id[1 + expert_id]; - const index_t prefix_block = ecnt_prefix * problem.NBlock; - const index_t ecnt = p_max_token_id[2 + expert_id] - ecnt_prefix; - const index_t expert_swizzle = - ecnt > 0 ? ecnt : 1; // p_max_token_id[expert_id + 1]; // 2 - const index_t bid_new = blockIdx.x - prefix_block; - const index_t nid = __builtin_amdgcn_readfirstlane( - bid_new % 8 + bid_new / (8 * expert_swizzle) * 8); - const index_t mid = - __builtin_amdgcn_readfirstlane(ecnt_prefix + bid_new / 8 % expert_swizzle); - return {nid, mid}; - } - else - { - return {blockIdx.x, blockIdx.y}; - } - }(); - - const index_t block_n_id = block_mn.first; - const index_t block_m_id = block_mn.second; - const index_t token0 = - __builtin_amdgcn_readfirstlane(p_sorted_token_ids[block_m_id * MPerBlock] & 0xffffff); - - // constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1); - constexpr auto AMThreads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1); - constexpr auto AK0Threads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0); - constexpr auto AK1Threads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I2); - constexpr auto AKThreads = AK0Threads * AK1Threads; - constexpr auto AMRepeats = MPerBlock / AMThreads; - const index_t token_pos = block_m_id * MPerBlock + threadIdx.x / AKThreads * AMRepeats; - - if(token_pos >= max_token_id || token0 >= problem.NumTokens) - return; - StaticallyIndexedArray gather_offsets; - static_for<0, AMRepeats, 1>{}([&](auto m0) { - const index_t fused_token = p_sorted_token_ids[token_pos + m0]; - index_t token_offset = fused_token & 0xffffff; - if constexpr(!IsInputGemm) - { - token_offset = token_offset * problem.TopK + (fused_token >> 24); - } - gather_offsets(m0) = static_cast(token_offset); - }); - - const long_index_t expert_stride = - __builtin_amdgcn_readfirstlane(static_cast(problem.N) * problem.K * (IsInputGemm ? 2 : 1)); - const long_index_t expert_scale_stride = __builtin_amdgcn_readfirstlane( - static_cast(problem.N) * (IsInputGemm ? 2 : 1) * - math::integer_divide_ceil(problem.K, ScaleBlockSize / BPackedSize)); - - // N0, K0, Blocksize*KPack - const index_t n_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock); - - // Gride buffer creation - const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( - p_b_grid + static_cast(expert_id) * expert_stride, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - - // A, B scale buffer - const auto a_scale_grid_buf = make_dynamic_buffer( - p_a_scale_grid, a_scale_grid_desc_am_ak.GetElementSpaceSize()); - const auto b_scale_grid_buf = make_dynamic_buffer( - p_b_scale_grid + (static_cast(expert_id) * expert_scale_stride) / sizeof(BScaleDataType), - b_scale_grid_desc_bn_ak.GetElementSpaceSize()); - - // lds max alignment - constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number); - - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); - - // A matrix blockwise direct to LDS copy - auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_Gather_DirectLoad< - ThisThreadBlock, - Sequence, - ABlockTransferThreadClusterLengths_AK0_M_AK1, - ABlockTransferThreadClusterArrangeOrder, - ADataType, - ADataType, - decltype(a_grid_desc_ak0_m_ak1), - decltype(a_block_desc_ak0_m_ak1), - ABlockTransferSrcAccessOrder, - ABlockTransferSrcVectorDim, - 2, - ABlockTransferSrcScalarPerVector, - IndexType, - 1>(a_grid_desc_ak0_m_ak1, - make_multi_index(0, 0, 0), - a_block_desc_ak0_m_ak1, - make_multi_index(0, 0, 0), - gather_offsets); - - // B matrix blockwise copy - auto b_blockwise_copy = - ThreadGroupTensorSliceTransfer_DirectLoad, - BBlockTransferThreadClusterLengths_BK0_N_BK1, - BBlockTransferThreadClusterArrangeOrder, - BDataType, - BDataType, - decltype(b_grid_desc_bk0_n_bk1), - decltype(b_block_desc_bk0_n_bk1), - BBlockTransferSrcAccessOrder, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector>( - b_grid_desc_bk0_n_bk1, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_block_desc_bk0_n_bk1, - make_multi_index(0, 0, 0)); - - // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_space_size_aligned = math::integer_least_multiple( - a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); - - // Cast after lds - auto a_block_buf = make_dynamic_buffer( - static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); - - auto b_block_buf = make_dynamic_buffer( - reinterpret_cast(static_cast(p_shared) + - a_block_space_size_aligned * sizeof(ADataType)), - b_block_desc_bk0_n_bk1.GetElementSpaceSize()); - - constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0); - - // Blockwise GEMM pipeline - static_assert(std::is_default_constructible_v); - auto blockwise_gemm_pipeline = BlockwiseGemmPipe{}; - auto c_thread_buf = blockwise_gemm_pipeline.GetCThreadBuffer(); - decltype(c_thread_buf) c_thread_buf_up; - - StaticBufferTupleOfVector - c_thread_buf_fp32; - - const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( - (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / - KPerBlock); - - // a and b scale processing - const auto wave_idx = BlockwiseGemmPipe::GetWaveIdx(); - const auto waveId_m = wave_idx[I0]; - const auto waveId_n = wave_idx[I1]; - - auto thread_offset_shuffled = - get_thread_local_1d_id() % BlockwiseGemmPipe::WaveSize * KXdlPack * MXdlPack; - - auto a_thread_offset_m = waveId_m; - - auto a_scale_thread_copy = ThreadwiseTensorSliceTransfer_v2< - AScaleDataType, - AScaleDataType, - decltype(a_scale_grid_desc_am_ak), - decltype(BlockwiseGemmPipe::a_scale_thread_desc), - Sequence<1, 1, KXdlPack * MXdlPack / scale_pack_size_a>, // SliceLengths - Sequence<0, 1, 2>, // DimAccessOrder - 2, // SrcVectorDim - KXdlPack * MXdlPack / scale_pack_size_a, // SrcScalarPerVector - 1, // SrcScalarStrideInVector - true>(a_scale_grid_desc_am_ak, - make_multi_index(block_m_id * MPerBlock / MPerXdl / MXdlPack + a_thread_offset_m, - 0, - thread_offset_shuffled / scale_pack_size_a)); - - // B scale load - auto b_thread_offset_n = waveId_n; - - auto b_scale_thread_copy = ThreadwiseTensorSliceTransfer_v2< - BScaleDataType, - BScaleDataType, - decltype(b_scale_grid_desc_bn_ak), - decltype(BlockwiseGemmPipe::b_scale_thread_desc), - Sequence<1, 1, KXdlPack * NXdlPack / scale_pack_size_b>, // SliceLengths - Sequence<0, 1, 2>, // DimAccessOrder - 2, // SrcVectorDim - KXdlPack * NXdlPack / scale_pack_size_b, // SrcScalarPerVector - 1, // SrcScalarStrideInVector - true>(b_scale_grid_desc_bn_ak, - make_multi_index(block_n_id * NPerBlock / NPerXdl / NXdlPack + b_thread_offset_n, - 0, - thread_offset_shuffled / scale_pack_size_b)); - - if constexpr(IsInputGemm) - { - constexpr auto b_block_space_size_aligned = math::integer_least_multiple( - b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); - auto b_block_buf_up = make_dynamic_buffer( - reinterpret_cast(static_cast(p_shared) + - a_block_space_size_aligned * sizeof(ADataType) + - b_block_space_size_aligned * sizeof(BDataType)), - b_block_desc_bk0_n_bk1.GetElementSpaceSize()); - - const BDataType* p_b_grid_up = p_b_grid + expert_stride / 2; - const auto b_grid_buf_up = make_dynamic_buffer( - p_b_grid_up + static_cast(expert_id) * expert_stride, - b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - - auto b_blockwise_copy_up = ThreadGroupTensorSliceTransfer_DirectLoad< - ThisThreadBlock, - Sequence, - BBlockTransferThreadClusterLengths_BK0_N_BK1, - BBlockTransferThreadClusterArrangeOrder, - BDataType, - BDataType, - decltype(b_grid_desc_bk0_n_bk1), - decltype(b_block_desc_bk0_n_bk1), - BBlockTransferSrcAccessOrder, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector>(b_grid_desc_bk0_n_bk1, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_block_desc_bk0_n_bk1, - make_multi_index(0, 0, 0)); - - const BScaleDataType* p_b_scale_grid_up = - p_b_scale_grid + expert_scale_stride / 2 / sizeof(BScaleDataType); - const auto b_scale_grid_buf_up = make_dynamic_buffer( - p_b_scale_grid_up + static_cast(expert_id) * expert_scale_stride / sizeof(BScaleDataType), - b_scale_grid_desc_bn_ak.GetElementSpaceSize()); - - auto b_scale_thread_copy_up = ThreadwiseTensorSliceTransfer_v2< - BScaleDataType, - BScaleDataType, - decltype(b_scale_grid_desc_bn_ak), - decltype(BlockwiseGemmPipe::b_scale_thread_desc), - Sequence<1, 1, KXdlPack * NXdlPack / scale_pack_size_b>, // SliceLengths - Sequence<0, 1, 2>, // DimAccessOrder - 2, // SrcVectorDim - KXdlPack * MXdlPack / scale_pack_size_b, // SrcScalarPerVector - 1, // SrcScalarStrideInVector - true>( - b_scale_grid_desc_bn_ak, - make_multi_index(block_n_id * NPerBlock / NPerXdl / NXdlPack + b_thread_offset_n, - 0, - thread_offset_shuffled / scale_pack_size_b)); - - blockwise_gemm_pipeline.template Run( - // A - a_grid_desc_ak0_m_ak1, - a_block_desc_ak0_m_ak1, - a_blockwise_copy, - a_grid_buf, - a_block_buf, - a_block_slice_copy_step, - // Gate and Up - b_grid_desc_bk0_n_bk1, - b_block_desc_bk0_n_bk1, - b_blockwise_copy, - b_blockwise_copy_up, - b_grid_buf, - b_grid_buf_up, - b_block_buf, - b_block_buf_up, - b_block_slice_copy_step, - // C - c_thread_buf, - c_thread_buf_up, - // A scale - a_scale_grid_desc_am_ak, - a_scale_thread_copy, - a_scale_grid_buf, - // Gate and Up scale - b_scale_grid_desc_bn_ak, - b_scale_thread_copy, - b_scale_thread_copy_up, - b_scale_grid_buf, - b_scale_grid_buf_up, - num_k_block_main_loop); - } - else - { - blockwise_gemm_pipeline.template Run( - a_grid_desc_ak0_m_ak1, // A - a_block_desc_ak0_m_ak1, - a_blockwise_copy, - a_grid_buf, - a_block_buf, - a_block_slice_copy_step, - b_grid_desc_bk0_n_bk1, // B - b_block_desc_bk0_n_bk1, - b_blockwise_copy, - b_grid_buf, - b_block_buf, - b_block_slice_copy_step, - c_thread_buf, // C - a_scale_grid_desc_am_ak, // A scale - a_scale_thread_copy, - a_scale_grid_buf, - b_scale_grid_desc_bn_ak, // B scale - b_scale_thread_copy, - b_scale_grid_buf, - num_k_block_main_loop); - } - - // shuffle C and write out - { - static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && - NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, - "wrong!"); - static_assert(CShuffleMXdlPerWavePerShuffle % MXdlPack == 0 && - CShuffleNXdlPerWavePerShuffle % NXdlPack == 0, - "wrong!"); - - constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); - constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); - - // TODO: hacky, fix it! - constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = - blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3(); - - // TODO: hacky, fix it! - // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths - constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = - blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3(); - - constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); - constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); - constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); - constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); - constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); - constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); - constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); - constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); - constexpr auto M5 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I8); - constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I9); - - // mul scales - static_assert(M0 * M1 * M2 * M3 * M4 * M5 == MPerBlock); - static_assert(M5 == 4); - const index_t m1 = get_warp_local_1d_id() / NWave; // Mwave id - const index_t m4 = threadIdx.x % get_warp_size() / MPerXdl; - - vector_type topk_weights; // for gemm2 only - static_for<0, NXdlPerWave / NXdlPack, 1>{}([&](auto n0) { - static_for<0, NXdlPack, 1>{}([&](auto inxdl) { // NXdlPack - static_for<0, MXdlPerWave / MXdlPack, 1>{}([&](auto m0) { // MXDLPerWave - static_for<0, MXdlPack, 1>{}([&](auto imxdl) { // MXdlPack - static_for<0, M3, 1>{}([&](auto m3) { // m_inst_num_groups_per_blk - const index_t m_pos = block_m_id * MPerBlock + - m0 * M2 * M1 * M3 * M4 * M5 + - m1 * M2 * M3 * M4 * M5 + - imxdl * M3 * M4 * M5 + m3 * M4 * M5 + m4 * M5; - if constexpr(MulRoutedWeight) - { - topk_weights = - *c_style_pointer_cast*>( - p_ds_grid[I2] + m_pos); - } - static_for<0, M5, 1>{}([&](auto m5) { // m_inst_group_size - constexpr index_t c_offset = - blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset( - make_tuple(m0, n0, imxdl, inxdl, m3 * M5 + m5)); - constexpr auto cidx = Number{}; - - if constexpr(IsInputGemm) // gu fusion - { - if constexpr(ActivationOperation == - Activation::silu_and_mul) - { - float gate = c_thread_buf[cidx]; - float up = c_thread_buf_up[cidx]; - if constexpr(MulRoutedWeight) - { - gate = gate * topk_weights.AsType()[m5]; - up = up * topk_weights.AsType()[m5]; - } - tensor_operation::element_wise::Silu{}(gate, gate); - c_thread_buf_fp32(cidx) = gate * up; - } - else if(ActivationOperation == Activation::gelu_and_mul) - { - float gate = c_thread_buf[cidx]; - float up = c_thread_buf_up[cidx]; - if constexpr(MulRoutedWeight) - { - gate = gate * topk_weights.AsType()[m5]; - up = up * topk_weights.AsType()[m5]; - } - tensor_operation::element_wise::Gelu{}(gate, gate); - c_thread_buf_fp32(cidx) = gate * up; - - /*float gate = c_thread_buf[cidx]; - float up = c_thread_buf_up[cidx]; - if constexpr(MulRoutedWeight) - { - gate = gate * topk_weights.AsType()[m5]; - //up = up * topk_weights.AsType()[m5]; - } - tensor_operation::element_wise::Gelu{}(gate, gate); - c_thread_buf_fp32(cidx) = up;*/ - } - } - else - { - c_thread_buf_fp32(cidx) = c_thread_buf[cidx]; - if constexpr(MulRoutedWeight) - { - c_thread_buf_fp32(cidx) = - topk_weights.AsType()[m5] * - c_thread_buf_fp32[cidx]; - } - } - }); - }); - }); - }); - }); - }); - - constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = - GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); - - auto c_shuffle_block_buf = make_dynamic_buffer( - static_cast(p_shared), - c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - - constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( - c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, - make_tuple( - make_freeze_transform(I0), - make_unmerge_transform(make_tuple( - Number{}, // M0 (MXdlPerWave) - // per shuffle - M1, // M1 = MWave - M2, // M2 = MXdlPack - M3, // M3 * M4 * M5 = MPerXdl - M4, - M5)), - make_freeze_transform(I0), - make_unmerge_transform(make_tuple( - Number{}, // N0 (NXdlPerWave) - // per shuffle - N1, // N1 = NWave - N2, // N2 = NXdlPack - N3))), // N3 = NPerXdl - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<>{}, - Sequence<0, 2, 4, 6, 7, 8>{}, - Sequence<>{}, - Sequence<1, 3, 5, 9>{})); - - // calculate origin of thread output tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = - blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); - - const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; - const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; - - const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4, M5))), - make_tuple(Sequence<0, 1, 2, 3, 4, 5>{}), - make_tuple(Sequence<0>{})); - - const auto m_thread_data_on_block_idx = - m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( - make_multi_index(m_thread_data_on_block)); - - const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(N0, N1, N2, N3))), - make_tuple(Sequence<0, 1, 2, 3>{}), - make_tuple(Sequence<0>{})); - - const auto n_thread_data_on_block_idx = - n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( - make_multi_index(n_thread_data_on_block)); - - // shuffle: threadwise copy C from VGPR to LDS - auto c_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3< - AccDataType, - CShuffleDataType, - decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2), - decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2), - ck::tensor_operation::element_wise::PassThrough, - Sequence, - Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, - 9, - 1, - InMemoryDataOperationEnum::Set, - 1, - true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, - make_multi_index(0, - 0, - m_thread_data_on_block_idx[I1], - n_thread_data_on_block_idx[I1], - m_thread_data_on_block_idx[I2], - n_thread_data_on_block_idx[I2], - m_thread_data_on_block_idx[I3], - m_thread_data_on_block_idx[I4], - m_thread_data_on_block_idx[I5], - n_thread_data_on_block_idx[I3]), - ck::tensor_operation::element_wise::PassThrough{}}; - - using EDataType = CDataType; - - const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N( - problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs); - - const auto ds_grid_desc_mblock_mperblock_nblock_nperblock = - MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - ds_grid_desc_m_n, problem.MBlock, problem.NBlock); - - const auto ds_grid_buf = generate_tuple( - [&](auto i) { - return make_dynamic_buffer( - p_ds_grid[i], ds_grid_desc_m_n[i].GetElementSpaceSize()); - }, - Number{}); - - // tuple of reference to C/Ds tensor descriptors - const auto c_ds_desc_refs = concat_tuple_of_reference( - tie(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), - generate_tie([&](auto i) -> const auto& // return type should be reference - { return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; }, - Number{})); - - // tuple of reference to C/Ds tensor descriptors - const auto c_ds_buf_refs = concat_tuple_of_reference( - tie(c_shuffle_block_buf), - generate_tie([&](auto i) -> const auto& // return type should be reference - { return ds_grid_buf[i]; }, - Number{})); - - // tuple of starting index of C/Ds blockwise copy - const auto idx_c_ds_block_begin = - container_concat(make_tuple(make_multi_index(0, 0, 0, 0)), - generate_tuple( - [&](auto) { - return make_multi_index(block_m_id, 0, block_n_id, 0); - // return make_multi_index(block_work_idx[I0], 0, - // block_work_idx[I1], 0); - }, - Number{})); - - const auto e_grid_desc_mblock_mperblock_nblock_nperblock = - c_grid_desc_mblock_mperblock_nblock_nperblock; - - using CDEBlockTransferCluster = - CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock; - const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation; - constexpr index_t scatter_weight_idx = 3; // hack fix felix - auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3_scatter< - ThisThreadBlock, - decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), - Tuple, - decltype(c_ds_desc_refs), - decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)), - CElementwiseOperation, - Sequence(EGlobalMemoryDataOperation)>, // FIXME: make - // Sequence support - // arbitray type - Sequence<1, - CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, - 1, - CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, - CDEBlockTransferCluster, - Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, - Sequence<0, 1, 2, 3>, // typename SrcDimAccessOrder, - Sequence<0, 1, 2, 3>, // typename DstDimAccessOrder, - 3, // index_t SrcVectorDim, - 3, // index_t DstVectorDim, - CDEShuffleBlockTransferScalarPerVectors, - CShuffleBlockTransferScalarPerVector_NPerBlock, - sequence_merge_t< - Sequence, - uniform_sequence_gen_t>, // ThreadTransferSrcResetCoordinateAfterRunFlags - Sequence, // ThreadTransferDstResetCoordinateAfterRunFlags - IndexType, - 1, // ScatterDim - true, // OutputScatter: false, only use scatter weights - scatter_weight_idx // ScatterWeightIdx: ascale - >{c_ds_desc_refs, - idx_c_ds_block_begin, - tie(e_grid_desc_mblock_mperblock_nblock_nperblock), - make_tuple(make_multi_index(0, 0, block_n_id, 0)), - c_element_op}; - - auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - - constexpr auto sfc_c_vgpr = - SpaceFillingCurve, - Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, - Sequence>{}; - - constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); - - // space filling curve for shuffled blockwise C/D/E - constexpr auto sfc_cde_block = - SpaceFillingCurve, - Sequence<0, 2, 1, 3>, - Sequence<1, - CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, - 1, - CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; - - static_assert(num_access == sfc_cde_block.GetNumOfAccess(), "wrong!"); - constexpr auto EMThreads = - CDEBlockTransferCluster{}.At(I0) * CDEBlockTransferCluster{}.At(I1); - constexpr auto EMRepeats = CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl / EMThreads; - constexpr auto ENThreads = - CDEBlockTransferCluster{}.At(I2) * CDEBlockTransferCluster{}.At(I3); - static_for<0, num_access, 1>{}([&](auto access_id) { - // make sure it's safe to write to LDS - StaticallyIndexedArray scatter_offsets; - - auto dstidx = sfc_cde_block.GetIndex(access_id); - const index_t c_token_pos = - block_m_id * MPerBlock + threadIdx.x / ENThreads * EMRepeats + dstidx(I1); - static_for<0, EMRepeats, 1>{}([&](auto m0) { - const index_t fused_token = p_sorted_token_ids[c_token_pos + m0]; - IndexType token_offset = fused_token & 0xffffff; - if constexpr(IsInputGemm) - { - token_offset = token_offset * problem.TopK + (fused_token >> 24); - } - scatter_offsets(m0) = static_cast(token_offset) * problem.N; - }); - - block_sync_lds(); - - // each thread write its data from VGPR to LDS - c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, - sfc_c_vgpr.GetIndexTupleOfNumber(access_id), - c_thread_buf_fp32, - c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, - c_shuffle_block_buf); - - // make sure it's safe to read from LDS - block_sync_lds(); - - // each block copy its data from LDS to global - cde_block_copy_lds_and_global.Run( - c_ds_desc_refs, - c_ds_buf_refs, - tie(e_grid_desc_mblock_mperblock_nblock_nperblock), - tie(c_grid_buf), - scatter_offsets); - - if constexpr(access_id < num_access - 1) - { - constexpr auto cde_lds_and_global_step = - sfc_cde_block.GetForwardStep(access_id); - - // move on Ds - static_for<0, NumDTensor, 1>{}([&](auto i) { - cde_block_copy_lds_and_global.MoveSrcSliceWindow( - c_ds_desc_refs, i + I1, cde_lds_and_global_step); - }); - - // move on E - cde_block_copy_lds_and_global.MoveDstSliceWindow( - tie(e_grid_desc_mblock_mperblock_nblock_nperblock), - I0, - cde_lds_and_global_step); - } - }); - } - } -#endif - template diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp index 8559b78fe0..d428cb5e99 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp @@ -70,50 +70,6 @@ __launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy) #endif // end of if (defined(__gfx9__)) } -#if 0 -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS -__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy) -#endif - // __attribute__((amdgpu_waves_per_eu(1, 1))) - kernel_moe_mxgemm_2lds(typename GridwiseGemm::Argument karg) -{ -#if defined(__gfx9__) - if constexpr(GridwiseGemm::template IsValidCompilationParameter()) - { - __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - - // auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z); - - GridwiseGemm::template Run_2Lds( - karg.p_sorted_token_ids, - karg.p_sorted_expert_ids, - karg.p_max_token_id, - karg.p_a_grid, - karg.p_a_scale_grid, - karg.p_b_grid, - karg.p_b_scale_grid, - karg.p_ds_grid, - karg.p_c_grid, - p_shared, - p_shared1, - karg, - karg.a_element_op, - karg.b_element_op, - karg.c_element_op); - } -#else - ignore = karg; -#endif // end of if (defined(__gfx9__)) -} -#endif - template & gs_ms_ns_lengths_vec, const std::array& gs_ms_ns_strides_vec) { - // if(!(gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && - // gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN)) - // { - // throw std::runtime_error("wrong! dimension must match input lengths"); - // } const auto to_tuple = [&](auto& vec, auto start, auto end) { return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); diff --git a/include/ck/utility/container_element_picker.hpp b/include/ck/utility/container_element_picker.hpp index 9de2466e71..cec6c85298 100644 --- a/include/ck/utility/container_element_picker.hpp +++ b/include/ck/utility/container_element_picker.hpp @@ -15,9 +15,6 @@ template struct ContainerElementPicker { using type = ContainerElementPicker; -#if 0 - using data_type = typename Arr::data_type; -#endif __host__ __device__ constexpr ContainerElementPicker() = delete; @@ -81,9 +78,6 @@ template struct ConstantContainerElementPicker { using type = ConstantContainerElementPicker; -#if 0 - using data_type = typename Arr::data_type; -#endif __host__ __device__ constexpr ConstantContainerElementPicker() = delete; diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 00fab270e8..ce4c92425e 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -361,14 +361,8 @@ struct DynamicBuffer { if(is_valid_element) { -#if 0 - X tmp = x; - - __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X)); -#else // if(i >= 2169041600) *c_style_pointer_cast(&p_data_[i]) = x; -#endif } } } diff --git a/include/ck/utility/transpose_vectors.hpp b/include/ck/utility/transpose_vectors.hpp index de20674ef2..11b503da69 100644 --- a/include/ck/utility/transpose_vectors.hpp +++ b/include/ck/utility/transpose_vectors.hpp @@ -18,22 +18,6 @@ struct transpose_vectors; // transpose fp16 2x2 __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t& y0, half2_t& y1) { -#if 0 - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - const vector_type vx0{x0}, vx1{x1}; - vector_type vy0, vy1; - - vy0.template AsType()(I0) = vx0.template AsType()[I0]; - vy0.template AsType()(I1) = vx1.template AsType()[I0]; - - vy1.template AsType()(I0) = vx0.template AsType()[I1]; - vy1.template AsType()(I1) = vx1.template AsType()[I1]; - - y0 = vy0.template AsType()[I0]; - y1 = vy1.template AsType()[I0]; -#else constexpr int32_t m0 = 0x05040100; constexpr int32_t m1 = 0x07060302; @@ -43,7 +27,6 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t // index is reversed because of little endianness (least significant bits first) y0 = bit_cast(__builtin_amdgcn_perm(bit_cast(x1), bit_cast(x0), m0)); y1 = bit_cast(__builtin_amdgcn_perm(bit_cast(x1), bit_cast(x0), m1)); -#endif } template diff --git a/include/ck/utility/workgroup_barrier.hpp b/include/ck/utility/workgroup_barrier.hpp index 0e440799be..0be341da88 100644 --- a/include/ck/utility/workgroup_barrier.hpp +++ b/include/ck/utility/workgroup_barrier.hpp @@ -12,20 +12,6 @@ struct workgroup_barrier __device__ uint32_t ld(uint32_t offset) { -#if 0 - float d = llvm_amdgcn_raw_buffer_load_fp32( - amdgcn_make_buffer_resource(base_ptr), - 0, - offset, - AMDGCN_BUFFER_GLC); - union cvt { - float f32; - uint32_t u32; - }; - cvt x; - x.f32 = d; - return x.u32; -#endif return __atomic_load_n(base_ptr + offset, __ATOMIC_RELAXED); } diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index a32f26dadf..6a9c9e3faf 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -2166,27 +2166,11 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer src_thread_d } else if constexpr(N == 8) { -#if 0 - thread_buffer tmp{src_thread_data}; - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 4 * sizeof(fp16_t), - static_cast(coherence)); -#else llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast(src_thread_data), dst_wave_buffer_resource, dst_thread_addr_offset, dst_wave_addr_offset, static_cast(coherence)); -#endif } } else if constexpr(std::is_same::value) // bf16 diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 7d57858f26..8056b76af7 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -1992,27 +1992,11 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer src_thread_d } else if constexpr(N == 8) { -#if 0 - thread_buffer tmp{src_thread_data}; - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 4 * sizeof(fp16_t), - static_cast(coherence)); -#else llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast(src_thread_data), dst_wave_buffer_resource, dst_thread_addr_offset, dst_wave_addr_offset, static_cast(coherence)); -#endif } } else if constexpr(std::is_same::value) // bf16 diff --git a/include/ck_tile/core/container/array.hpp b/include/ck_tile/core/container/array.hpp index 45adbded2c..d6ba1efcbe 100644 --- a/include/ck_tile/core/container/array.hpp +++ b/include/ck_tile/core/container/array.hpp @@ -84,19 +84,6 @@ struct array data[i] = static_cast(c); } - // template - // CK_TILE_HOST_DEVICE constexpr array(const array& o) - // { - // // static_assert(ArrayType::size() == size(), "wrong! size not the same"); - // __content = o.__content; - // } - // CK_TILE_HOST_DEVICE constexpr array& operator=(const array& o) - // { - // // static_assert(ArrayType::size() == size(), "wrong! size not the same"); - // __content = o.__content; - // return *this; - // } - CK_TILE_HOST_DEVICE static constexpr auto size() { return N; } CK_TILE_HOST_DEVICE static constexpr bool is_static() { return is_static_v; } @@ -247,13 +234,6 @@ CK_TILE_HOST_DEVICE constexpr details::return_type make_array(Ts&&... return {std::forward(ts)...}; } -// // make empty array -// template -// CK_TILE_HOST_DEVICE constexpr auto make_array() -// { -// return array{}; -// } - // compatible with old ck's initializer, make an array and fill it withe the last element from // initializer_list template diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp index 73ce09b20e..4e94d6e902 100644 --- a/include/ck_tile/core/container/sequence.hpp +++ b/include/ck_tile/core/container/sequence.hpp @@ -480,32 +480,6 @@ struct sequence_split using right_type = decltype(Seq::extract(range1{})); }; -#if 0 -// reverse sequence -template -struct sequence_reverse -{ - static constexpr index_t NSize = Seq{}.size(); - - using seq_split = sequence_split; - using type = typename sequence_merge< - typename sequence_reverse::type, - typename sequence_reverse::type>::type; -}; - -template -struct sequence_reverse> -{ - using type = sequence; -}; - -template -struct sequence_reverse> -{ - using type = sequence; -}; -#endif - namespace detail { template struct seq_reverse; diff --git a/include/ck_tile/core/container/statically_indexed_array.hpp b/include/ck_tile/core/container/statically_indexed_array.hpp index d35934ab04..111b8a8c58 100644 --- a/include/ck_tile/core/container/statically_indexed_array.hpp +++ b/include/ck_tile/core/container/statically_indexed_array.hpp @@ -24,18 +24,4 @@ using statically_indexed_array = array; #endif // consider always use ck_tile::array for this purpose -#if 0 -template -CK_TILE_HOST_DEVICE constexpr auto make_statically_indexed_array(const X& x, const Xs&... xs) -{ - return statically_indexed_array(x, static_cast(xs)...); -} - -// make empty statically_indexed_array -template -CK_TILE_HOST_DEVICE constexpr auto make_statically_indexed_array() -{ - return statically_indexed_array(); -} -#endif } // namespace ck_tile diff --git a/include/ck_tile/core/container/thread_buffer.hpp b/include/ck_tile/core/container/thread_buffer.hpp index a955b7f84f..58e417a612 100644 --- a/include/ck_tile/core/container/thread_buffer.hpp +++ b/include/ck_tile/core/container/thread_buffer.hpp @@ -23,18 +23,6 @@ CK_TILE_HOST_DEVICE constexpr auto make_thread_buffer(Ts&&... ts) } #else -#if 0 -template -using thread_buffer = array; - -template -CK_TILE_HOST_DEVICE constexpr auto make_thread_buffer(Ts&&... ts) -{ - return make_array(ts...); -} - -#endif - // clang-format off template struct thread_buffer { @@ -103,25 +91,6 @@ struct thread_buffer { return vx.data; } -#if 0 - template ::value, bool>::type = false> - CK_TILE_HOST_DEVICE constexpr void _set_as(number is, X_ x) - { - using X = remove_cvref_t; - - constexpr index_t kSPerX = vector_traits::vector_size; - - union { - X_ data; - tuple_array sub_data; - } vx {x}; - - static_for<0, kSPerX, 1>{}( - [&](auto j) { operator()((is * number{}) + j) = vx.sub_data[j]; }); - } -#endif #define TB_COMMON_AS() \ diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp index 97d5ae10df..d7da0e1467 100644 --- a/include/ck_tile/core/container/tuple.hpp +++ b/include/ck_tile/core/container/tuple.hpp @@ -292,9 +292,6 @@ struct tuple : impl::tuple_base, T...> // below function should be used under tuple_array<> type, no extra check will perform here template CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as() { return reinterpret_cast&>(*this); } template CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as() const { return reinterpret_cast&>(*this); } - // below index is for index *AFTER* type convert, not before - //template CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(index_t i) { TP_COM_(); return reinterpret_cast&>(*this).at(i); } - //template CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(index_t i) const { TP_COM_(); return reinterpret_cast&>(*this).at(i); } template CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(number) { TP_COM_(); return reinterpret_cast&>(*this).at(number{}); } template CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(number) const { TP_COM_(); return reinterpret_cast&>(*this).at(number{}); } @@ -333,13 +330,6 @@ struct vector_traits, void> static constexpr index_t vector_size = sizeof...(T); }; -// template -// CK_TILE_HOST_DEVICE constexpr -// tuple -// make_tuple(T const&... t) -// { -// return {t...}; -// } template CK_TILE_HOST_DEVICE constexpr bool operator==(const tuple& a, const tuple& b) { diff --git a/include/ck_tile/core/numeric/half.hpp b/include/ck_tile/core/numeric/half.hpp index b6a7e86d3c..c172f48cad 100644 --- a/include/ck_tile/core/numeric/half.hpp +++ b/include/ck_tile/core/numeric/half.hpp @@ -264,93 +264,6 @@ bool operator>(const half_t& x, const half_t& y) { return __hgt(x.to_fp16(), y.t CK_TILE_DEVICE bool operator>=(const half_t& x, const half_t& y) { return __hge(x.to_fp16(), y.to_fp16()); } -#if 0 -CK_TILE_DEVICE -half_t operator+(const half_t& x, const half_t& y) -{ - return half_t(__hadd(x.to_fp16(), y.to_fp16())); -} - -CK_TILE_DEVICE -half_t operator-(const half_t& x) { return half_t(__hneg(x.to_fp16())); } - -CK_TILE_DEVICE -half_t operator-(const half_t& x, const half_t& y) -{ - return half_t(__hsub(x.to_fp16(), y.to_fp16())); -} - -CK_TILE_DEVICE -half_t operator*(const half_t& x, const half_t& y) -{ - return half_t(__hmul(x.to_fp16(), y.to_fp16())); -} - -CK_TILE_DEVICE -half_t operator/(const half_t& x, const half_t& y) -{ - return half_t(__hdiv(x.to_fp16(), y.to_fp16())); -} - -CK_TILE_DEVICE -half_t& operator+=(half_t& x, const half_t& y) -{ - x = half_t(__hadd(x.to_fp16(), y.to_fp16())); - return x; -} - -CK_TILE_DEVICE -half_t& operator-=(half_t& x, const half_t& y) -{ - x = half_t(__hsub(x.to_fp16(), y.to_fp16())); - return x; -} - -CK_TILE_DEVICE -half_t& operator*=(half_t& x, const half_t& y) -{ - x = half_t(__hmul(x.to_fp16(), y.to_fp16())); - return x; -} - -CK_TILE_DEVICE -half_t& operator/=(half_t& x, const half_t& y) -{ - x = half_t(__hdiv(x.to_fp16(), y.to_fp16())); - return x; -} - -CK_TILE_DEVICE -half_t& operator++(half_t& x) -{ - x = half_t(__hadd(x.to_fp16(), half_t(1.0f).to_fp16())); - return x; -} - -CK_TILE_DEVICE -half_t& operator--(half_t& x) -{ - x = half_t(__hsub(x.to_fp16(), half_t(1.0f).to_fp16())); - return x; -} - -CK_TILE_DEVICE -half_t operator++(half_t& x, int) -{ - half_t y(x); - x = half_t(__hadd(x.to_fp16(), half_t(1.0f).to_fp16())); - return y; -} - -CK_TILE_DEVICE -half_t operator--(half_t& x, int) -{ - half_t y(x); - x = half_t(__hsub(x.to_fp16(), half_t(1.0f).to_fp16())); - return y; -} -#endif - #if CK_TILE_USE_CUSTOM_DATA_TYPE CK_TILE_ARITHMETIC_USING_FLOAT(CK_TILE_HOST, half_t) #endif diff --git a/include/ck_tile/core/numeric/int8.hpp b/include/ck_tile/core/numeric/int8.hpp index aa9f820c17..7b0f102f2b 100644 --- a/include/ck_tile/core/numeric/int8.hpp +++ b/include/ck_tile/core/numeric/int8.hpp @@ -73,27 +73,6 @@ struct numeric CK_TILE_HOST_DEVICE static constexpr int8_t zero() { return 0; } }; -#if 0 - -template <> -struct numeric_traits -{ - static constexpr int exp = 5; - static constexpr int mant = 10; - static constexpr int bias = 15; - static constexpr uint16_t nan_mask = 0x7C00; - static constexpr uint16_t head_mask = 0xFC00; - static constexpr uint16_t mant_mask = 0x3FF; - static constexpr uint16_t exp_mask = 0x1F; - static constexpr uint32_t Inf = 0x7C00; - static constexpr uint32_t NegInf = 0xFC00; - static constexpr uint32_t NaN = 0x7C01; - static constexpr uint32_t Neg0 = 0x8000; - static constexpr int PackedSize = 1; - using bitwise_type = uint16_t; -}; -#endif - CK_TILE_HOST_DEVICE constexpr float int8_to_float(const int8_t& x) { return static_cast(x); } diff --git a/include/ck_tile/core/tensor/sweep_tile.hpp b/include/ck_tile/core/tensor/sweep_tile.hpp index 1947ce0289..35440f10f8 100644 --- a/include/ck_tile/core/tensor/sweep_tile.hpp +++ b/include/ck_tile/core/tensor/sweep_tile.hpp @@ -295,10 +295,6 @@ struct tile_sweeper F f; }; -// partial deduction is not allowed -// template -// tile_sweeper(const F&, U = {})->tile_sweeper; - // deduction guide template -CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistributionEncoding_) -{ - using DstrEncode = remove_cvref_t; - - constexpr auto adaptor_impl = - detail::make_adaptor_encoding_for_tile_distribution(StaticTileDistributionEncoding_{}); - - constexpr auto ps_ys_to_xs_adaptor_impl = adaptor_impl.template at<0>(); - constexpr auto ys_to_d_adaptor_impl = adaptor_impl.template at<1>(); - constexpr index_t d_length = adaptor_impl.template at<2>(); - constexpr auto rh_major_minor_to_hidden_ids_impl = adaptor_impl.template at<3>(); - - constexpr auto ps_ys_to_xs_adaptor = - CONSTRUCT_TENSOR_ADAPTOR_FROM_ENCODING(ps_ys_to_xs_adaptor_impl); - - constexpr auto ys_to_d_adaptor = CONSTRUCT_TENSOR_ADAPTOR_FROM_ENCODING(ys_to_d_adaptor_impl); - - constexpr auto ys_to_d_descriptor = - make_tensor_descriptor_from_adaptor(ys_to_d_adaptor, d_length); - - // - constexpr index_t ndim_rh_major = DstrEncode::detail::ndim_rh_major_; - constexpr auto ndims_rhs_minor = DstrEncode::detail::ndims_rhs_minor_; - - constexpr auto rh_major_minor_to_hidden_ids = - TO_TUPLE_OF_SEQUENCE(rh_major_minor_to_hidden_ids_impl, ndim_rh_major, ndims_rhs_minor); - - return tile_distribution< - remove_cvref_t, - remove_cvref_t, - remove_cvref_t, - detail::tile_distribution_detail>>{ - ps_ys_to_xs_adaptor, ys_to_d_descriptor}; -} -#endif - // this returns a static tile_distribution template CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution(StaticTileDistributionEncoding_) diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp index 4ad699629c..4e971649d0 100644 --- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp +++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp @@ -745,14 +745,6 @@ struct PassThroughPack2 template CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const; -#if 0 - CK_TILE_HOST_DEVICE constexpr void operator()(ck_tile::fp16x2_t& y, const ck_tile::f8x2_t& x) const - { - auto t = type_convert(x); - y = type_convert(t); - } -#endif - CK_TILE_HOST_DEVICE constexpr void operator()(fp16x2_t& y, const pk_int4_t& x) const { uint8_t x_u8 = bit_cast(x); @@ -871,61 +863,6 @@ struct UnaryConvert } }; -#if 0 -struct ConvertBF16RTN -{ - // convert to bf16 using round to nearest (rtn) - template - CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const - { - // check Y datatype - static_assert(std::is_same_v, "Data type is not supported by this operation!"); - - // check X datatype - static_assert(std::is_same_v || std::is_same_v, - "Data type is not supported by this operation!"); - - y = bf16_convert_rtn(x); - } -}; - -struct ConvertF8SR -{ - // convert to fp8 using stochastic rounding (SR) - template - CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const - { - // check Y datatype - static_assert(std::is_same_v || std::is_same_v, - "Data type is not supported by this operation!"); - - // check X datatype - static_assert(std::is_same_v || std::is_same_v, - "Data type is not supported by this operation!"); - - y = f8_convert_sr(x); - } -}; - -struct ConvertF8RNE -{ - // convert to fp8 using rounding to nearest even - template - CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const - { - // check Y datatype - static_assert(std::is_same_v || std::is_same_v, - "Data type is not supported by this operation!"); - - // check X datatype - static_assert(std::is_same_v || std::is_same_v, - "Data type is not supported by this operation!"); - - y = f8_convert_rne(x); - } -}; -#endif - struct Scale { static constexpr const char* name = "Scale"; diff --git a/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp index ae33137459..ff96139f18 100644 --- a/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp +++ b/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp @@ -339,16 +339,6 @@ struct GroupedFlatmmKernel : FlatmmKernel, class ScaleN = FlatmmScalePointer<-1>, diff --git a/include/ck_tile/ops/flatmm/kernel/moe_flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/moe_flatmm_kernel.hpp index 81cf76cb07..6721577018 100644 --- a/include/ck_tile/ops/flatmm/kernel/moe_flatmm_kernel.hpp +++ b/include/ck_tile/ops/flatmm/kernel/moe_flatmm_kernel.hpp @@ -483,13 +483,6 @@ struct MoeFlatmmKernel if constexpr(std::is_same_v) { - // if(kargs.N % TilePartitioner::NPerBlock != 0 && FlatmmPipeline::kPadN == false) - // { - // std::cerr << "Can't support N that is not a multiple of NPerBlock" - // " without padding!" - // << std::endl; - // return false; - // } if(kargs.N % FlatmmPipeline::GetVectorSizeB() != 0) { std::cerr << "N is not a multiple of vector load size for B tensor!" << std::endl; diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp index 76d191a40c..99c35e9f30 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp @@ -392,10 +392,6 @@ struct UniversalFlatmmPipelineAgBgCrPolicy constexpr index_t M1 = BlockSize / get_warp_size(); static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error."); static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error."); - // constexpr index_t M0 = MPerBlock / (M2 * M1); - // static_assert(M0 * M1 * M2 == MPerBlock, - // "Incorrect M0, M2, M1 configuration! " - // "M0, M1, M2 must cover whole MPerBlock!"); return make_static_tile_distribution( tile_distribution_encoding, diff --git a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp index 0f7f742fa0..6e6547b837 100644 --- a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -1151,11 +1151,6 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1 a_warp_tensor(number{}) = load_tile(a_warp_windows_pong(number{})(number{})); } - // barrier - // if constexpr((kIter == KIterPerWarp - 1) && (mIter == MIter_2nd_last)) - // { - // block_sync_lds(); - // } }); } }); @@ -1636,10 +1631,6 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 ? Aload_rep : 0; } - // if((kIter % KPerScaleLoad == 0) && (mIter == 0)) - // { - // load_perM = load_perM + 1; - // } SchedulerPerM(dsread_perM, dswrite_perM, load_perM); } } diff --git a/include/ck_tile/ops/flatmm/pipeline/moe_flatmm_pipeline_agmem_bgmem_creg.hpp b/include/ck_tile/ops/flatmm/pipeline/moe_flatmm_pipeline_agmem_bgmem_creg.hpp index 543f4dc92a..fd1bb6da5a 100644 --- a/include/ck_tile/ops/flatmm/pipeline/moe_flatmm_pipeline_agmem_bgmem_creg.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/moe_flatmm_pipeline_agmem_bgmem_creg.hpp @@ -103,13 +103,8 @@ struct MoeFlatmmPipelineAGmemBGmemCRegV1 static constexpr index_t Aload_num_perK = dswrite_num_perK; static constexpr index_t Aload_rep = dswrite_rep; static constexpr index_t Bload_num_perK = kNPerBlock * WG::kK / NWarp / BK1 / WaveSize; - // static constexpr index_t ScaleBload_K1 = ContinuousScaleNPerThread * - // ContinuousScaleKPerThread; static constexpr index_t ScaleBload_num = - // kNPerBlock * kKPerBlock / NWarp / 32 / ScaleBload_K1 / - // WaveSize; // BlockN * BlockK / NWarp / ScalePerK / ScaleB_K1 / wavesize - // static constexpr index_t KPerScaleLoad = KIterPerWarp / ScaleBload_num; - static constexpr index_t HalfMIter = (MIterPerWarp + 1) / 2; - static constexpr index_t Bload_rep = (Bload_num_perK + HalfMIter - 1) / HalfMIter; + static constexpr index_t HalfMIter = (MIterPerWarp + 1) / 2; + static constexpr index_t Bload_rep = (Bload_num_perK + HalfMIter - 1) / HalfMIter; static constexpr index_t mfma_perM_perK = NIterPerWarp * mfma_per_wg; static constexpr index_t dswrite_mIter = (DsWritePreIssue - 1) % MIterPerWarp; @@ -352,10 +347,6 @@ struct MoeFlatmmPipelineAGmemBGmemCRegV1 ? Aload_rep : 0; } - // if((kIter % KPerScaleLoad == 0) && (mIter == 0)) - // { - // load_perM = load_perM + 1; - // } SchedulerPerM(dsread_perM, dswrite_perM, load_perM); } } diff --git a/include/ck_tile/ops/flatmm/pipeline/mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp index f698541dbf..cef66e470f 100644 --- a/include/ck_tile/ops/flatmm/pipeline/mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -390,10 +390,6 @@ struct MXFlatmmPipelineAGmemBGmemCRegV1 : FlatmmPipelineAGmemBGmemCRegV1().get_element_space_size(); constexpr index_t BufferSize = GetSingleSmemElementSpaceSize(); // max(SingleKSize, SingleVSize); diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp index 52b2b86574..06ab134f85 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -456,9 +456,6 @@ struct MoeSortingKernel template __device__ static constexpr T wave_reduce(T local, F reduce_f, number = {}) { - // constexpr int wave_size = 64; - // constexpr int reduce_stage = 6; // 1<<6=64 - // clang-format off constexpr int reduce_stage = [](){ if constexpr(wave_size_ == 2) return 1; else if constexpr(wave_size_ == 4) return 2; @@ -1206,17 +1203,21 @@ CK_TILE_HOST_DEVICE index_t moe_sorting_mp_sem_smem_size() template CK_TILE_DEVICE constexpr T moe_sorting_wave_reduce(T local, F reduce_f, number = {}) { - // constexpr int wave_size = 64; - // constexpr int reduce_stage = 6; // 1<<6=64 - // clang-format off - constexpr int reduce_stage = [](){ - if constexpr(wave_size_ == 2) return 1; - else if constexpr(wave_size_ == 4) return 2; - else if constexpr(wave_size_ == 8) return 3; - else if constexpr(wave_size_ == 16) return 4; - else if constexpr(wave_size_ == 32) return 5; - else if constexpr(wave_size_ == 64) return 6; - else return 0; + constexpr int reduce_stage = []() { + if constexpr(wave_size_ == 2) + return 1; + else if constexpr(wave_size_ == 4) + return 2; + else if constexpr(wave_size_ == 8) + return 3; + else if constexpr(wave_size_ == 16) + return 4; + else if constexpr(wave_size_ == 32) + return 5; + else if constexpr(wave_size_ == 64) + return 6; + else + return 0; }(); // clang-format on T v_local = local; @@ -3047,53 +3048,6 @@ struct MoeSortingMultiPhaseKernel_P23 x_r = x_v; #endif { -#if 0 -#pragma unroll - for(int j = 0; j < index_pack / 2; j++) - { - int i_token = i * kBlockSize * index_pack + threadIdx.x + j * kBlockSize; - index_t x = x_d[j]; - int i_topk = x - 1; // topk of this token - int i_show = x != 0 ? 1 : 0; // has this token or not - int cumsum = i_show; - impl::moe_sorting_wave_cumsum(cumsum); - - __syncthreads(); - if(lane_id == get_warp_size() - 1) - { - s[4 + wave_id] = cumsum; - } - __syncthreads(); - - // reduce cross wave - static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) { - IndexType prev = s[4 + i_w]; - prev = wave_id > i_w ? prev : 0; // mask out - cumsum += prev; - }); - cumsum += prev_cumsum; // add previous round cumsum - if(threadIdx.x == kBlockSize - 1) - { - s[0] = cumsum; - } - __syncthreads(); - - int position = cumsum - i_show; - prev_cumsum = s[0]; // update the last cumsum - - if(i_show) - { -#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID - p_sorted_token_ids[e_start + position] = - MOE_SORTING_MOCK_ID(i_token, i_topk); -#else - p_sorted_token_ids[e_start + position] = i_token; -#endif - p_sorted_weights[e_start + position] = - p_weights[i_token * kargs.topk_mdiv.divisor + i_topk]; - } - } -#endif { d_t i_topk; d_t i_show; @@ -3151,68 +3105,6 @@ struct MoeSortingMultiPhaseKernel_P23 } position += i_show[j]; }); - -#if 0 - int i_token = i * kBlockSize * index_pack + threadIdx.x * 2 + j * kBlockSize * 2; - index_t x = x_d[j]; - index_t x0 = static_cast(x & 0xffff); - index_t x1 = static_cast(x >> 16); - int i_topk_0 = x0 - 1; // topk of this token - int i_show_0 = x0 != 0 ? 1 : 0; // has this token or not - int i_topk_1 = x1 - 1; // topk of this token - int i_show_1 = x1 != 0 ? 1 : 0; // has this token or not - int cumsum = i_show_0 + i_show_1; - impl::moe_sorting_wave_cumsum(cumsum); - - __syncthreads(); - if(lane_id == get_warp_size() - 1) - { - s[4 + wave_id] = cumsum; - } - __syncthreads(); - - // reduce cross wave - static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) { - IndexType prev = s[4 + i_w]; - prev = wave_id > i_w ? prev : 0; // mask out - cumsum += prev; - }); - cumsum += prev_cumsum; // add previous round cumsum - if(threadIdx.x == kBlockSize - 1) - { - s[0] = cumsum; - } - __syncthreads(); - - int position_0 = cumsum - i_show_0 - i_show_1; - prev_cumsum = s[0]; // update the last cumsum - - if(i_show_0) - { -#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID - p_sorted_token_ids[e_start + position_0] = - MOE_SORTING_MOCK_ID(i_token, i_topk_0); -#else - p_sorted_token_ids[e_start + position_0] = i_token; -#endif - p_sorted_weights[e_start + position_0] = - p_weights[i_token * kargs.topk_mdiv.divisor + i_topk_0]; - } - - int position_1 = cumsum - i_show_1; - - if(i_show_1) - { -#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID - p_sorted_token_ids[e_start + position_1] = - MOE_SORTING_MOCK_ID(i_token + 1, i_topk_1); -#else - p_sorted_token_ids[e_start + position_1] = i_token + 1; -#endif - p_sorted_weights[e_start + position_1] = - p_weights[(i_token + 1) * kargs.topk_mdiv.divisor + i_topk_1]; - } -#endif } } } diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp index f70f4ddacc..828847091a 100644 --- a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp +++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp @@ -14,14 +14,6 @@ namespace ck_tile { -// template -// struct MoeSortingPipeline -// { -// // TODO: this kernel only support warp per row -// using Problem = remove_cvref_t; -// using Policy = remove_cvref_t; -// using WeightType = typename Problem::WeightType; - // template // CK_TILE_DEVICE auto operator()(const TopkIdWindow& topk_id_window, // const WeightWindow& weight_window, diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp index a7f1cef519..1a61b69b34 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp @@ -36,9 +36,6 @@ struct BlockGemmARegBSmemCRegOneWarpV1 std::is_same_v>, "wrong!"); - // constexpr index_t MPerBlock = ABlockTensorTmp{}.get_lengths()[number<0>{}]; - // constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}]; - // constexpr index_t KPerBlock = ABlockTensorTmp{}.get_lengths()[number<1>{}]; constexpr index_t MPerBlock = BlockGemmShape::kM; constexpr index_t NPerBlock = BlockGemmShape::kN; constexpr index_t KPerBlock = BlockGemmShape::kK; diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp index 2280f6f875..3a7c0362f7 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp @@ -19,30 +19,7 @@ struct BlockGemmARegBSmemCRegV1DefaultPolicy std::is_same_v && std::is_same_v) { -#if 0 - constexpr index_t kBlockSize = Problem::kBlockSize; - - constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; - constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN; - constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; - - static_assert(kBlockSize % get_warp_size() == 0, "wrong!"); - - constexpr index_t NumWarp = kBlockSize / get_warp_size(); - - // FIXME - if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 && - kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0) - { - return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1); - } - else - { - return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1); - } -#else return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1); -#endif } else if constexpr(std::is_same_v && std::is_same_v && diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp index b8290c95d8..0b1cea9425 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp @@ -16,30 +16,7 @@ struct BlockGemmARegBSmemCRegV2DefaultPolicy CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp() { -#if 0 - constexpr index_t kBlockSize = Problem::kBlockSize; - - constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; - constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN; - constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; - - static_assert(kBlockSize % get_warp_size() == 0, "wrong!"); - - constexpr index_t NumWarp = kBlockSize / get_warp_size(); - - // FIXME - if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 && - kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0) - { - return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1); - } - else - { - return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1); - } -#else return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1); -#endif } }; diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp index 29022e764f..0622cc624f 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp @@ -19,30 +19,7 @@ struct BlockGemmASmemBRegCRegV1DefaultPolicy std::is_same_v && std::is_same_v) { -#if 0 - constexpr index_t kBlockSize = Problem::kBlockSize; - - constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; - constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN; - constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; - - static_assert(kBlockSize % get_warp_size() == 0, "wrong!"); - - constexpr index_t NumWarp = kBlockSize / get_warp_size(); - - // FIXME - if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 && - kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0) - { - return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1); - } - else - { - return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1); - } -#else return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1); -#endif } else if constexpr(std::is_same_v && std::is_same_v && diff --git a/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp index da9c5c4d57..717fb4678c 100644 --- a/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp +++ b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp @@ -120,10 +120,6 @@ struct BlockNormReduceSync constexpr index_t idim_p_lane = NDimP - 1; - // const auto ps_idx = make_array(get_warp_id(), get_lane_id()); - // const auto rs_idx = - // mean_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx); - constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size(); static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size()); @@ -360,17 +356,6 @@ struct BlockNormReduceCrossWarpSync template CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count(int row_size) { -#if 0 - using S = BlockShape; - index_t LastloopN = row_size % S::Block_N == 0 ? S::Block_N : row_size % S::Block_N; - constexpr index_t NThread = S::WarpPerBlock_N * S::ThreadPerWarp_N; - index_t iNLane = get_thread_id() % NThread; - index_t iN0 = LastloopN / (S::Vector_N * S::ThreadPerWarp_N); - index_t iN1 = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) / S::Vector_N; - index_t N2 = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) % S::Vector_N; - index_t iN3 = iNLane < iN1 ? S::Vector_N : iNLane == iN1 ? N2 : 0; - return iN0 * S::Vector_N + iN3; -#endif using S_ = BlockShape; constexpr index_t ThreadsPerBlock_N = S_::WarpPerBlock_N * S_::ThreadPerWarp_N; diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp index abad5ed031..a14f103eb6 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp @@ -140,28 +140,6 @@ struct BlockReduce2d ReducePacksPerXDim{}); } -#if 0 - constexpr auto I0 = number<0>{}; - constexpr auto I1 = number<1>{}; - constexpr auto spans = XDistributedTensor_::get_distributed_spans(); - - // FIXME: hard coded to reduce 2nd axis - sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) { - constexpr auto y_dstr_idx = make_tuple(dstr_idx_i0); - - auto y = y_tensor[y_dstr_idx]; - - sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) { - constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1); - const auto x = ck_tile::type_convert(x_tensor[in_dstr_idx]); - - y = reduce_func(y, x); - }); - - y_tensor(y_dstr_idx) = y; - }); -#endif - template CK_TILE_DEVICE static auto MakeYBlockTile() { @@ -240,10 +218,6 @@ struct BlockReduce2dSync constexpr index_t idim_p_lane = NDimP - 1; - // const auto ps_idx = make_array(get_warp_id(), get_lane_id()); - // const auto rs_idx = - // y_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx); - constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size(); // loop over thread data diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp index f46ebbacf7..348216129f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp @@ -52,66 +52,6 @@ struct DeviceOperationInstanceFactory> op_ptrs; -#if 0 - if constexpr(is_same_v && is_same_v && - is_same_v) - { - if constexpr(is_same_v && is_same_v && - is_same_v) - { - add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v && - is_same_v) - { - add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v && - is_same_v) - { - add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v && - is_same_v) - { - add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(op_ptrs); - } - } - else if constexpr(is_same_v && is_same_v && - is_same_v) - { - if constexpr(is_same_v && is_same_v && - is_same_v) - { - add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_irregular_instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v2_instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v2_irregular_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v && - is_same_v) - { - add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_irregular__instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_irregular_instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v2_instances(op_ptrs); - add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v2_irregular_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v && - is_same_v) - { - add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v && - is_same_v) - { - add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(op_ptrs); - } - } -#endif if constexpr(is_same_v && is_same_v && is_same_v) { diff --git a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp index 3d391ae931..a8d69afb9a 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp @@ -33,12 +33,6 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; -#if 0 -template -using device_gemm_xdl_b_scale_f16_i4_f16_mk_nk_mn_comp_instances = std::tuple< - -#endif - template using device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple< // clang-format off diff --git a/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp index 600154a9fd..919236deee 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp @@ -26,9 +26,6 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; -// static constexpr auto GemmMNPadding = -// ck::tensor_operation::device::GemmSpecialization::MNPadding; using device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_generic_instances = std::tuple< // clang-format off //##################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp index 8ba6c485cb..99e809f0ec 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp @@ -33,25 +33,6 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; -#if 0 -template -using device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_comp_instances = std::tuple< - // clang-format off - //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - - // Compute friendly - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> - // clang-format on - >; -#endif - template using device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_instances = std::tuple< // clang-format off diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp index 088378b918..c52b9723a9 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp @@ -33,25 +33,6 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; -#if 0 -template -using device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_comp_instances = std::tuple< - // clang-format off - //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - - // Compute friendly - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> - // clang-format on - >; -#endif - template using device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple< diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp index 096a2d4eb4..31cc0fd23a 100644 --- a/profiler/src/profile_softmax.cpp +++ b/profiler/src/profile_softmax.cpp @@ -278,11 +278,4 @@ int profile_softmax(int argc, char* argv[]) return 0; } -// hijack main() for quick debugging -// int main(int argc, char* argv[]) -// { -// profile_normalization(argc, argv); -// return 0; -// } - REGISTER_PROFILER_OPERATION("softmax", "Softmax", profile_softmax); diff --git a/test/block_swizzle_test/block_swizzle_test.cpp b/test/block_swizzle_test/block_swizzle_test.cpp index 36a26492cf..af1bc0658e 100644 --- a/test/block_swizzle_test/block_swizzle_test.cpp +++ b/test/block_swizzle_test/block_swizzle_test.cpp @@ -120,17 +120,7 @@ struct block_dispatcher_t uint32_t get_grid_dims_x() { return dp_start_block_idx + dp_num_blocks; } - uint32_t get_block_idx(uint32_t bid) - { - // block id is linearily allocated along sk blocks (dp blocks are fine) - // this function will compute blockIdx.x and the linear sk block mapping - // uint32_t block_idx = 0; - // if(bid < sk_num_big_blocks) { - // uint32_t current_k_iter = bid * k_iters_per_big_block; - // tile_idx = current_k_iter / k_iters_per_tile; - // } - return bid; - } + uint32_t get_block_idx(uint32_t bid) { return bid; } uint32_t get_current_itr(uint32_t block_idx) { diff --git a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp index 8f4813a47e..ca49114844 100644 --- a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp +++ b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd x 3p -#if 0 -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); - -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -#endif template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); diff --git a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp index e357d7e3ac..f754d8e959 100644 --- a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp +++ b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd x 3p -#if 0 -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); - -template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); -#endif template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp index 8c72b81dc1..56fcca3beb 100644 --- a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd 2p -#if 0 -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); - -template float moe_smoothquant_>(const S&, A); -#endif template float moe_smoothquant_>(const S&, A); template float moe_smoothquant_>(const S&, A); diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp index 6d7a5e7c1f..2462cd218e 100644 --- a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd 2p -#if 0 -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); -template float moe_smoothquant_>(const S&, A); - -template float moe_smoothquant_>(const S&, A); -#endif template float moe_smoothquant_>(const S&, A); template float moe_smoothquant_>(const S&, A); diff --git a/test/ck_tile/smoothquant/instances/smoothquant_bf16_n1024_instance.cpp b/test/ck_tile/smoothquant/instances/smoothquant_bf16_n1024_instance.cpp index 8a5e0c74a0..66f427247a 100644 --- a/test/ck_tile/smoothquant/instances/smoothquant_bf16_n1024_instance.cpp +++ b/test/ck_tile/smoothquant/instances/smoothquant_bf16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd 2p -#if 0 -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); - -template float smoothquant_>(const S&, A); -#endif template float smoothquant_>(const S&, A); template float smoothquant_>(const S&, A); diff --git a/test/ck_tile/smoothquant/instances/smoothquant_fp16_n1024_instance.cpp b/test/ck_tile/smoothquant/instances/smoothquant_fp16_n1024_instance.cpp index 9c08cf64f0..103f7281b0 100644 --- a/test/ck_tile/smoothquant/instances/smoothquant_fp16_n1024_instance.cpp +++ b/test/ck_tile/smoothquant/instances/smoothquant_fp16_n1024_instance.cpp @@ -5,14 +5,6 @@ // clang-format off // rm rn tm tn vn pd 2p -#if 0 -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); -template float smoothquant_>(const S&, A); - -template float smoothquant_>(const S&, A); -#endif template float smoothquant_>(const S&, A); template float smoothquant_>(const S&, A);