diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index dc1824931e..f7843bd300 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.1.1 +rocm-docs-core==1.1.2 sphinxcontrib-bibtex==2.6.2 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 9a451d9708..02d5f65015 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==1.1.1 +rocm-docs-core==1.1.2 # via -r requirements.in six==1.16.0 # via diff --git a/include/ck/host_utility/flush_cache.hpp b/include/ck/host_utility/flush_cache.hpp index 36993d0ae2..041428e6a9 100644 --- a/include/ck/host_utility/flush_cache.hpp +++ b/include/ck/host_utility/flush_cache.hpp @@ -117,7 +117,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, #define MEDIAN 1 if(stream_config.time_kernel_) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n", __func__, @@ -142,7 +142,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, { return 0.0; } - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { printf("Start running %d times...\n", nrepeat); } @@ -186,7 +186,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, total_time += cur_time; #endif - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "i: " << i << " cur_time: " << cur_time << std::endl; diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index 1cdb7f9c5a..a616433ac9 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -20,7 +20,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config, #if CK_TIME_KERNEL if(stream_config.time_kernel_) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n", __func__, @@ -41,7 +41,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config, } const int nrepeat = stream_config.nrepeat_; - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { printf("Start running %d times...\n", nrepeat); } @@ -95,7 +95,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, #if CK_TIME_KERNEL if(stream_config.time_kernel_) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n", __func__, @@ -117,7 +117,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, } const int nrepeat = stream_config.nrepeat_; - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { printf("Start running %d times...\n", nrepeat); } diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp index 701dd04f6c..e5e6245cb8 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp @@ -795,11 +795,6 @@ struct BlockwiseGemmXdlops_v2 "wrong!"); } - __host__ __device__ BlockwiseGemmXdlops_v2(const BlockwiseGemmXdlops_v2& other) - : a_thread_copy_(other.a_origin), b_thread_copy_(other.b_origin) - { - } - // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl' __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4() { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp index 4521b2161f..6ab1669e30 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp @@ -587,7 +587,7 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle BatchStrideD1s, BatchStrideE1} { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "a0_grid_desc_m_k_{" << a0_grid_desc_m_k_.GetLength(I0) << ", " << a0_grid_desc_m_k_.GetLength(I1) << "}" << std::endl; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp index 37ebe2f85c..34b1d503af 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp @@ -658,7 +658,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { { std::cout << "arg.Batch_ = " << arg.Batch_ << std::endl; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp index 445467be55..e178b8f525 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -719,7 +719,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle static bool IsSupportedArgument(const Argument& arg) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { arg.Print(); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp index 6fd8c03232..0b73317c5e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp @@ -516,7 +516,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K float ave_time = 0; for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { { std::cout << "arg.a_grid_desc_k0_m_k1_container_{" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp index f5c1460f56..13eb23574f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp @@ -644,7 +644,7 @@ struct float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << DeviceOp{}.GetTypeString() << std::endl; std::cout << "N " << arg.Conv_N_ << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp index 9015f640ad..28778d825b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp @@ -614,7 +614,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << DeviceOp{}.GetTypeString() << std::endl; std::cout << "N " << arg.Conv_N_ << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index e815c0784d..7fa231d4f4 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -579,7 +579,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << DeviceOp{}.GetTypeString() << std::endl; std::cout << "N " << arg.Conv_N_ << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp index 760e2840d4..3be7313d2b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -431,7 +431,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp index de48719398..6e69213513 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp @@ -401,7 +401,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl; std::cout << "a_grid_desc_k0_m_k1{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp index 149aca7e3e..b84e181306 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp @@ -1272,7 +1272,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Dl float ave_time = 0; for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "arg.a_grid_desc_k0_m_k1_container_{" << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp index 4398724553..de8f35a640 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp @@ -1220,7 +1220,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl float ave_time = 0; for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "arg.a_grid_desc_k0_m_k1{" << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp index d3af5e63d3..b1784b3858 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp @@ -334,7 +334,7 @@ struct DeviceGemmDl : public DeviceGemm(arg.gemm_kernel_args_.size()) + arg.skipped_group_count_) != arg.group_count_) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "The group count is not equal to sum of skipped groups " "and kernel args size!" @@ -835,7 +836,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage bool group_arg_valid = GridwiseGemm::CheckValidity(gemm_arg); if(not group_arg_valid) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "[" << __func__ << "] group id: " << i << " has invalid GridwiseGemm settings!" << std::endl; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp index 403bc7fad6..36cbd1cd26 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp @@ -620,7 +620,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop GridwiseGemm::template CheckTensorTransfersValidity( M, N, K))) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "The provided GEMM problem size (M,N,K) [" << M << "," << N << "," << K << "] are not supported by current template parameters!" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp index 90c0593b28..658f323516 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp @@ -514,7 +514,7 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm(arg.gemm_kernel_args_.size()) + arg.skipped_group_count_) != arg.group_count_) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "The group count is not equal to sum of skipped groups " "and kernel args size!" @@ -545,7 +545,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK, bhalf_t>::value) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << " KBatch: " << karg.KBatch << " > 1 is not support yet" << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ << std::endl; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp index fdafa9ca5c..aea1f5d387 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp @@ -1113,7 +1113,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 { if(!(karg.M % MPerBlock == 0)) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ @@ -1130,7 +1130,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 { if(!(karg.N % NPerBlock == 0)) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ @@ -1149,7 +1149,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 auto K_t = karg.KBatch * KPerBlock; if(!(karg.K % K_t == 0)) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: " << karg.K << " " << __FILE__ << ":" << __LINE__ @@ -1173,7 +1173,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 { if(karg.K % ABlockTransferSrcScalarPerVector != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg K (" << karg.K << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" @@ -1187,7 +1187,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 { if(karg.M % ABlockTransferSrcScalarPerVector != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg M (" << karg.M << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" @@ -1202,7 +1202,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 { if(karg.N % BBlockTransferSrcScalarPerVector != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg N (" << karg.N << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" @@ -1216,7 +1216,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 { if(karg.K % BBlockTransferSrcScalarPerVector != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg K (" << karg.K << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" @@ -1231,7 +1231,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 { if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg N (" << karg.N << ") value is not a multiple of " @@ -1247,7 +1247,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 { if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg M (" << karg.M << ") value is not a multiple of " diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp index f2eeaf7e3d..6ee279a3f1 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp @@ -446,7 +446,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 { if(!(karg.M % MPerBlock == 0)) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ @@ -463,7 +463,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 { if(!(karg.N % NPerBlock == 0)) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ @@ -482,7 +482,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 auto K_t = karg.k_batch * K0PerBlock * K1; if(!(karg.K % K_t == 0)) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: " << karg.K << " " << __FILE__ << ":" << __LINE__ @@ -496,7 +496,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 { if(karg.K % ABlockTransferSrcScalarPerVector != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg K (" << karg.K << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" @@ -510,7 +510,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 { if(karg.M % ABlockTransferSrcScalarPerVector != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg M (" << karg.M << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" @@ -525,7 +525,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 { if(karg.N % BBlockTransferSrcScalarPerVector != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg N (" << karg.N << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" @@ -539,7 +539,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 { if(karg.K % BBlockTransferSrcScalarPerVector != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg K (" << karg.K << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" @@ -554,7 +554,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 { if(karg.N % CBlockTransferScalarPerVector_NWaveNPerXDL != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg N (" << karg.N << ") value is not a multiple of " @@ -569,7 +569,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 { if(karg.M % CBlockTransferScalarPerVector_NWaveNPerXDL != 0) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "Arg M (" << karg.M << ") value is not a multiple of " @@ -584,7 +584,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 const auto num_k_loop = karg.K0Padded / K0PerBlock; if(!GridwiseGemmPipe::IsSupported(num_k_loop)) { - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "The number of k loops (" << num_k_loop << ") value is not supported by GridwiseGemm Pipeline." diff --git a/include/ck/utility/env.hpp b/include/ck/utility/env.hpp index 0b6504e528..6455402dcb 100644 --- a/include/ck/utility/env.hpp +++ b/include/ck/utility/env.hpp @@ -124,7 +124,7 @@ struct EnvVar #define CK_DECLARE_ENV_VAR_STR(name) CK_DECLARE_ENV_VAR(name, std::string, "") -#define ENV(name) \ +#define CK_ENV(name) \ ck::env::name {} template diff --git a/include/ck_tile/core/numeric/half.hpp b/include/ck_tile/core/numeric/half.hpp index c616b6939f..752145f711 100644 --- a/include/ck_tile/core/numeric/half.hpp +++ b/include/ck_tile/core/numeric/half.hpp @@ -129,8 +129,8 @@ constexpr double fp16_to_double_hip(const fp16_hip_t& x) CK_TILE_HOST_DEVICE constexpr fp16_hip_t float_to_fp16_hip(const float& x) { - return __float2half(x); - // return static_cast(x); + // return __float2half(x); + return static_cast(x); } CK_TILE_HOST_DEVICE diff --git a/include/ck_tile/core/numeric/integral_constant.hpp b/include/ck_tile/core/numeric/integral_constant.hpp index ea7a67abcc..33c24da8c5 100644 --- a/include/ck_tile/core/numeric/integral_constant.hpp +++ b/include/ck_tile/core/numeric/integral_constant.hpp @@ -56,7 +56,6 @@ CK_TILE_LEFT_UNARY_OP(+) CK_TILE_LEFT_UNARY_OP(-) CK_TILE_LEFT_UNARY_OP(~) CK_TILE_LEFT_UNARY_OP(!) -CK_TILE_LEFT_UNARY_OP(*) CK_TILE_BINARY_OP(+) CK_TILE_BINARY_OP(-) diff --git a/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp index 80c1c42b83..09e03de99c 100644 --- a/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp @@ -88,7 +88,7 @@ bool profile_grouped_gemm_fixed_nk_impl(int do_verification, c_m_n_host_results.push_back( Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp index 476ec37eb2..0b73e4fcd1 100644 --- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp @@ -87,7 +87,7 @@ bool profile_grouped_gemm_impl(int do_verification, c_m_n_host_results.push_back( Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i diff --git a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp index 33e758f406..74faf15be3 100644 --- a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp @@ -82,7 +82,7 @@ bool profile_grouped_gemm_tile_loop_impl(int do_verification, Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); c_m_n_host_results.push_back( Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i diff --git a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp index feb0be87e7..14df96d505 100644 --- a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp @@ -88,7 +88,7 @@ bool profile_grouped_gemm_two_stage_impl(int do_verification, c_m_n_host_results.push_back( Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); - if(ck::EnvIsEnabled(ENV(CK_LOGGING))) + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i diff --git a/test/grouped_gemm/CMakeLists.txt b/test/grouped_gemm/CMakeLists.txt index f47685cf91..55cb209772 100644 --- a/test/grouped_gemm/CMakeLists.txt +++ b/test/grouped_gemm/CMakeLists.txt @@ -6,6 +6,12 @@ if(result EQUAL 0) add_dependencies(test_grouped_gemm test_grouped_gemm_splitk) endif() +add_gtest_executable(test_grouped_gemm_two_stage_splitk test_grouped_gemm_two_stage_multiple_d_splitk_xdl.cpp) +if(result EQUAL 0) + target_link_libraries(test_grouped_gemm_two_stage_splitk PRIVATE utility device_grouped_gemm_instance) + add_dependencies(test_grouped_gemm test_grouped_gemm_two_stage_splitk) +endif() + add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface_xdl.cpp) if(result EQUAL 0) target_link_libraries(test_grouped_gemm_interface PRIVATE utility device_grouped_gemm_instance) diff --git a/test/grouped_gemm/test_grouped_gemm_two_stage_multiple_d_splitk_xdl.cpp b/test/grouped_gemm/test_grouped_gemm_two_stage_multiple_d_splitk_xdl.cpp new file mode 100644 index 0000000000..67ecbaea30 --- /dev/null +++ b/test/grouped_gemm/test_grouped_gemm_two_stage_multiple_d_splitk_xdl.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/utility/data_type.hpp" + +#include "gtest/gtest.h" +#include "test_grouped_gemm_util.hpp" + +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using I8 = int8_t; +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using RRR_F16_F16_F16 = ck::test::TestGroupedGemmTwoStage>; +using RCR_F16_F16_F16 = ck::test::TestGroupedGemmTwoStage>; +using RRR_F16_F16_F16_LargeK = + ck::test::TestGroupedGemmTwoStage>; +using RCR_F16_F16_F16_LargeK = + ck::test::TestGroupedGemmTwoStage>; +using RRR_BF16_BF16_BF16 = + ck::test::TestGroupedGemmTwoStage>; +using RCR_BF16_BF16_BF16 = + ck::test::TestGroupedGemmTwoStage>; +using RRR_BF16_I8_BF16 = + ck::test::TestGroupedGemmTwoStage>; +using RCR_BF16_I8_BF16 = + ck::test::TestGroupedGemmTwoStage>; + +const std::vector KBATCH{1, 2, 3, 5, 8}; + +INSTANTIATE_TEST_SUITE_P(TestGroupedGemmTwoStage_splitk_MK_KN, + RRR_F16_F16_F16, + testing::ValuesIn(KBATCH)); +INSTANTIATE_TEST_SUITE_P(TestGroupedGemmTwoStage_splitk_MK_NK, + RCR_F16_F16_F16, + testing::ValuesIn(KBATCH)); +INSTANTIATE_TEST_SUITE_P(TestGroupedGemmTwoStage_splitk_MK_KN_BF16, + RRR_BF16_BF16_BF16, + testing::ValuesIn(KBATCH)); +INSTANTIATE_TEST_SUITE_P(TestGroupedGemmTwoStage_splitk_MK_NK_BF16, + RCR_BF16_BF16_BF16, + testing::ValuesIn(KBATCH)); +INSTANTIATE_TEST_SUITE_P(TestGroupedGemmTwoStage_splitk_MK_KN_BF16_INT8, + RRR_BF16_I8_BF16, + testing::ValuesIn(KBATCH)); +INSTANTIATE_TEST_SUITE_P(TestGroupedGemmTwoStage_splitk_MK_NK_BF16_INT8, + RCR_BF16_I8_BF16, + testing::ValuesIn(KBATCH)); +INSTANTIATE_TEST_SUITE_P(TestGroupedGemmTwoStage_splitk_LargeK_MK_KN, + RRR_F16_F16_F16_LargeK, + testing::Values(32, 64)); +INSTANTIATE_TEST_SUITE_P(TestGroupedGemmTwoStage_splitk_LargeK_MK_NK, + RCR_F16_F16_F16_LargeK, + testing::Values(32, 64)); + +#include "test_grouped_gemm_ut_cases.inc" +#include "test_grouped_gemm_two_stage_ut_cases.inc" diff --git a/test/grouped_gemm/test_grouped_gemm_two_stage_ut_cases.inc b/test/grouped_gemm/test_grouped_gemm_two_stage_ut_cases.inc new file mode 100644 index 0000000000..40d48f4ec0 --- /dev/null +++ b/test/grouped_gemm/test_grouped_gemm_two_stage_ut_cases.inc @@ -0,0 +1,61 @@ +#pragma once + +TEST_P(RRR_BF16_BF16_BF16, MNKPadded) +{ + const std::vector Ms{127, 150, 188, 210}; + constexpr int N = 136; + constexpr int K = 280; + + const std::vector Ns(Ms.size(), N); + const std::vector Ks(Ms.size(), K); + const std::vector StrideAs(Ms.size(), K); + const std::vector StrideBs(Ms.size(), N); + const std::vector StrideCs(Ms.size(), N); + + this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); +} + +TEST_P(RCR_BF16_BF16_BF16, MNKPadded) +{ + const std::vector Ms{127, 150, 188, 210}; + constexpr int N = 136; + constexpr int K = 280; + + const std::vector Ns(Ms.size(), N); + const std::vector Ks(Ms.size(), K); + const std::vector StrideAs(Ms.size(), K); + const std::vector StrideBs(Ms.size(), K); + const std::vector StrideCs(Ms.size(), N); + + this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); +} + +TEST_P(RRR_BF16_I8_BF16, MNKPadded) +{ + const std::vector Ms{127, 150, 188, 210}; + constexpr int N = 136; + constexpr int K = 280; + + const std::vector Ns(Ms.size(), N); + const std::vector Ks(Ms.size(), K); + const std::vector StrideAs(Ms.size(), K); + const std::vector StrideBs(Ms.size(), N); + const std::vector StrideCs(Ms.size(), N); + + this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); +} + +TEST_P(RCR_BF16_I8_BF16, MNKPadded) +{ + const std::vector Ms{127, 150, 188, 210}; + constexpr int N = 136; + constexpr int K = 280; + + const std::vector Ns(Ms.size(), N); + const std::vector Ks(Ms.size(), K); + const std::vector StrideAs(Ms.size(), K); + const std::vector StrideBs(Ms.size(), K); + const std::vector StrideCs(Ms.size(), N); + + this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); +} diff --git a/test/grouped_gemm/test_grouped_gemm_util.hpp b/test/grouped_gemm/test_grouped_gemm_util.hpp index 50f423ada3..9e1395b9f8 100644 --- a/test/grouped_gemm/test_grouped_gemm_util.hpp +++ b/test/grouped_gemm/test_grouped_gemm_util.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -22,6 +22,7 @@ #include "ck/utility/tuple.hpp" #include "ck/utility/number.hpp" #include "profiler/profile_grouped_gemm_impl.hpp" +#include "profiler/profile_grouped_gemm_two_stage_impl.hpp" namespace ck { namespace test { @@ -90,6 +91,58 @@ class TestGroupedGemm : public testing::TestWithParam } }; +template +class TestGroupedGemmTwoStage : public testing::TestWithParam +{ + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using ELayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using EDataType = std::tuple_element_t<5, Tuple>; + + public: + static constexpr bool verify_ = true; + static constexpr int init_method_ = 1; // decimal value initialization + static constexpr bool log_ = false; + static constexpr bool bench_ = false; // measure kernel performance + + void SetUp() override {} + + void Run(const std::vector& Ms, + const std::vector& Ns, + const std::vector& Ks, + const std::vector& StrideAs, + const std::vector& StrideBs, + const std::vector& StrideCs, + int kbatch = 1, + int n_warmup = 1, + int n_iter = 10) + { + bool pass = ck::profiler::profile_grouped_gemm_two_stage_impl(verify_, + init_method_, + log_, + bench_, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs, + kbatch, + n_warmup, + n_iter); + EXPECT_TRUE(pass); + } +}; + template