From c03045ce2d685f3b0d407ab8fa945abcf0c61cd4 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 10 Aug 2021 23:45:36 +0000 Subject: [PATCH] rename --- README.md | 10 +- ...volution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp | 72 ++--- ...lution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp | 72 ++--- ...volution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp | 74 ++--- ...volution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp | 50 ++-- ...lution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp | 56 ++-- ...lution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp | 56 ++-- ...lution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp | 56 ++-- ...volution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp | 46 +-- ...ransform.hpp => multi_index_transform.hpp} | 192 ++++++------- ...r.hpp => multi_index_transform_helper.hpp} | 41 ++- .../tensor_description/tensor_adaptor.hpp | 4 +- ...r_descriptor.hpp => tensor_descriptor.hpp} | 86 +++--- ...elper.hpp => tensor_descriptor_helper.hpp} | 43 ++- .../blockwise_gemm_dlops_v2r2.hpp | 48 ++-- .../blockwise_gemm_dlops_v2r3.hpp | 14 +- .../blockwise_gemm_dlops_v3.hpp | 25 +- .../blockwise_gemm_xdlops.hpp | 98 +++---- ...pp => blockwise_tensor_slice_transfer.hpp} | 56 ++-- ...=> blockwise_tensor_slice_transfer_v2.hpp} | 53 ++-- ...pp => gridwise_contraction_dlops_v1r2.hpp} | 81 +++--- ..._v1r2.hpp => gridwise_gemm_dlops_v1r2.hpp} | 160 ++++++----- ..._v1r3.hpp => gridwise_gemm_dlops_v1r3.hpp} | 96 ++++--- ...lops_v2.hpp => gridwise_gemm_dlops_v2.hpp} | 138 +++++---- ...v2r3.hpp => gridwise_gemm_xdlops_v2r3.hpp} | 189 ++++++------- ...et.hpp => threadwise_tensor_slice_set.hpp} | 12 +- ...p => threadwise_tensor_slice_transfer.hpp} | 157 +++++------ ...> threadwise_tensor_slice_transfer_v2.hpp} | 113 ++++---- composable_kernel/include/utility/config.hpp | 4 +- .../include/utility/dynamic_buffer.hpp | 4 +- ...plicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp} | 194 +++++++------ ...licit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp} | 190 ++++++------- ...licit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp} | 190 ++++++------- 
...plicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp} | 59 ++-- ...licit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp} | 15 +- ...cit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp} | 15 +- ...plicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp} | 15 +- ...licit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp} | 17 +- ...icit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp} | 15 +- ...cit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp} | 15 +- ...cit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp} | 15 +- ...cit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp} | 15 +- ...cit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp} | 15 +- ...plicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp} | 14 +- ...plicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp} | 15 +- ....hpp => driver_contraction_dlops_v1r2.hpp} | 46 +-- ...plicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp} | 32 +-- ...gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp} | 32 +-- ...ps_v1r2.hpp => driver_gemm_dlops_v1r2.hpp} | 266 +++++++++--------- ...ps_v1r3.hpp => driver_gemm_dlops_v1r3.hpp} | 257 +++++++++-------- ...s_v2r3.hpp => driver_gemm_xdlops_v2r3.hpp} | 132 ++++----- .../src/conv_bwd_driver_offline.cpp | 66 ++--- .../src/conv_fwd_driver_offline.cpp | 142 +++++----- host/host_tensor/include/conv_common.hpp | 8 +- 54 files changed, 1904 insertions(+), 1982 deletions(-) rename composable_kernel/include/tensor_description/{dynamic_multi_index_transform.hpp => multi_index_transform.hpp} (91%) rename composable_kernel/include/tensor_description/{dynamic_multi_index_transform_helper.hpp => multi_index_transform_helper.hpp} (65%) rename composable_kernel/include/tensor_description/{dynamic_tensor_descriptor.hpp => tensor_descriptor.hpp} (87%) rename composable_kernel/include/tensor_description/{dynamic_tensor_descriptor_helper.hpp => tensor_descriptor_helper.hpp} (74%) rename composable_kernel/include/tensor_operation/{blockwise_dynamic_tensor_slice_transfer.hpp => blockwise_tensor_slice_transfer.hpp} (74%) rename composable_kernel/include/tensor_operation/{blockwise_dynamic_tensor_slice_transfer_v2.hpp => 
blockwise_tensor_slice_transfer_v2.hpp} (75%) rename composable_kernel/include/tensor_operation/{gridwise_dynamic_contraction_dlops_v1r2.hpp => gridwise_contraction_dlops_v1r2.hpp} (91%) rename composable_kernel/include/tensor_operation/{gridwise_dynamic_gemm_dlops_v1r2.hpp => gridwise_gemm_dlops_v1r2.hpp} (82%) rename composable_kernel/include/tensor_operation/{gridwise_dynamic_gemm_dlops_v1r3.hpp => gridwise_gemm_dlops_v1r3.hpp} (89%) rename composable_kernel/include/tensor_operation/{gridwise_dynamic_gemm_dlops_v2.hpp => gridwise_gemm_dlops_v2.hpp} (79%) rename composable_kernel/include/tensor_operation/{gridwise_dynamic_gemm_xdlops_v2r3.hpp => gridwise_gemm_xdlops_v2r3.hpp} (81%) rename composable_kernel/include/tensor_operation/{threadwise_dynamic_tensor_slice_set.hpp => threadwise_tensor_slice_set.hpp} (83%) rename composable_kernel/include/tensor_operation/{threadwise_dynamic_tensor_slice_transfer.hpp => threadwise_tensor_slice_transfer.hpp} (89%) rename composable_kernel/include/tensor_operation/{threadwise_dynamic_tensor_slice_transfer_v2.hpp => threadwise_tensor_slice_transfer_v2.hpp} (86%) rename composable_kernel/src/kernel_wrapper/{dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp => convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp} (67%) rename composable_kernel/src/kernel_wrapper/{dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp => convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp} (64%) rename composable_kernel/src/kernel_wrapper/{dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp => convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp} (64%) rename composable_kernel/src/kernel_wrapper/{dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp => convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp} (93%) rename 
host/driver_offline/include/{device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp => device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp} (96%) rename host/driver_offline/include/{device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp => device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp} (96%) rename host/driver_offline/include/{device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp => device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp} (94%) rename host/driver_offline/include/{device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp => device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp} (94%) rename host/driver_offline/include/{device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp => device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp} (96%) rename host/driver_offline/include/{device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp => device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp} (94%) rename host/driver_offline/include/{device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp => device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp} (94%) rename host/driver_offline/include/{device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp => device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp} (96%) rename host/driver_offline/include/{device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp => device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp} (96%) rename host/driver_offline/include/{device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp => 
device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp} (91%) rename host/driver_offline/include/{device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp => device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp} (95%) rename host/driver_offline/include/{driver_dynamic_contraction_dlops_v1r2.hpp => driver_contraction_dlops_v1r2.hpp} (88%) rename host/driver_offline/include/{driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp => driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp} (92%) rename host/driver_offline/include/{driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp => driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp} (92%) rename host/driver_offline/include/{driver_dynamic_gemm_dlops_v1r2.hpp => driver_gemm_dlops_v1r2.hpp} (56%) rename host/driver_offline/include/{driver_dynamic_gemm_dlops_v1r3.hpp => driver_gemm_dlops_v1r3.hpp} (57%) rename host/driver_offline/include/{driver_dynamic_gemm_xdlops_v2r3.hpp => driver_gemm_xdlops_v2r3.hpp} (50%) diff --git a/README.md b/README.md index 6e6019601a..4f071d5896 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ InLeftPads size 2, {1, 1, } InRightPads size 2, {1, 1, } ConvStrides size 2, {2, 2, } ConvDilations size 2, {1, 1, } -device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw +device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw a_k0_m_k1_grid_desc{216, 256, 8} b_k0_n_k1_grid_desc{216, 165888, 8} c_m_n_grid_desc{ 256, 165888} @@ -100,7 +100,7 @@ InLeftPads size 2, {1, 1, } InRightPads size 2, {1, 1, } ConvStrides size 2, {1, 1, } ConvDilations size 2, {1, 1, } -device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw +device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw a_k0_m_k1_grid_desc{288, 1024, 8} b_k0_n_k1_grid_desc{288, 50176, 8} c_m_n_grid_desc{ 1024, 
50176} @@ -122,7 +122,7 @@ InLeftPads size 2, {1, 1, } InRightPads size 2, {1, 1, } ConvStrides size 2, {2, 2, } ConvDilations size 2, {1, 1, } -device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk +device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk a_k0_m_k1_grid_desc{216, 165888, 8} b_k0_n_k1_grid_desc{216, 256, 8} c_m_n_grid_desc{ 165888, 256} @@ -144,7 +144,7 @@ InLeftPads size 2, {1, 1, } InRightPads size 2, {1, 1, } ConvStrides size 2, {1, 1, } ConvDilations size 2, {1, 1, } -device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk +device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk a_k0_m_k1_grid_desc{288, 50176, 8} b_k0_n_k1_grid_desc{288, 1024, 8} c_m_n_grid_desc{ 50176, 1024} @@ -166,7 +166,7 @@ InLeftPads size 2, {1, 1, } InRightPads size 2, {1, 1, } ConvStrides size 2, {1, 1, } ConvDilations size 2, {1, 1, } -device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk +device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk a_k0_m_k1_grid_desc{288, 50176, 8} b_k0_n_k1_grid_desc{288, 1024, 8} c_m_n_grid_desc{ 50176, 1024} diff --git a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp index 5c582dea46..09ea16fa23 100644 --- a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp +++ b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp @@ -2,8 +2,8 @@ #define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace 
ck { @@ -23,9 +23,9 @@ template __host__ __device__ constexpr auto transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( - const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, - const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, - const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -102,7 +102,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( const auto K0 = K / K1; // weight tensor - const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_tensor_descriptor( wei_k_y_x_c_grid_desc, make_tuple(make_pass_through_transform(K), make_embed_transform(make_tuple(YDot, YTilda), @@ -114,28 +114,28 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = - transform_dynamic_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(K0, K1)), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_freeze_transform(IYTilda), - make_freeze_transform(IXTilda), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<3>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<0, 1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<>{}, - Sequence<>{}, - Sequence<4>{})); + transform_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + 
make_freeze_transform(IYTilda), + make_freeze_transform(IXTilda), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); #if 1 - const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( wei_k0_k1_ydotslice_xdotslice_c_grid_desc, make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), make_pass_through_transform(C), @@ -143,7 +143,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); #else - const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( wei_k0_k1_ydotslice_xdotslice_c_grid_desc, make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), make_pass_through_transform(C), @@ -154,7 +154,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( // output tensor // this add padding check - const auto out_n_hop_wop_k_grid_desc = transform_dynamic_tensor_descriptor( + const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( out_n_ho_wo_k_grid_desc, make_tuple(make_pass_through_transform(N), make_pad_transform(Ho, I0, I0), @@ -163,7 +163,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_dynamic_tensor_descriptor( + const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_tensor_descriptor( out_n_hop_wop_k_grid_desc, 
make_tuple(make_pass_through_transform(N), make_embed_transform(make_tuple(YDot, HTilda), @@ -175,7 +175,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc = - transform_dynamic_tensor_descriptor( + transform_tensor_descriptor( out_n_ydot_htilda_xdot_wtilda_k_grid_desc, make_tuple(make_pass_through_transform(N), make_slice_transform(YDot, I0, YDotSlice), @@ -197,7 +197,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( Sequence<5, 6>{})); #if 1 - const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), @@ -205,7 +205,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); #else - const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), @@ -215,7 +215,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( #endif // input tensor - const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( in_n_hi_wi_c_grid_desc, make_tuple(make_pass_through_transform(N), make_pad_transform(Hi, InLeftPadH, InRightPadH), @@ -224,7 +224,7 @@ 
transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_tensor_descriptor( in_n_hip_wip_c_grid_desc, make_tuple(make_pass_through_transform(N), make_embed_transform(make_tuple(YTilda, HTilda), @@ -235,7 +235,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); - const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_tensor_descriptor( in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc, make_tuple(make_pass_through_transform(N), make_freeze_transform(IYTilda), @@ -256,7 +256,7 @@ transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( Sequence<2>{}, Sequence<3>{})); - const auto in_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( in_n_htildaslice_wtildaslice_c_grid_desc, make_tuple(make_pass_through_transform(C), make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice))), diff --git a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp index 377a1ac29b..9c60e8c3ac 100644 --- a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp +++ b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp @@ 
-2,8 +2,8 @@ #define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -26,9 +26,9 @@ template __host__ __device__ constexpr auto transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( - const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, - const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, - const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -106,7 +106,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( // A: output tensor // this add padding check - const auto out_n_hop_wop_k_grid_desc = transform_dynamic_tensor_descriptor( + const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( out_n_ho_wo_k_grid_desc, make_tuple(make_pass_through_transform(N), make_pad_transform(Ho, I0, I0), @@ -115,7 +115,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_dynamic_tensor_descriptor( + const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_tensor_descriptor( out_n_hop_wop_k_grid_desc, make_tuple(make_pass_through_transform(N), make_embed_transform(make_tuple(YDot, HTilda), @@ -127,7 +127,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto 
out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc = - transform_dynamic_tensor_descriptor( + transform_tensor_descriptor( out_n_ydot_htilda_xdot_wtilda_k_grid_desc, make_tuple(make_pass_through_transform(N), make_slice_transform(YDot, I0, YDotSlice), @@ -149,7 +149,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( Sequence<5, 6>{})); #if 1 - const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), @@ -157,7 +157,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); #else - const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), @@ -167,7 +167,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( #endif // B: weight tensor - const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_tensor_descriptor( wei_k_y_x_c_grid_desc, make_tuple(make_pass_through_transform(K), make_embed_transform(make_tuple(YDot, YTilda), @@ -179,28 +179,28 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = - 
transform_dynamic_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(K0, K1)), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_freeze_transform(IYTilda), - make_freeze_transform(IXTilda), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<3>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<0, 1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<>{}, - Sequence<>{}, - Sequence<4>{})); + transform_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(IYTilda), + make_freeze_transform(IXTilda), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); #if 1 - const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( wei_k0_k1_ydotslice_xdotslice_c_grid_desc, make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), make_pass_through_transform(C), @@ -208,7 +208,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); #else - const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( wei_k0_k1_ydotslice_xdotslice_c_grid_desc, make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), make_pass_through_transform(C), @@ -218,7 +218,7 @@ 
transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( #endif // C: input tensor - const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( in_n_hi_wi_c_grid_desc, make_tuple(make_pass_through_transform(N), make_pad_transform(Hi, InLeftPadH, InRightPadH), @@ -227,7 +227,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_tensor_descriptor( in_n_hip_wip_c_grid_desc, make_tuple(make_pass_through_transform(N), make_embed_transform(make_tuple(YTilda, HTilda), @@ -238,7 +238,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); - const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_tensor_descriptor( in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc, make_tuple(make_pass_through_transform(N), make_freeze_transform(IYTilda), @@ -259,7 +259,7 @@ transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( Sequence<2>{}, Sequence<3>{})); - const auto in_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( in_n_htildaslice_wtildaslice_c_grid_desc, make_tuple(make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), make_pass_through_transform(C)), diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp 
b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp index 4378314108..093a46256d 100644 --- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -2,8 +2,8 @@ #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NCHW_KCYX_NKHW_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -18,9 +18,9 @@ template __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad( - const DynamicTensorDescriptor& wei_k_c_y_x_global_desc, - const DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, - const DynamicTensorDescriptor& out_n_k_ho_wo_global_desc, + const TensorDescriptor& wei_k_c_y_x_global_desc, + const TensorDescriptor& in_n_c_hi_wi_global_desc, + const TensorDescriptor& out_n_k_ho_wo_global_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -57,14 +57,14 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_ const auto InRightPadW = in_right_pads[I1]; // weight tensor - const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); // input tensor - const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor( + const auto in_n_c_hip_wip_global_desc = 
transform_tensor_descriptor( in_n_c_hi_wi_global_desc, make_tuple(make_pass_through_transform(N), make_pass_through_transform(C), @@ -73,7 +73,7 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_ make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor( + const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( in_n_c_hip_wip_global_desc, make_tuple(make_pass_through_transform(N), make_pass_through_transform(C), @@ -83,15 +83,15 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_ make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); const auto in_gemmk_gemmn_global_desc = - transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc, - make_tuple(make_merge_transform(make_tuple(C, Y, X)), - make_merge_transform(make_tuple(N, Ho, Wo))), - make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); + transform_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); // output tensor - const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)), + const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), make_tuple(Sequence<1>{}, Sequence<0, 2>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -109,9 +109,9 @@ template __host__ __device__ constexpr auto 
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_no_pad( - const DynamicTensorDescriptor& wei_k_c_y_x_global_desc, - const DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, - const DynamicTensorDescriptor& out_n_k_ho_wo_global_desc, + const TensorDescriptor& wei_k_c_y_x_global_desc, + const TensorDescriptor& in_n_c_hi_wi_global_desc, + const TensorDescriptor& out_n_k_ho_wo_global_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -147,14 +147,14 @@ transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_no_pad( assert(InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && InRightPadW == 0); // weight tensor - const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); // input tensor - const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor( + const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( in_n_c_hi_wi_global_desc, make_tuple(make_pass_through_transform(N), make_pass_through_transform(C), @@ -164,15 +164,15 @@ transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_no_pad( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); const auto in_gemmk_gemmn_global_desc = - transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc, - make_tuple(make_merge_transform(make_tuple(C, Y, X)), - make_merge_transform(make_tuple(N, Ho, Wo))), - make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); + transform_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc, + 
make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); // output tensor - const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)), + const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), make_tuple(Sequence<1>{}, Sequence<0, 2>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -189,9 +189,9 @@ template __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_1x1( - const DynamicTensorDescriptor& wei_k_c_y_x_global_desc, - const DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, - const DynamicTensorDescriptor& out_n_k_ho_wo_global_desc, + const TensorDescriptor& wei_k_c_y_x_global_desc, + const TensorDescriptor& in_n_c_hi_wi_global_desc, + const TensorDescriptor& out_n_k_ho_wo_global_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -229,22 +229,22 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_ InRightPadW == 0); // weight tensor - const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)), + const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); // input tensor - const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor( + const auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor( 
in_n_c_hi_wi_global_desc, make_tuple(make_pass_through_transform(C), make_merge_transform(make_tuple(N, Ho, Wo))), make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); // output tensor - const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)), + const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), make_tuple(Sequence<1>{}, Sequence<0, 2>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp index 4764f02787..9aa27884da 100644 --- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp @@ -2,8 +2,8 @@ #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NHWC_KYXC_NHWK_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -18,9 +18,9 @@ template __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_pad( - const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, - const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, - const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, const ConvStrides& conv_strides, const ConvDilations& 
conv_dilations, const InLeftPads& in_left_pads, @@ -57,14 +57,14 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_ const auto InRightPadW = in_right_pads[I1]; // weight tensor - const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)), + const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); // input tensor - const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( in_n_hi_wi_c_grid_desc, make_tuple(make_pass_through_transform(N), make_pad_transform(Hi, InLeftPadH, InRightPadH), @@ -73,7 +73,7 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_ make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n_y_ho_x_wo_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( in_n_hip_wip_c_grid_desc, make_tuple(make_pass_through_transform(N), make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), @@ -83,15 +83,15 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_ make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto in_gemmk_gemmn_grid_desc = - transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(Y, X, C)), - make_merge_transform(make_tuple(N, Ho, Wo))), - make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); + 
transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); // output tensor - const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)), + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); @@ -108,9 +108,9 @@ template __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_1x1( - const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, - const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, - const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -148,22 +148,22 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_ InRightPadW == 0); // weight tensor - const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)), + const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); // input tensor - const auto in_gemmk_gemmn_grid_desc = transform_dynamic_tensor_descriptor( - 
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, C)), + const auto in_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, C)), make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(C)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); // output tensor - const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)), + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp index 49ae26518e..16ae8b470d 100644 --- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp @@ -2,8 +2,8 @@ #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -20,9 +20,9 @@ template __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad( - const DynamicTensorDescriptor& wei_k_c_y_x_grid_desc, - const DynamicTensorDescriptor& in_n_c_hi_wi_grid_desc, - const DynamicTensorDescriptor& out_n_k_ho_wo_grid_desc, + const TensorDescriptor& 
wei_k_c_y_x_grid_desc, + const TensorDescriptor& in_n_c_hi_wi_grid_desc, + const TensorDescriptor& out_n_k_ho_wo_grid_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -67,21 +67,21 @@ transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad( const auto GemmK0 = GemmK / GemmK1; // weight tensor - const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); - const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( - wei_gemmk_gemmm_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), - make_pass_through_transform(GemmM)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // input tensor - const auto in_n_c_hip_wip_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor( in_n_c_hi_wi_grid_desc, make_tuple(make_pass_through_transform(N), make_pass_through_transform(C), @@ -90,7 +90,7 @@ transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n_c_y_ho_x_wo_grid_desc = 
transform_dynamic_tensor_descriptor( + const auto in_n_c_y_ho_x_wo_grid_desc = transform_tensor_descriptor( in_n_c_hip_wip_grid_desc, make_tuple(make_pass_through_transform(N), make_pass_through_transform(C), @@ -100,22 +100,22 @@ transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); const auto in_gemmk_gemmn_grid_desc = - transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc, - make_tuple(make_merge_transform(make_tuple(C, Y, X)), - make_merge_transform(make_tuple(N, Ho, Wo))), - make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); + transform_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); - const auto in_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( - in_gemmk_gemmn_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), - make_pass_through_transform(GemmN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + const auto in_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(in_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // output tensor - const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)), + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), make_tuple(Sequence<1>{}, Sequence<0, 
2>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp index 5814e66766..e81c87d046 100644 --- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp @@ -2,8 +2,8 @@ #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -20,9 +20,9 @@ template __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad( - const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, - const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, - const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -67,21 +67,21 @@ transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad( const auto GemmK0 = GemmK / GemmK1; // weight tensor - const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)), + const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, 
Sequence<0>{})); - const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( - wei_gemmk_gemmm_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), - make_pass_through_transform(GemmM)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // input tensor - const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( in_n_hi_wi_c_grid_desc, make_tuple(make_pass_through_transform(N), make_pad_transform(Hi, InLeftPadH, InRightPadH), @@ -90,7 +90,7 @@ transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n_y_ho_x_wo_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( in_n_hip_wip_c_grid_desc, make_tuple(make_pass_through_transform(N), make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), @@ -100,22 +100,22 @@ transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad( make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto in_gemmk_gemmn_grid_desc = - transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(Y, X, C)), - make_merge_transform(make_tuple(N, Ho, Wo))), - make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + 
make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); - const auto in_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( - in_gemmk_gemmn_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), - make_pass_through_transform(GemmN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + const auto in_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(in_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // output tensor - const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)), + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp index ad9d99f4e7..b0b07505e5 100644 --- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp @@ -2,8 +2,8 @@ #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include 
"dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -23,9 +23,9 @@ template __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad( - const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, - const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, - const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -70,7 +70,7 @@ transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad( const auto GemmK0 = GemmK / GemmK1; // A: input tensor - const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( in_n_hi_wi_c_grid_desc, make_tuple(make_pass_through_transform(N), make_pad_transform(Hi, InLeftPadH, InRightPadH), @@ -79,7 +79,7 @@ transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n_y_ho_x_wo_c_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( in_n_hip_wip_c_grid_desc, make_tuple(make_pass_through_transform(N), make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), @@ -89,36 +89,36 @@ transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad( make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto in_gemmk_gemmm_grid_desc = - transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(Y, X, C)), - make_merge_transform(make_tuple(N, Ho, Wo))), - 
make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); - const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( - in_gemmk_gemmm_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), - make_pass_through_transform(GemmM)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + const auto in_gemmk0_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(in_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // B: weight tensor - const auto wei_gemmk_gemmn_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)), + const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); - const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( - wei_gemmk_gemmn_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), - make_pass_through_transform(GemmN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, 
Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // C: output tensor - const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)), + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp index e709f768cb..f5cb7f4877 100644 --- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp @@ -2,8 +2,8 @@ #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -24,9 +24,9 @@ template __host__ __device__ constexpr auto transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( - const DynamicTensorDescriptor& wei_k_c_y_x_grid_desc, - const DynamicTensorDescriptor& in_n_c_hi_wi_grid_desc, - const DynamicTensorDescriptor& out_n_k_ho_wo_grid_desc, + const TensorDescriptor& wei_k_c_y_x_grid_desc, + const TensorDescriptor& in_n_c_hi_wi_grid_desc, + const TensorDescriptor& out_n_k_ho_wo_grid_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -68,15 +68,15 @@ transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( 
const auto C1 = C / C0; // weight tensor - const auto wei_gk0_gm0_gm1_gk1_grid_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), - make_tuple(make_unmerge_transform(make_tuple(I1, K)), - make_unmerge_transform(make_tuple(C0, C1 * Y * X))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<1, 2>{}, Sequence<3, 0>{})); + const auto wei_gk0_gm0_gm1_gk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), + make_tuple(make_unmerge_transform(make_tuple(I1, K)), + make_unmerge_transform(make_tuple(C0, C1 * Y * X))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1, 2>{}, Sequence<3, 0>{})); // input tensor - const auto in_n_c_hip_wip_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor( in_n_c_hi_wi_grid_desc, make_tuple(make_pass_through_transform(N), make_pass_through_transform(C), @@ -85,7 +85,7 @@ transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n0_n1_c0_c1_y_ho_x_wo_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_n0_n1_c0_c1_y_ho_x_wo_grid_desc = transform_tensor_descriptor( in_n_c_hip_wip_grid_desc, make_tuple(make_unmerge_transform(make_tuple(N0, N1)), make_unmerge_transform(make_tuple(C0, C1)), @@ -94,7 +94,7 @@ transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); - const auto in_gk0_gn0_gn1_gk1_grid_desc = transform_dynamic_tensor_descriptor( + const auto in_gk0_gn0_gn1_gk1_grid_desc = transform_tensor_descriptor( in_n0_n1_c0_c1_y_ho_x_wo_grid_desc, 
make_tuple(make_merge_transform(make_tuple(C1, Y, X)), make_pass_through_transform(N0), @@ -105,17 +105,17 @@ transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( // output tensor const auto out_n_k_howo_grid_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)); + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)); - const auto out_n0_n1_1_k_howo_grid_desc = transform_dynamic_tensor_descriptor( - out_n_k_howo_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(N0, N1)), - make_unmerge_transform(make_tuple(I1, K)), - make_pass_through_transform(Ho * Wo)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4>{})); + const auto out_n0_n1_1_k_howo_grid_desc = + transform_tensor_descriptor(out_n_k_howo_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(N0, N1)), + make_unmerge_transform(make_tuple(I1, K)), + make_pass_through_transform(Ho * Wo)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4>{})); - const auto out_gm0_gm1_gn0_gn1_grid_desc = transform_dynamic_tensor_descriptor( + const auto out_gm0_gm1_gn0_gn1_grid_desc = transform_tensor_descriptor( out_n0_n1_1_k_howo_grid_desc, make_tuple(make_pass_through_transform(I1), make_pass_through_transform(K), diff --git a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp similarity index 91% rename from composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp rename to composable_kernel/include/tensor_description/multi_index_transform.hpp index 967517bef7..fa5d2246d7 100644 --- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -1,5 +1,5 @@ -#ifndef CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP 
-#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP +#ifndef CK_MULTI_INDEX_TRANSFORM_HPP +#define CK_MULTI_INDEX_TRANSFORM_HPP #include "common_header.hpp" #include "multi_index.hpp" @@ -7,7 +7,7 @@ namespace ck { template -struct DynamicPassThrough +struct PassThrough { using LowerIndex = MultiIndex<1>; using UpperIndex = MultiIndex<1>; @@ -16,9 +16,9 @@ struct DynamicPassThrough UpLengths up_lengths_; - __host__ __device__ constexpr DynamicPassThrough() = default; + __host__ __device__ constexpr PassThrough() = default; - __host__ __device__ constexpr DynamicPassThrough(const LowLength& low_length) + __host__ __device__ constexpr PassThrough(const LowLength& low_length) : up_lengths_{make_tuple(low_length)} { } @@ -82,33 +82,36 @@ struct DynamicPassThrough __host__ __device__ void Print() const { printf("{"); - printf("DynamicPassThrough, "); + printf("PassThrough, "); printf("up_lengths_"); print_multi_index(up_lengths_); printf("}"); } }; -template -struct DynamicPad +template +struct Pad { using LowerIndex = MultiIndex<1>; using UpperIndex = MultiIndex<1>; - using UpLengths = decltype(make_tuple(LowLength{} + LeftPad{} + RightPad{})); + using UpLengths = decltype(make_tuple(LowLength{} + LeftPadLength{} + RightPadLength{})); UpLengths up_lengths_; - LeftPad left_pad_; - RightPad right_pad_; + LeftPadLength left_pad_length_; + RightPadLength right_pad_length_; - __host__ __device__ constexpr DynamicPad() = default; + __host__ __device__ constexpr Pad() = default; - __host__ __device__ constexpr DynamicPad(const LowLength& low_length, - const LeftPad& left_pad, - const RightPad& right_pad) - : up_lengths_{make_tuple(low_length + left_pad + right_pad)}, - left_pad_{left_pad}, - right_pad_{right_pad} + __host__ __device__ constexpr Pad(const LowLength& low_length, + const LeftPadLength& left_pad_length, + const RightPadLength& right_pad_length) + : up_lengths_{make_tuple(low_length + left_pad_length + right_pad_length)}, + left_pad_length_{left_pad_length}, + 
right_pad_length_{right_pad_length} { } @@ -125,7 +128,7 @@ struct DynamicPad static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, "wrong! inconsistent # of dimension"); - idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_; + idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_length_; } template {}] >= left_pad_) && - (idx_up[Number<0>{}] < up_lengths_[Number<0>{}] - right_pad_)); + return SkipIsValidCheck || + ((idx_up[Number<0>{}] >= left_pad_length_) && + (idx_up[Number<0>{}] < up_lengths_[Number<0>{}] - right_pad_length_)); } __host__ __device__ static constexpr bool IsKnownAtCompileTime() { return is_known_at_compile_time::value && - is_known_at_compile_time::value && - is_known_at_compile_time::value; + is_known_at_compile_time::value && + is_known_at_compile_time::value; } __host__ __device__ void Print() const { printf("{"); - printf("DynamicPad, "); + printf("Pad, "); printf("up_lengths_"); print_multi_index(up_lengths_); - printf("left_pad_ %d", index_t{left_pad_}); - printf("right_pad_ %d", index_t{right_pad_}); + printf("left_pad_length %d", index_t{left_pad_length_}); + printf("right_pad_length %d", index_t{right_pad_length_}); printf("}"); } }; -template -struct DynamicLeftPad +template +struct LeftPad { using LowerIndex = MultiIndex<1>; using UpperIndex = MultiIndex<1>; - using UpLengths = decltype(make_tuple(LowLength{} + LeftPad{})); + using UpLengths = decltype(make_tuple(LowLength{} + LeftPadLength{})); UpLengths up_lengths_; - LeftPad left_pad_; + LeftPadLength left_pad_length_; - __host__ __device__ constexpr DynamicLeftPad() = default; + __host__ __device__ constexpr LeftPad() = default; - __host__ __device__ constexpr DynamicLeftPad(const LowLength& low_length, - const LeftPad& left_pad) - : up_lengths_{make_tuple(low_length + left_pad)}, left_pad_{left_pad} + __host__ __device__ constexpr LeftPad(const LowLength& low_length, + const LeftPadLength& left_pad_length) + : up_lengths_{make_tuple(low_length + left_pad_length)}, 
left_pad_length_{left_pad_length} { } @@ -216,7 +220,7 @@ struct DynamicLeftPad static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, "wrong! inconsistent # of dimension"); - idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_; + idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_length_; } template {}] >= left_pad_); + return SkipIsValidCheck || (idx_up[Number<0>{}] >= left_pad_length_); } __host__ __device__ static constexpr bool IsKnownAtCompileTime() { return is_known_at_compile_time::value && - is_known_at_compile_time::value; + is_known_at_compile_time::value; } __host__ __device__ void Print() const { printf("{"); - printf("DynamicLeftPad, "); + printf("LeftPad, "); printf("up_lengths_"); print_multi_index(up_lengths_); - printf("left_pad_ %d", index_t{left_pad_}); + printf("left_pad_length_ %d", index_t{left_pad_length_}); printf("}"); } }; -template -struct DynamicRightPad +template +struct RightPad { using LowerIndex = MultiIndex<1>; using UpperIndex = MultiIndex<1>; - using UpLengths = decltype(make_tuple(LowLength{} + RightPad{})); + using UpLengths = decltype(make_tuple(LowLength{} + RightPadLength{})); UpLengths up_lengths_; LowLength low_length_; - RightPad right_pad_; + RightPadLength right_pad_length_; - __host__ __device__ constexpr DynamicRightPad() = default; + __host__ __device__ constexpr RightPad() = default; - __host__ __device__ constexpr DynamicRightPad(const LowLength& low_length, - const RightPad& right_pad) - : up_lengths_{make_tuple(low_length + right_pad)}, + __host__ __device__ constexpr RightPad(const LowLength& low_length, + const RightPadLength& right_pad_length) + : up_lengths_{make_tuple(low_length + right_pad_length)}, low_length_{low_length}, - right_pad_{right_pad} + right_pad_length_{right_pad_length} { } @@ -350,17 +354,17 @@ struct DynamicRightPad { return is_known_at_compile_time::value && is_known_at_compile_time::value && - is_known_at_compile_time::value; + is_known_at_compile_time::value; } __host__ 
__device__ void Print() const { printf("{"); - printf("DynamicRightPad, "); + printf("RightPad, "); printf("up_lengths_"); print_multi_index(up_lengths_); printf("low_length_ %d", index_t{low_length_}); - printf("left_pad_ %d", index_t{right_pad_}); + printf("right_pad_length_ %d", index_t{right_pad_length_}); printf("}"); } }; @@ -374,7 +378,7 @@ struct DynamicRightPad template ::type = false> -struct DynamicEmbed +struct Embed { static constexpr index_t NDimUp = UpLengths::Size(); @@ -384,10 +388,10 @@ struct DynamicEmbed UpLengths up_lengths_; Coefficients coefficients_; - __host__ __device__ constexpr DynamicEmbed() = default; + __host__ __device__ constexpr Embed() = default; - __host__ __device__ constexpr DynamicEmbed(const UpLengths& up_lengths, - const Coefficients& coefficients) + __host__ __device__ constexpr Embed(const UpLengths& up_lengths, + const Coefficients& coefficients) : up_lengths_{up_lengths}, coefficients_{coefficients} { } @@ -458,7 +462,7 @@ struct DynamicEmbed __host__ __device__ void Print() const { printf("{"); - printf("DynamicEmbed, "); + printf("Embed, "); printf("up_lengths_ "); print_multi_index(up_lengths_); printf("coefficients_ "); @@ -470,7 +474,7 @@ struct DynamicEmbed // Implementation of "Merge" transformation primitive that uses regular to do lowering of // multi-index and use carry-and-borrow check to do lowering of multi-index delta template -struct DynamicMerge_v1_carry_check +struct Merge_v1_carry_check { static constexpr index_t NDimLow = LowLengths::Size(); @@ -487,9 +491,9 @@ struct DynamicMerge_v1_carry_check LowLengthsScan low_lengths_scan_; UpLengths up_lengths_; - __host__ __device__ constexpr DynamicMerge_v1_carry_check() = default; + __host__ __device__ constexpr Merge_v1_carry_check() = default; - __host__ __device__ constexpr DynamicMerge_v1_carry_check(const LowLengths& low_lengths) + __host__ __device__ constexpr Merge_v1_carry_check(const LowLengths& low_lengths) : low_lengths_{low_lengths},
low_lengths_scan_{ container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})}, @@ -555,7 +559,7 @@ struct DynamicMerge_v1_carry_check LowerIndex idx_low_length_minus_idx_diff_low_const; LowerIndex idx_low_length_plus_idx_diff_low_const; -#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE +#if !CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE index_t tmp = idx_diff_up[Number<0>{}]; static_for<0, NDimLow - 1, 1>{}([&](auto i) { @@ -698,7 +702,7 @@ struct DynamicMerge_v1_carry_check LowerIndex idx_low_length_minus_idx_diff_low_const; LowerIndex idx_low_length_plus_idx_diff_low_const; -#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE +#if !CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE index_t tmp = idx_diff_up[Number<0>{}]; static_for<0, NDimLow - 1, 1>{}([&](auto i) { @@ -838,7 +842,7 @@ struct DynamicMerge_v1_carry_check // very expensive. LowerIndex idx_diff_low_const; -#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE +#if !CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE index_t tmp = idx_diff_up[Number<0>{}]; static_for<0, NDimLow - 1, 1>{}([&](auto i) { @@ -981,7 +985,7 @@ struct DynamicMerge_v1_carry_check __host__ __device__ void Print() const { printf("{"); - printf("DynamicMerge_v1_carry_check, "); + printf("Merge_v1_carry_check, "); printf("low_lengths_ "); print_multi_index(low_lengths_); printf("low_lengths_scan_ "); @@ -1025,7 +1029,7 @@ struct lambda_merge_generate_MagicDivision_calculate_magic_shift // 5. When upper-index is int32_t type (when index_t is int32_t), its value need to be // non-negative. 
template -struct DynamicMerge_v2_magic_division +struct Merge_v2_magic_division { static constexpr index_t NDimLow = LowLengths::Size(); @@ -1048,9 +1052,9 @@ struct DynamicMerge_v2_magic_division LowLengthsMagicDivisorShift low_lengths_magic_divisor_shift_; UpLengths up_lengths_; - __host__ __device__ constexpr DynamicMerge_v2_magic_division() = default; + __host__ __device__ constexpr Merge_v2_magic_division() = default; - __host__ __device__ constexpr DynamicMerge_v2_magic_division(const LowLengths& low_lengths) + __host__ __device__ constexpr Merge_v2_magic_division(const LowLengths& low_lengths) : low_lengths_{low_lengths}, low_lengths_magic_divisor_multiplier_{generate_tuple( [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths[i]); }, @@ -1151,7 +1155,7 @@ struct DynamicMerge_v2_magic_division __host__ __device__ void Print() const { printf("{"); - printf("DynamicMerge_v2_magic_division, "); + printf("Merge_v2_magic_division, "); printf("low_lengths_ "); print_multi_index(low_lengths_); printf("low_lengths_magic_divisor_multiplier_ "); @@ -1177,7 +1181,7 @@ struct DynamicMerge_v2_magic_division // 5. When upper-index is int32_t type (when index_t is int32_t), its value need to be // non-negative. 
template -struct DynamicMerge_v2r2_magic_division +struct Merge_v2r2_magic_division { static constexpr index_t NDimLow = LowLengths::Size(); @@ -1204,9 +1208,9 @@ struct DynamicMerge_v2r2_magic_division LowLengthsScanMagicDivisorShift low_lengths_scan_magic_divisor_shift_; UpLengths up_lengths_; - __host__ __device__ constexpr DynamicMerge_v2r2_magic_division() = default; + __host__ __device__ constexpr Merge_v2r2_magic_division() = default; - __host__ __device__ constexpr DynamicMerge_v2r2_magic_division(const LowLengths& low_lengths) + __host__ __device__ constexpr Merge_v2r2_magic_division(const LowLengths& low_lengths) : low_lengths_{low_lengths}, low_lengths_scan_{ container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})}, @@ -1308,7 +1312,7 @@ struct DynamicMerge_v2r2_magic_division __host__ __device__ void Print() const { printf("{"); - printf("DynamicMerge_v2r2_magic_division, "); + printf("Merge_v2r2_magic_division, "); printf("low_lengths_ "); print_multi_index(low_lengths_); printf("low_lengths_scan "); @@ -1324,7 +1328,7 @@ struct DynamicMerge_v2r2_magic_division }; template -struct DynamicUnMerge +struct UnMerge { static constexpr index_t NDimUp = UpLengths::Size(); @@ -1337,9 +1341,9 @@ struct DynamicUnMerge UpLengths up_lengths_; UpLengthsScan up_lengths_scan_; - __host__ __device__ constexpr DynamicUnMerge() = default; + __host__ __device__ constexpr UnMerge() = default; - __host__ __device__ constexpr DynamicUnMerge(const UpLengths& up_lengths) + __host__ __device__ constexpr UnMerge(const UpLengths& up_lengths) : up_lengths_{up_lengths}, up_lengths_scan_{ container_reverse_exclusive_scan(up_lengths, math::multiplies_v2{}, Number<1>{})} @@ -1414,7 +1418,7 @@ struct DynamicUnMerge __host__ __device__ void Print() const { printf("{"); - printf("DynamicUnMerge, "); + printf("UnMerge, "); printf("up_lengths_"); print_multi_index(up_lengths_); printf("up_lengths_scan_"); @@ -1424,13 +1428,13 @@ struct DynamicUnMerge }; template 
-struct DynamicFreeze +struct Freeze { LowerIndex low_idx_; - __host__ __device__ constexpr DynamicFreeze() = default; + __host__ __device__ constexpr Freeze() = default; - __host__ __device__ constexpr DynamicFreeze(const LowerIndex& low_idx) : low_idx_{low_idx} {} + __host__ __device__ constexpr Freeze(const LowerIndex& low_idx) : low_idx_{low_idx} {} __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } @@ -1483,22 +1487,22 @@ struct DynamicFreeze __host__ __device__ void Print() const { - printf("DynamicFreeze"); + printf("Freeze"); printf("low_idx_ %d", index_t{low_idx_}); } }; // Insert a dangling upper dimension without lower dimension template -struct DynamicInsert +struct Insert { using UpLengths = decltype(make_tuple(UpperLength{})); UpLengths up_lengths_; - __host__ __device__ constexpr DynamicInsert() = default; + __host__ __device__ constexpr Insert() = default; - __host__ __device__ constexpr DynamicInsert(const UpperLength& up_length) + __host__ __device__ constexpr Insert(const UpperLength& up_length) : up_lengths_{make_tuple(up_length)} { } @@ -1550,13 +1554,13 @@ struct DynamicInsert __host__ __device__ void Print() const { - printf("DynamicInsert"); + printf("Insert"); print_multi_index(up_lengths_); } }; template -struct DynamicVectorize +struct Vectorize { using LowerIndex = MultiIndex<1>; using UpperIndex = MultiIndex<1>; @@ -1566,10 +1570,10 @@ struct DynamicVectorize UpLengths up_lengths_; VectorSize vector_size_; - __host__ __device__ constexpr DynamicVectorize() = default; + __host__ __device__ constexpr Vectorize() = default; - __host__ __device__ constexpr DynamicVectorize(const VectorSize& vector_size, - const UpLength& up_length) + __host__ __device__ constexpr Vectorize(const VectorSize& vector_size, + const UpLength& up_length) : vector_size_{vector_size}, up_lengths_{make_tuple(up_length)} { } @@ -1633,7 +1637,7 @@ struct DynamicVectorize __host__ __device__ void Print() const { printf("{"); - 
printf("DynamicVectorize, "); + printf("Vectorize, "); printf("up_lengths_"); print_multi_index(up_lengths_); printf("}"); @@ -1641,7 +1645,7 @@ struct DynamicVectorize }; template -struct DynamicSlice +struct Slice { using LowerIndex = MultiIndex<1>; using UpperIndex = MultiIndex<1>; @@ -1652,11 +1656,11 @@ struct DynamicSlice SliceBegin slice_begin_; SliceEnd slice_end_; - __host__ __device__ constexpr DynamicSlice() = default; + __host__ __device__ constexpr Slice() = default; - __host__ __device__ constexpr DynamicSlice(const LowLength&, - const SliceBegin& slice_begin, - const SliceEnd& slice_end) + __host__ __device__ constexpr Slice(const LowLength&, + const SliceBegin& slice_begin, + const SliceEnd& slice_end) : up_lengths_{make_tuple(slice_end - slice_begin)}, slice_begin_{slice_begin}, slice_end_{slice_end} @@ -1724,7 +1728,7 @@ struct DynamicSlice __host__ __device__ void Print() const { printf("{"); - printf("DynamicSlice, "); + printf("Slice, "); printf("up_lengths_"); print_multi_index(up_lengths_); printf("slice_begin_ %d", index_t{slice_begin_}); diff --git a/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp b/composable_kernel/include/tensor_description/multi_index_transform_helper.hpp similarity index 65% rename from composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp rename to composable_kernel/include/tensor_description/multi_index_transform_helper.hpp index b3e1c60485..abb48c450b 100644 --- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform_helper.hpp @@ -1,15 +1,15 @@ -#ifndef CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HELPER_HPP -#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HELPER_HPP +#ifndef CK_MULTI_INDEX_TRANSFORM_HELPER_HPP +#define CK_MULTI_INDEX_TRANSFORM_HELPER_HPP #include "common_header.hpp" -#include "dynamic_multi_index_transform.hpp" +#include 
"multi_index_transform.hpp" namespace ck { template __host__ __device__ constexpr auto make_pass_through_transform(const LowLength& low_length) { - return DynamicPassThrough{low_length}; + return PassThrough{low_length}; } template @@ -19,26 +19,25 @@ make_pad_transform(const LowLength& low_length, const RightPad& right_pad, integral_constant = integral_constant{}) { - return DynamicPad{ - low_length, left_pad, right_pad}; + return Pad{low_length, left_pad, right_pad}; } -template +template __host__ __device__ constexpr auto make_left_pad_transform( const LowLength& low_length, - const LeftPad& left_pad, + const LeftPadLength& left_pad, integral_constant = integral_constant{}) { - return DynamicLeftPad{low_length, left_pad}; + return LeftPad{low_length, left_pad}; } -template +template __host__ __device__ constexpr auto make_right_pad_transform( const LowLength& low_length, - const RightPad& right_pad, + const RightPadLength& right_pad, integral_constant = integral_constant{}) { - return DynamicRightPad{low_length, right_pad}; + return RightPad{low_length, right_pad}; } template {up_lengths, coefficients}; + return Embed{up_lengths, coefficients}; } template __host__ __device__ constexpr auto make_merge_transform(const LowLengths& low_lengths) { #if !CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION - return DynamicMerge_v1_carry_check{low_lengths}; + return Merge_v1_carry_check{low_lengths}; #else #if 1 - return DynamicMerge_v2_magic_division{low_lengths}; + return Merge_v2_magic_division{low_lengths}; #else - return DynamicMerge_v2r2_magic_division{low_lengths}; + return Merge_v2r2_magic_division{low_lengths}; #endif #endif } @@ -68,7 +67,7 @@ template __host__ __device__ constexpr auto make_merge_transform_v2_magic_division(const LowLengths& low_lengths) { - return DynamicMerge_v2_magic_division{low_lengths}; + return Merge_v2_magic_division{low_lengths}; } template @@ -76,13 +75,13 @@ __host__ __device__ constexpr auto make_unmerge_transform( const UpLengths& 
up_lengths, integral_constant = integral_constant{}) { - return DynamicUnMerge{up_lengths}; + return UnMerge{up_lengths}; } template __host__ __device__ constexpr auto make_freeze_transform(const LowerIndex& low_idx) { - return DynamicFreeze{low_idx}; + return Freeze{low_idx}; } template @@ -90,14 +89,14 @@ __host__ __device__ constexpr auto make_slice_transform(const LowLength& low_len const SliceBegin& slice_begin, const SliceEnd& slice_end) { - return DynamicSlice{low_length, slice_begin, slice_end}; + return Slice{low_length, slice_begin, slice_end}; } template __host__ __device__ constexpr auto make_vectorize_transform(const VectorSize& vector_size, const UpLength& up_length) { - return DynamicVectorize{vector_size, up_length}; + return Vectorize{vector_size, up_length}; } } // namespace ck diff --git a/composable_kernel/include/tensor_description/tensor_adaptor.hpp b/composable_kernel/include/tensor_description/tensor_adaptor.hpp index 6affe6141f..2508abc6b9 100644 --- a/composable_kernel/include/tensor_description/tensor_adaptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_adaptor.hpp @@ -2,8 +2,8 @@ #define CK_TENSOR_ADAPTOR_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { diff --git a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp similarity index 87% rename from composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp rename to composable_kernel/include/tensor_description/tensor_descriptor.hpp index b9ca26c879..9821ee8641 100644 --- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -1,16 +1,16 @@ -#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP -#define 
CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP +#ifndef CK_TENSOR_DESCRIPTOR_HPP +#define CK_TENSOR_DESCRIPTOR_HPP #include "common_header.hpp" -#include "dynamic_multi_index_transform.hpp" +#include "multi_index_transform.hpp" namespace ck { template -struct DynamicTensorCoordinate; +struct TensorCoordinate; template -struct DynamicTensorCoordinateIterator; +struct TensorCoordinateIterator; // Transforms: Tuple // LowerDimensionIdss : Tuple, ...> @@ -21,7 +21,7 @@ template -struct DynamicTensorDescriptor +struct TensorDescriptor { // TODO make these private __host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); } @@ -105,16 +105,16 @@ struct DynamicTensorDescriptor using VisibleIndex = MultiIndex; using HiddenIndex = MultiIndex; - using Coordinate = DynamicTensorCoordinate; + using Coordinate = TensorCoordinate; // may be index_t or Number<> using ElementSize = remove_cv_t; public: - __host__ __device__ constexpr DynamicTensorDescriptor() = default; + __host__ __device__ constexpr TensorDescriptor() = default; - __host__ __device__ constexpr DynamicTensorDescriptor(const Transforms& transforms, - ElementSpaceSize element_space_size) + __host__ __device__ constexpr TensorDescriptor(const Transforms& transforms, + ElementSpaceSize element_space_size) : transforms_{transforms}, element_size_{InitializeElementSize(transforms)}, element_space_size_{element_space_size} @@ -159,7 +159,7 @@ struct DynamicTensorDescriptor { static_assert(Idx::Size() == GetNumOfDimension(), "wrong! 
inconsistent # of dimension"); - return make_dynamic_tensor_coordinate(*this, idx).GetOffset(); + return make_tensor_coordinate(*this, idx).GetOffset(); } // TODO make these private @@ -196,7 +196,7 @@ struct DynamicTensorDescriptor __host__ __device__ void Print() const { printf("{"); - printf("DynamicTensorDescriptor, "); + printf("TensorDescriptor, "); static_for<0, ntransform_, 1>{}([&](auto i) { printf("transforms: "); transforms_[i].Print(); @@ -217,7 +217,7 @@ struct DynamicTensorDescriptor }; template -struct DynamicTensorCoordinate +struct TensorCoordinate { // TODO make these private static constexpr index_t ndim_visible_ = VisibleDimensionIds::Size(); @@ -226,9 +226,9 @@ struct DynamicTensorCoordinate using VisibleIndex = MultiIndex; public: - __host__ __device__ constexpr DynamicTensorCoordinate() = default; + __host__ __device__ constexpr TensorCoordinate() = default; - __host__ __device__ constexpr DynamicTensorCoordinate(const HiddenIndex& idx_hidden) + __host__ __device__ constexpr TensorCoordinate(const HiddenIndex& idx_hidden) : idx_hidden_{idx_hidden} { } @@ -252,16 +252,17 @@ struct DynamicTensorCoordinate }; template -struct DynamicTensorCoordinateIterator +struct TensorCoordinateIterator { // TODO make these private using VisibleIndex = MultiIndex; public: - __host__ __device__ constexpr DynamicTensorCoordinateIterator() = default; + __host__ __device__ constexpr TensorCoordinateIterator() = default; - __host__ __device__ constexpr DynamicTensorCoordinateIterator( - const VisibleIndex& idx_diff_visible, const MultiIndex& do_transforms) + __host__ + __device__ constexpr TensorCoordinateIterator(const VisibleIndex& idx_diff_visible, + const MultiIndex& do_transforms) : idx_diff_visible_{idx_diff_visible}, do_transforms_{do_transforms} { } @@ -283,7 +284,7 @@ struct DynamicTensorCoordinateIterator // TODO: How to fix this? 
It uses an struct instead of lambda because lambda // doesn't have constructor, and to put it outside the scope where it is used -// (transform_dynamic_tensor_descriptor) because template cannot be defined inside a function +// (transform_tensor_descriptor) because template cannot be defined inside a function // template template struct lambda_get_up_dim_num @@ -301,10 +302,10 @@ template __host__ __device__ constexpr auto -transform_dynamic_tensor_descriptor(const OldTensorDescriptor& old_tensor_desc, - const NewTransforms& new_transforms, - NewLowerDimensionOldVisibleIdss, - NewUpperDimensionNewVisibleIdss) +transform_tensor_descriptor(const OldTensorDescriptor& old_tensor_desc, + const NewTransforms& new_transforms, + NewLowerDimensionOldVisibleIdss, + NewUpperDimensionNewVisibleIdss) { // sanity check { @@ -376,17 +377,17 @@ transform_dynamic_tensor_descriptor(const OldTensorDescriptor& old_tensor_desc, const auto element_space_size = old_tensor_desc.GetElementSpaceSize(); - return DynamicTensorDescriptor, - remove_cv_t, - remove_cv_t, - remove_cv_t, - remove_cv_t>{all_transforms, - element_space_size}; + return TensorDescriptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{all_transforms, + element_space_size}; } template -__host__ __device__ constexpr auto make_dynamic_tensor_coordinate(const TensorDesc& tensor_desc, - const VisibleIndex& idx_visible) +__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc& tensor_desc, + const VisibleIndex& idx_visible) { static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(), "wrong! 
# of dimension inconsistent"); @@ -416,13 +417,13 @@ __host__ __device__ constexpr auto make_dynamic_tensor_coordinate(const TensorDe set_container_subset(idx_hidden, dims_low, idx_low); }); - return DynamicTensorCoordinate{idx_hidden}; + return TensorCoordinate{idx_hidden}; } // UpdateLowerIndexHack: Sequence<...> // HACK: control UpdateLowerIndex template -__host__ __device__ constexpr auto make_dynamic_tensor_coordinate_iterator( +__host__ __device__ constexpr auto make_tensor_coordinate_iterator( const TensorDesc&, const VisibleIndex& idx_diff_visible, UpdateLowerIndexHack) { static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(), @@ -470,23 +471,24 @@ __host__ __device__ constexpr auto make_dynamic_tensor_coordinate_iterator( set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low); }); - return DynamicTensorCoordinateIterator{ + return TensorCoordinateIterator{ idx_diff_visible, do_transforms}; } template __host__ __device__ constexpr auto -make_dynamic_tensor_coordinate_iterator(const TensorDesc&, const VisibleIndex& idx_diff_visible) +make_tensor_coordinate_iterator(const TensorDesc&, const VisibleIndex& idx_diff_visible) { constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); - return make_dynamic_tensor_coordinate_iterator( + return make_tensor_coordinate_iterator( TensorDesc{}, idx_diff_visible, typename uniform_sequence_gen::type{}); } template -__host__ __device__ constexpr void move_dynamic_tensor_coordinate( - const TensorDesc& tensor_desc, TensorCoord& coord, const TensorCoordIterator& coord_iterator) +__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc& tensor_desc, + TensorCoord& coord, + const TensorCoordIterator& coord_iterator) { constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension(); constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); @@ -524,7 +526,7 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate( MultiIndex idx_diff_low; - // 
HACK: control UpdateLowerIndex for DynamicMerge using hack + // HACK: control UpdateLowerIndex for Merge using hack constexpr index_t Hack = decltype(coord_iterator.update_lower_index_hack_)::At(itran); tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number{}); @@ -585,11 +587,11 @@ __host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& } template -using DynamicTensorCoordinate_t = decltype(make_dynamic_tensor_coordinate( +using TensorCoordinate_t = decltype(make_tensor_coordinate( TensorDesc{}, MultiIndex>::GetNumOfDimension()>{})); template -using DynamicTensorCoordinateIterator_t = decltype(make_dynamic_tensor_coordinate_iterator( +using TensorCoordinateIterator_t = decltype(make_tensor_coordinate_iterator( TensorDesc{}, MultiIndex>::GetNumOfDimension()>{})); } // namespace ck diff --git a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp similarity index 74% rename from composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp rename to composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 2e36451a66..93f9dac64f 100644 --- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -1,9 +1,9 @@ -#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP -#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP +#ifndef CK_TENSOR_DESCRIPTOR_HELPER_HPP +#define CK_TENSOR_DESCRIPTOR_HELPER_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "multi_index_transform_helper.hpp" namespace ck { @@ -38,9 +38,8 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt template ::type = false> -__host__ __device__ constexpr auto 
-make_dynamic_naive_tensor_descriptor_v2(const Tuple& lengths, - const Tuple& strides) +__host__ __device__ constexpr auto make_naive_tensor_descriptor_v2(const Tuple& lengths, + const Tuple& strides) { constexpr index_t N = sizeof...(Lengths); @@ -75,12 +74,12 @@ make_dynamic_naive_tensor_descriptor_v2(const Tuple& lengths, calculate_element_space_size_impl(lengths, strides, Number<0>{}, Number<1>{}); #endif - return DynamicTensorDescriptor, - remove_cv_t, - remove_cv_t, - remove_cv_t, - remove_cv_t>{transforms, - element_space_size}; + return TensorDescriptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{transforms, + element_space_size}; } // Lengths... can be: @@ -88,7 +87,7 @@ make_dynamic_naive_tensor_descriptor_v2(const Tuple& lengths, // 2) Number<>, which is known at compile-time template __host__ __device__ constexpr auto -make_dynamic_naive_tensor_descriptor_packed_v2(const Tuple& lengths) +make_naive_tensor_descriptor_packed(const Tuple& lengths) { constexpr index_t N = sizeof...(Lengths); @@ -103,17 +102,17 @@ make_dynamic_naive_tensor_descriptor_packed_v2(const Tuple& lengths) const auto element_space_size = container_reduce(lengths, math::multiplies_v2{}, Number<1>{}); - return DynamicTensorDescriptor, - remove_cv_t, - remove_cv_t, - remove_cv_t, - remove_cv_t>{transforms, - element_space_size}; + return TensorDescriptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{transforms, + element_space_size}; } template __host__ __device__ constexpr auto -make_dynamic_naive_tensor_descriptor_aligned_v2(const Tuple& lengths, Align align) +make_naive_tensor_descriptor_aligned_v2(const Tuple& lengths, Align align) { constexpr auto I1 = Number<1>{}; @@ -143,7 +142,7 @@ make_dynamic_naive_tensor_descriptor_aligned_v2(const Tuple& lengths }, Number{}); - return make_dynamic_naive_tensor_descriptor_v2(lengths, strides); + return make_naive_tensor_descriptor_v2(lengths, strides); } } // namespace ck diff --git 
a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp index f021a7b9b4..796e6387da 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp @@ -3,7 +3,7 @@ #include "common_header.hpp" #include "tensor_adaptor.hpp" -#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" #include "threadwise_contraction_dlops.hpp" namespace ck { @@ -73,7 +73,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 __host__ __device__ static constexpr auto MakeAKM0M1BlockDescriptor(const AKMBlockDesc& /* a_k_m_block_desc */) { - const auto a_k_m0_m1_block_desc = transform_dynamic_tensor_descriptor( + const auto a_k_m0_m1_block_desc = transform_tensor_descriptor( AKMBlockDesc{}, make_tuple(make_pass_through_transform(Number{}), make_unmerge_transform(make_tuple(Number{}, Number{}))), @@ -86,7 +86,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 __host__ __device__ static constexpr auto MakeBKN0N1BlockDescriptor(const BKNBlockDesc& /* b_k_n_block_desc */) { - const auto b_k_n0_n1_block_desc = transform_dynamic_tensor_descriptor( + const auto b_k_n0_n1_block_desc = transform_tensor_descriptor( BKNBlockDesc{}, make_tuple(make_pass_through_transform(Number{}), make_unmerge_transform(make_tuple(Number{}, Number{}))), @@ -357,34 +357,32 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 private: // A[K, M0, M1] - static constexpr auto a_k_m0_m1_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + static constexpr auto a_k_m0_m1_thread_desc_ = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{}, Number{})); // B[K, N0, N1] - static constexpr auto b_k_n0_n1_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + static constexpr auto b_k_n0_n1_thread_desc_ = 
make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{}, Number{})); - using AThreadCopy = - ThreadwiseDynamicTensorSliceTransfer_v4, - Sequence<0, 1, 2>, - 2, - AThreadCopyScalarPerVector_M11, - 1>; + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2>, + 2, + AThreadCopyScalarPerVector_M11, + 1>; - using BThreadCopy = - ThreadwiseDynamicTensorSliceTransfer_v4, - Sequence<0, 1, 2>, - 2, - BThreadCopyScalarPerVector_N11, - 1>; + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2>, + 2, + BThreadCopyScalarPerVector_N11, + 1>; CIndex c_thread_origin_data_idx_; diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp index 6a3885936e..ace940d4f3 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp @@ -3,7 +3,7 @@ #include "common_header.hpp" #include "tensor_adaptor.hpp" -#include "threadwise_dynamic_tensor_slice_transfer_v2.hpp" +#include "threadwise_tensor_slice_transfer_v2.hpp" #include "threadwise_contraction_dlops.hpp" namespace ck { @@ -75,7 +75,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B __host__ __device__ static constexpr auto MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1) { - const auto a_block_bk0_bm0_bm1_bk1 = transform_dynamic_tensor_descriptor( + const auto a_block_bk0_bm0_bm1_bk1 = transform_tensor_descriptor( a_block_desc_bk0_bm_bk1, make_tuple(make_pass_through_transform(Number{}), make_unmerge_transform(make_tuple(Number{}, Number{})), @@ -89,7 +89,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B __host__ __device__ static constexpr auto MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1) { - const auto 
b_block_desc_bk0_bn0_bn1_bk1 = transform_dynamic_tensor_descriptor( + const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_tensor_descriptor( b_block_desc_bk0_bn_bk1, make_tuple(make_pass_through_transform(Number{}), make_unmerge_transform(make_tuple(Number{}, Number{})), @@ -372,15 +372,15 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B private: // A[BK0, BM0, BM1, BK1] static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + make_naive_tensor_descriptor_packed(make_tuple( Number{}, Number{}, Number{}, Number{})); // B[BK0, BN0, BN1, BK1] static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + make_naive_tensor_descriptor_packed(make_tuple( Number{}, Number{}, Number{}, Number{})); - using AThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4r1< + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4r1< FloatA, FloatA, decltype(a_block_desc_bk0_bm0_bm1_bk1_), @@ -390,7 +390,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B Sequence<1, 1, BM1PerThreadBM11, BK1>, // SrcVectorTensorLengths Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder - using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4r1< + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4r1< FloatB, FloatB, decltype(b_block_desc_bk0_bn0_bn1_bk1_), diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp index b656b4595a..a15be541b5 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp @@ -31,25 +31,24 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 // HACK: fix this @Jing Zhang static constexpr index_t KPerThreadSubC = 4; - static constexpr auto a_thread_mtx_ = 
make_dynamic_naive_tensor_descriptor_packed_v2( + static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{})); - static constexpr auto b_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + static constexpr auto b_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple( Number{}, Number<1>{}, Number{}, Number{})); - static constexpr auto c_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + static constexpr auto c_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple( Number{}, Number<1>{}, Number{}, Number{})); - using AThreadCopy = - ThreadwiseDynamicTensorSliceTransfer_v4, - Sequence<0, 1>, - 1, - ThreadGemmADataPerRead_K, - 1>; + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1>, + 1, + ThreadGemmADataPerRead_K, + 1>; __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3() : c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())}, diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp index 715fbc0b41..ee6a0b7427 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp @@ -2,7 +2,7 @@ #define CK_BLOCKWISE_GEMM_XDLOPS_HPP #include "common_header.hpp" -#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" #include "xdlops_gemm.hpp" namespace ck { @@ -191,35 +191,35 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1 private: // A[K, M] - static constexpr auto a_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( - make_tuple(I1, Number{}, I1, Number{})); + static constexpr auto a_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, Number{}, I1, Number{})); // B[K, N] - static constexpr auto b_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( - 
make_tuple(I1, Number{}, I1, Number{})); + static constexpr auto b_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, Number{}, I1, Number{})); - static constexpr auto c_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( - make_tuple(Number{}, Number{})); + static constexpr auto c_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{})); - using AThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3>, - 3, - K1, - 1>; + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + K1, + 1>; - using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3>, - 3, - K1, - 1>; + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + K1, + 1>; AThreadCopy a_thread_copy_; BThreadCopy b_thread_copy_; @@ -486,35 +486,35 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline private: // A[K, M] - static constexpr auto a_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( - make_tuple(I1, Number{}, I1, Number{})); + static constexpr auto a_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, Number{}, I1, Number{})); // B[K, N] - static constexpr auto b_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( - make_tuple(I1, Number{}, I1, Number{})); + static constexpr auto b_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, Number{}, I1, Number{})); - static constexpr auto c_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( - make_tuple(Number{}, Number{})); + static constexpr auto c_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{})); - using AThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3>, - 3, - 1, // K1, - 1>; + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + 1, // K1, + 1>; - using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4, - Sequence<0, 1, 2, 
3>, - 3, - 1, // K1, - 1>; + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + 1, // K1, + 1>; AThreadCopy a_thread_copy_; BThreadCopy b_thread_copy_; diff --git a/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp similarity index 74% rename from composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp rename to composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp index 694b2fd2cc..4303b6a4ca 100644 --- a/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp @@ -1,18 +1,18 @@ -#ifndef CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP -#define CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP +#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_HPP +#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "cluster_descriptor.hpp" -#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" namespace ck { // this version does following things to avoid scratch memory issue // 1. Use StaticallyIndexedArray instead of C array for thread buffer -// 2. ThreadwiseDynamicTensorSliceTransfer_v3 does not keep reference to tensor descriptor -// 3. ThreadwiseDynamicTensorSliceTransfer_v3::Run() does not construct new tensor coordinate +// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor +// 3. 
ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate template -struct BlockwiseDynamicTensorSliceTransfer_v4 +struct BlockwiseTensorSliceTransfer_v4 { static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); using Index = MultiIndex; - __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v4(const SrcDesc& src_desc, - const Index& src_block_slice_origin, - const DstDesc& dst_desc, - const Index& dst_block_slice_origin) + __device__ constexpr BlockwiseTensorSliceTransfer_v4(const SrcDesc& src_desc, + const Index& src_block_slice_origin, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin) : threadwise_transfer_( src_desc, make_zero_multi_index(), dst_desc, make_zero_multi_index()) @@ -147,22 +147,22 @@ struct BlockwiseDynamicTensorSliceTransfer_v4 make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); using ThreadwiseTransfer = - ThreadwiseDynamicTensorSliceTransfer_v3; + ThreadwiseTensorSliceTransfer_v3; ThreadwiseTransfer threadwise_transfer_; }; diff --git a/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer_v2.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp similarity index 75% rename from composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer_v2.hpp rename to composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp index 20f3225f82..25df52904d 100644 --- a/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer_v2.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp @@ -1,18 +1,18 @@ -#ifndef CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP -#define CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP +#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V2_HPP +#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V2_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include 
"dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "cluster_descriptor.hpp" -#include "threadwise_dynamic_tensor_slice_transfer_v2.hpp" +#include "threadwise_tensor_slice_transfer_v2.hpp" namespace ck { // this version does following things to avoid scratch memory issue // 1. Use StaticallyIndexedArray instead of C array for thread buffer -// 2. ThreadwiseDynamicTensorSliceTransfer_v3 does not keep reference to tensor descriptor -// 3. ThreadwiseDynamicTensorSliceTransfer_v3::Run() does not construct new tensor coordinate +// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor +// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate template -struct BlockwiseDynamicTensorSliceTransfer_v4r1 +struct BlockwiseTensorSliceTransfer_v4r1 { static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); using Index = MultiIndex; - __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v4r1( - const SrcDesc& src_desc, - const Index& src_block_slice_origin, - const DstDesc& dst_desc, - const Index& dst_block_slice_origin) + __device__ constexpr BlockwiseTensorSliceTransfer_v4r1(const SrcDesc& src_desc, + const Index& src_block_slice_origin, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin) : threadwise_transfer_( src_desc, make_zero_multi_index(), dst_desc, make_zero_multi_index()) @@ -136,20 +135,20 @@ struct BlockwiseDynamicTensorSliceTransfer_v4r1 make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); using ThreadwiseTransfer = - ThreadwiseDynamicTensorSliceTransfer_v3r1; + ThreadwiseTensorSliceTransfer_v3r1; ThreadwiseTransfer threadwise_transfer_; }; diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_dlops_v1r2.hpp b/composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp similarity index 91% rename from 
composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_dlops_v1r2.hpp rename to composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp index 6d48a18169..3070045554 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_dlops_v1r2.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp @@ -1,14 +1,14 @@ -#ifndef CK_GRIDWISE_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP -#define CK_GRIDWISE_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP +#ifndef CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP +#define CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP #include "common_header.hpp" -#include "dynamic_multi_index_transform_helper.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "blockwise_gemm_dlops_v2r3.hpp" -#include "blockwise_dynamic_tensor_slice_transfer_v2.hpp" -#include "threadwise_dynamic_tensor_slice_transfer.hpp" -#include "threadwise_dynamic_tensor_slice_set.hpp" +#include "blockwise_tensor_slice_transfer_v2.hpp" +#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_set.hpp" namespace ck { @@ -25,7 +25,7 @@ __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_dynamic_contraction_dlops_v1r2( + kernel_contraction_dlops_v1r2( const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, @@ -89,7 +89,7 @@ template -struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1 +struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1 { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -110,17 +110,15 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 // A matrix in LDS 
memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = - make_dynamic_naive_tensor_descriptor_aligned_v2( - make_tuple(Number{}, GM0, I1, Number{}, GK1), - max_lds_align); + constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GM0, I1, Number{}, GK1), + max_lds_align); // B matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = - make_dynamic_naive_tensor_descriptor_aligned_v2( - make_tuple(Number{}, GN0, I1, Number{}, GK1), - max_lds_align); + constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GN0, I1, Number{}, GK1), + max_lds_align); // LDS allocation for A and B: be careful of alignment constexpr auto a_block_aligned_space_size = math::integer_least_multiple( @@ -201,7 +199,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 const auto GM11 = Number{}; const auto GM10 = GM1 / GM11; - const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = transform_dynamic_tensor_descriptor( + const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = transform_tensor_descriptor( a_grid_desc_gk0_gm0_gm1_gk1, make_tuple(make_pass_through_transform(GK0), make_pass_through_transform(GM0), @@ -222,7 +220,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 const auto GN11 = Number{}; const auto GN10 = GN1 / GN11; - const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = transform_dynamic_tensor_descriptor( + const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = transform_tensor_descriptor( b_grid_desc_gk0_gn0_gn1_gk1, make_tuple(make_pass_through_transform(GK0), make_pass_through_transform(GN0), @@ -259,7 +257,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 constexpr auto BM0 = BM / BM1; constexpr auto BN0 = BN / BN1; - const auto 
c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc = transform_dynamic_tensor_descriptor( + const auto c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc = transform_tensor_descriptor( c_grid_desc_gm0_gm1_gn0_gn1, make_tuple(make_pass_through_transform(GM0), make_unmerge_transform(make_tuple(GM10, GM11)), @@ -268,7 +266,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{})); - const auto c_gm10_bm_gn10_bn_grid_desc = transform_dynamic_tensor_descriptor( + const auto c_gm10_bm_gn10_bn_grid_desc = transform_tensor_descriptor( c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc, make_tuple(make_pass_through_transform(GM10), make_merge_transform(make_tuple(GM0, GM11)), @@ -277,7 +275,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 make_tuple(Sequence<1>{}, Sequence<0, 2>{}, Sequence<4>{}, Sequence<3, 5>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = transform_dynamic_tensor_descriptor( + const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = transform_tensor_descriptor( c_gm10_bm_gn10_bn_grid_desc, make_tuple(make_pass_through_transform(GM10), make_unmerge_transform(make_tuple(BM0, BM1)), @@ -356,26 +354,24 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 // A matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = - make_dynamic_naive_tensor_descriptor_aligned_v2( - make_tuple(Number{}, GM0, I1, Number{}, GK1), - max_lds_align); + constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GM0, I1, Number{}, GK1), + max_lds_align); // B matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto 
b_block_desc_gk0_gn0_gn10_gn11_gk1 = - make_dynamic_naive_tensor_descriptor_aligned_v2( - make_tuple(Number{}, GN0, I1, Number{}, GK1), - max_lds_align); + constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GN0, I1, Number{}, GK1), + max_lds_align); // A matrix in LDS memory for blockwise GEMM // be careful of LDS alignment - constexpr auto a_block_desc_gk0_bm_gk1 = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, GM0 * Number{}, GK1), max_lds_align); // B matrix in LDS memory for blockwise GEMM // be careful of LDS alignment - constexpr auto b_block_desc_gk0_bn_gk1 = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, GN0 * Number{}, GK1), max_lds_align); static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() == @@ -385,7 +381,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 "wrong!"); // A matrix blockwise copy - auto a_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1< + auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1< BlockSize, InMemoryDataOperationEnum_t::Set, Sequence, @@ -409,7 +405,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 make_multi_index(0, 0, 0, 0, 0)); // B matrix blockwise copy - auto b_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1< + auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1< BlockSize, InMemoryDataOperationEnum_t::Set, Sequence, @@ -457,9 +453,8 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 constexpr auto c_thread_tensor_lengths_bm0_bm1_bn0_bn1 = decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1(); - constexpr auto c_thread_desc_bm0_bm1_bn0_bn1 = - 
make_dynamic_naive_tensor_descriptor_packed_v2( - sequence_to_tuple_of_number(c_thread_tensor_lengths_bm0_bm1_bn0_bn1)); + constexpr auto c_thread_desc_bm0_bm1_bn0_bn1 = make_naive_tensor_descriptor_packed( + sequence_to_tuple_of_number(c_thread_tensor_lengths_bm0_bm1_bn0_bn1)); // LDS allocation for A and B: be careful of alignment constexpr auto a_block_aligned_space_size = math::integer_least_multiple( @@ -475,9 +470,9 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 auto c_thread_buf = make_static_buffer( c_thread_desc_bm0_bm1_bn0_bn1.GetElementSpaceSize()); - ThreadwiseDynamicTensorSliceSet_v1{} + ThreadwiseTensorSliceSet_v1{} .Run(c_thread_desc_bm0_bm1_bn0_bn1, make_tuple(I0, I0, I0, I0), c_thread_buf, @@ -615,7 +610,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 // output: register to global memory { constexpr auto c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1 = - make_dynamic_naive_tensor_descriptor_packed_v2( + make_naive_tensor_descriptor_packed( make_tuple(I1, Number{}, Number{}, @@ -627,7 +622,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0 blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( get_thread_local_1d_id()); - ThreadwiseDynamicTensorSliceTransfer_v1r3< + ThreadwiseTensorSliceTransfer_v1r3< FloatAcc, FloatC, decltype(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1), diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp similarity index 82% rename from composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp rename to composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp index e4858af492..88f2059bbf 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp @@ -1,14 +1,14 @@ 
-#ifndef CK_GRIDWISE_DYNAMIC_GEMM_DLOPS_V1R2_HPP -#define CK_GRIDWISE_DYNAMIC_GEMM_DLOPS_V1R2_HPP +#ifndef CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP +#define CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP #include "common_header.hpp" -#include "dynamic_multi_index_transform_helper.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "blockwise_gemm_dlops_v2r2.hpp" -#include "blockwise_dynamic_tensor_slice_transfer.hpp" -#include "threadwise_dynamic_tensor_slice_transfer.hpp" -#include "threadwise_dynamic_tensor_slice_set.hpp" +#include "blockwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_set.hpp" namespace ck { @@ -26,7 +26,7 @@ __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_dynamic_gemm_dlops_v1r2( + kernel_gemm_dlops_v1r2( const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, @@ -68,14 +68,13 @@ __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_dynamic_gemm_dlops_v1r2( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k_m0_m1_grid_desc, - const void CONSTANT* p_b_k_n0_n1_grid_desc, - const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) + kernel_gemm_dlops_v1r2(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k_m0_m1_grid_desc, + const void CONSTANT* p_b_k_n0_n1_grid_desc, + const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) { // first 
cast void CONSTANT void* to void* // second cast void* to Desc* @@ -151,7 +150,7 @@ template -struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 +struct GridwiseGemmDlops_km_kn_mn_v1r2 { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -167,12 +166,12 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 // A matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}), max_lds_align); // B matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}), max_lds_align); // LDS allocation for A and B: be careful of alignment @@ -230,7 +229,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 const auto M1 = Number{}; const auto M0 = M / M1; - const auto a_k_m0_m1_grid_desc = transform_dynamic_tensor_descriptor( + const auto a_k_m0_m1_grid_desc = transform_tensor_descriptor( a_k_m_grid_desc, make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(M0, M1))), make_tuple(Sequence<0>{}, Sequence<1>{}), @@ -248,7 +247,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 const auto N1 = Number{}; const auto N0 = N / N1; - const auto b_k_n0_n1_grid_desc = transform_dynamic_tensor_descriptor( + const auto b_k_n0_n1_grid_desc = transform_tensor_descriptor( b_k_n_grid_desc, make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(N0, N1))), make_tuple(Sequence<0>{}, Sequence<1>{}), @@ -277,7 +276,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 constexpr auto M10 = M1 / M11; constexpr auto N10 = N1 / N11; - const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_dynamic_tensor_descriptor( + const auto 
c_m0_m10_m11_n0_n10_n11_grid_desc = transform_tensor_descriptor( c_m_n_grid_desc, make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)), make_unmerge_transform(make_tuple(N0, N10, N11))), @@ -352,75 +351,75 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 // A matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}), max_lds_align); // B matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}), max_lds_align); // A matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_k_m0_m1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, I1, Number{}), max_lds_align); // B matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto b_k_n0_n1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, I1, Number{}), max_lds_align); // A matrix blockwise copy auto a_blockwise_copy = - BlockwiseDynamicTensorSliceTransfer_v4, - ABlockTransferThreadSliceLengths_K_M0_M1, - ABlockTransferThreadClusterLengths_K_M0_M1, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_k_m0_m1_grid_desc), - decltype(a_k_m0_m1_block_desc), - ABlockTransferSrcAccessOrder, - Sequence<0, 1, 2>, - ABlockTransferSrcVectorDim, - 2, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_M1, - 1, - 1, - AThreadTransferSrcResetCoordinateAfterRun, - 
true>(a_k_m0_m1_grid_desc, - make_multi_index(0, im0, 0), - a_k_m0_m1_block_desc, - make_multi_index(0, 0, 0)); + BlockwiseTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_K_M0_M1, + ABlockTransferThreadClusterLengths_K_M0_M1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_k_m0_m1_grid_desc), + decltype(a_k_m0_m1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_M1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>(a_k_m0_m1_grid_desc, + make_multi_index(0, im0, 0), + a_k_m0_m1_block_desc, + make_multi_index(0, 0, 0)); // B matrix blockwise copy auto b_blockwise_copy = - BlockwiseDynamicTensorSliceTransfer_v4, - BBlockTransferThreadSliceLengths_K_N0_N1, - BBlockTransferThreadClusterLengths_K_N0_N1, - BBlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(b_k_n0_n1_grid_desc), - decltype(b_k_n0_n1_block_desc), - BBlockTransferSrcAccessOrder, - Sequence<0, 1, 2>, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_N1, - 1, - 1, - BThreadTransferSrcResetCoordinateAfterRun, - true>(b_k_n0_n1_grid_desc, - make_multi_index(0, in0, 0), - b_k_n0_n1_block_desc, - make_multi_index(0, 0, 0)); + BlockwiseTensorSliceTransfer_v4, + BBlockTransferThreadSliceLengths_K_N0_N1, + BBlockTransferThreadClusterLengths_K_N0_N1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_k_n0_n1_grid_desc), + decltype(b_k_n0_n1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_N1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>(b_k_n0_n1_grid_desc, + make_multi_index(0, in0, 0), + b_k_n0_n1_block_desc, + make_multi_index(0, 0, 0)); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx @@ 
-447,9 +446,8 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths = decltype(blockwise_gemm)::GetCM0M1N0N1ThreadTensorLengths(); - constexpr auto c_m10_m11_n10_n11_thread_desc = - make_dynamic_naive_tensor_descriptor_packed_v2( - sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); + constexpr auto c_m10_m11_n10_n11_thread_desc = make_naive_tensor_descriptor_packed( + sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); // LDS allocation for A and B: be careful of alignment constexpr auto a_block_aligned_space_size = @@ -465,9 +463,9 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 auto c_thread_buf = make_static_buffer( c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize()); - ThreadwiseDynamicTensorSliceSet_v1{} + ThreadwiseTensorSliceSet_v1{} .Run(c_m10_m11_n10_n11_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, @@ -620,7 +618,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 // output: register to global memory { constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc = - make_dynamic_naive_tensor_descriptor_packed_v2( + make_naive_tensor_descriptor_packed( make_tuple(I1, Number{}, Number{}, @@ -631,7 +629,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 const auto c_m10_m11_n10_n11_thread_origin_idx_on_block = blockwise_gemm.CalculateCM0M1N0N1ThreadOriginOnBlock(get_thread_local_1d_id()); - ThreadwiseDynamicTensorSliceTransfer_v1r3< + ThreadwiseTensorSliceTransfer_v1r3< FloatAcc, FloatC, decltype(c_m0_m10_m11_n0_n10_n11_thread_desc), diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp similarity index 89% rename from composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp rename to composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp index 244c376cf8..70cedf3fa0 100644 --- 
a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp @@ -1,14 +1,14 @@ -#ifndef CK_GRIDWISE_DYNAMIC_GEMM_V1R3_HPP -#define CK_GRIDWISE_DYNAMIC_GEMM_V1R3_HPP +#ifndef CK_GRIDWISE_GEMM_V1R3_HPP +#define CK_GRIDWISE_GEMM_V1R3_HPP #include "common_header.hpp" -#include "dynamic_multi_index_transform_helper.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "blockwise_gemm_dlops_v2r3.hpp" -#include "blockwise_dynamic_tensor_slice_transfer_v2.hpp" -#include "threadwise_dynamic_tensor_slice_transfer_v2.hpp" -#include "threadwise_dynamic_tensor_slice_set.hpp" +#include "blockwise_tensor_slice_transfer_v2.hpp" +#include "threadwise_tensor_slice_transfer_v2.hpp" +#include "threadwise_tensor_slice_set.hpp" namespace ck { @@ -26,7 +26,7 @@ __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_dynamic_gemm_dlops_v1r3( + kernel_gemm_dlops_v1r3( const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, @@ -68,14 +68,13 @@ __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_dynamic_gemm_dlops_v1r3( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k0_m0_m1_k1_grid_desc, - const void CONSTANT* p_b_k0_n0_n1_k1_grid_desc, - const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) + kernel_gemm_dlops_v1r3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m0_m1_k1_grid_desc, + const 
void CONSTANT* p_b_k0_n0_n1_k1_grid_desc, + const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) { // first cast void CONSTANT void* to void* // second cast void* to Desc* @@ -147,7 +146,7 @@ template -struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 +struct GridwiseGemmDlops_km_kn_mn_v1r3 { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -164,12 +163,12 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 // TODO: check alignment // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}, K1), max_lds_align); // TODO: check alignment // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}, K1), max_lds_align); // TODO: check alignment @@ -231,13 +230,13 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 const auto M1 = Number{}; const auto M0 = M / M1; - const auto a_k0_m0_m1_k1_grid_desc = transform_dynamic_tensor_descriptor( - a_k0_m_k1_grid_desc, - make_tuple(make_pass_through_transform(K0), - make_unmerge_transform(make_tuple(M0, M1)), - make_pass_through_transform(K1)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + const auto a_k0_m0_m1_k1_grid_desc = + transform_tensor_descriptor(a_k0_m_k1_grid_desc, + make_tuple(make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(M0, M1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); return a_k0_m0_m1_k1_grid_desc; } @@ -251,13 +250,13 @@ struct 
GridwiseDynamicGemmDlops_km_kn_mn_v1r3 const auto N1 = Number{}; const auto N0 = N / N1; - const auto b_k0_n0_n1_k1_grid_desc = transform_dynamic_tensor_descriptor( - b_k0_n_k1_grid_desc, - make_tuple(make_pass_through_transform(K0), - make_unmerge_transform(make_tuple(N0, N1)), - make_pass_through_transform(K1)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + const auto b_k0_n0_n1_k1_grid_desc = + transform_tensor_descriptor(b_k0_n_k1_grid_desc, + make_tuple(make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(N0, N1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); return b_k0_n0_n1_k1_grid_desc; } @@ -284,7 +283,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 constexpr auto M10 = M1 / M11; constexpr auto N10 = N1 / N11; - const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_dynamic_tensor_descriptor( + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_tensor_descriptor( c_m_n_grid_desc, make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)), make_unmerge_transform(make_tuple(N0, N10, N11))), @@ -355,23 +354,23 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 // TODO: check alignment // A matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_k0_m0_m1_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, I1, Number{}, K1), max_lds_align); // TODO: check alignment // B matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto b_k0_n0_n1_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, I1, Number{}, K1), max_lds_align); // TODO: check 
alignment // A matrix in LDS memory, for blockwise GEMM - constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}, K1), max_lds_align); // TODO: check alignment // B matrix in LDS memory, for blockwise GEMM - constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}, K1), max_lds_align); static_assert(a_k0_m0_m1_k1_block_desc.GetElementSpaceSize() == @@ -381,7 +380,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 "wrong!"); // A matrix blockwise copy - auto a_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1< + auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1< BlockSize, InMemoryDataOperationEnum_t::Set, Sequence, @@ -405,7 +404,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 make_multi_index(0, 0, 0, 0)); // B matrix blockwise copy - auto b_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1< + auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1< BlockSize, InMemoryDataOperationEnum_t::Set, Sequence, @@ -453,9 +452,8 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths = decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1(); - constexpr auto c_m10_m11_n10_n11_thread_desc = - make_dynamic_naive_tensor_descriptor_packed_v2( - sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); + constexpr auto c_m10_m11_n10_n11_thread_desc = make_naive_tensor_descriptor_packed( + sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); // LDS allocation for A and B: be careful of alignment constexpr auto a_block_aligned_space_size = math::integer_least_multiple( @@ -471,9 +469,9 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 auto c_thread_buf = make_static_buffer( 
c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize()); - ThreadwiseDynamicTensorSliceSet_v1{} + ThreadwiseTensorSliceSet_v1{} .Run(c_m10_m11_n10_n11_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, @@ -609,7 +607,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 // output: register to global memory { constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc = - make_dynamic_naive_tensor_descriptor_packed_v2( + make_naive_tensor_descriptor_packed( make_tuple(I1, Number{}, Number{}, @@ -621,7 +619,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( get_thread_local_1d_id()); - ThreadwiseDynamicTensorSliceTransfer_v1r3< + ThreadwiseTensorSliceTransfer_v1r3< FloatAcc, FloatC, decltype(c_m0_m10_m11_n0_n10_n11_thread_desc), diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp similarity index 79% rename from composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp rename to composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp index 5e90e0e85d..484f5d938d 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp @@ -1,12 +1,12 @@ -#ifndef CK_GRIDWISE_DYNAMIC_GEMM_V2_HPP -#define CK_GRIDWISE_DYNAMIC_GEMM_V2_HPP +#ifndef CK_GRIDWISE_GEMM_V2_HPP +#define CK_GRIDWISE_GEMM_V2_HPP #include "common_header.hpp" -#include "dynamic_multi_index_transform_helper.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "blockwise_dynamic_tensor_slice_transfer.hpp" -#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "blockwise_tensor_slice_transfer.hpp" +#include 
"threadwise_tensor_slice_transfer.hpp" #include "blockwise_gemm_dlops_v3.hpp" namespace ck { @@ -47,7 +47,7 @@ template -struct GridwiseDynamicGemmDlops_km_kn_mn_v3 +struct GridwiseGemmDlops_km_kn_mn_v3 { __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { @@ -58,7 +58,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3 // A matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}), max_lds_align); // LDS allocation for A and B: be careful of alignment @@ -132,23 +132,21 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3 // A matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_e_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}), max_lds_align); - constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}), max_lds_align); // B matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto b_e_n_ho_wo_block_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( - Number{}, Number<1>{}, Number{}, Number{})); + constexpr auto b_e_n_ho_wo_block_desc = make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k_n_ho_wo_thread_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( - Number{}, Number<1>{}, Number{}, Number{})); + constexpr auto c_k_n_ho_wo_thread_desc = make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); auto 
blockwise_gemm = BlockwiseGemmDlops_km_kn_m0m1n0n1_v3, - ABlockTransferThreadSliceLengths_E_K, - ABlockTransferThreadClusterLengths_E_K, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_e_k_global_desc), - decltype(a_e_k_desc), - ABlockTransferSrcAccessOrder, - Sequence<0, 1>, - ABlockTransferSrcVectorDim, - 1, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_K, - 1, - 1, - AThreadTransferSrcResetCoordinateAfterRun, - true>( - a_e_k_global_desc, - make_multi_index(0, k_block_data_on_global), - a_e_k_desc, - make_multi_index(0, 0)); + BlockwiseTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_E_K, + ABlockTransferThreadClusterLengths_E_K, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_e_k_global_desc), + decltype(a_e_k_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 1>, + ABlockTransferSrcVectorDim, + 1, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>(a_e_k_global_desc, + make_multi_index(0, k_block_data_on_global), + a_e_k_desc, + make_multi_index(0, 0)); - constexpr auto b_e_n_ho_wo_thread_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( - Number{}, Number<1>{}, Number{}, Number{})); + constexpr auto b_e_n_ho_wo_thread_desc = make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); - auto b_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2< - FloatAB, - FloatAB, - decltype(b_e_n_ho_wo_global_desc), - decltype(b_e_n_ho_wo_thread_desc), - Sequence, - BBlockTransferSrcAccessOrder, - BBlockTransferSrcVectorDim, - BBlockTransferSrcScalarPerVector, - 1, - true>(b_e_n_ho_wo_global_desc, - make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global)); + auto b_threadwise_transfer = + ThreadwiseTensorSliceTransfer_v2, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + 
BBlockTransferSrcScalarPerVector, + 1, + true>( + b_e_n_ho_wo_global_desc, + make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global)); auto a_block_buf = make_dynamic_buffer( p_shared_block, a_e_k_desc.GetElementSpaceSize()); @@ -234,9 +231,9 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3 c_thread_buf; // initialize output thread tensor - ThreadwiseDynamicTensorSliceSet_v1>{} + ThreadwiseTensorSliceSet_v1>{} .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0}); constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0); @@ -354,18 +351,17 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3 const index_t k_thread_data_on_global = k_block_data_on_global + k_thread_id * KPerThread; - ThreadwiseDynamicTensorSliceTransfer_v1r3< - FloatAcc, - FloatC, - decltype(c_k_n_ho_wo_thread_desc), - decltype(c_k_n_ho_wo_global_desc), - Sequence, - CThreadTransferSrcDstAccessOrder, - CThreadTransferSrcDstVectorDim, - CThreadTransferDstScalarPerVector, - CGlobalMemoryDataOperation, - 1, - true>( + ThreadwiseTensorSliceTransfer_v1r3, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>( c_k_n_ho_wo_global_desc, make_multi_index( k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global)) diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp similarity index 81% rename from composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp rename to composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp index 4f02da1409..b70d57f913 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp @@ -1,14 +1,14 @@ -#ifndef 
CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2R3_HPP -#define CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2R3_HPP +#ifndef CK_GRIDWISE_GEMM_XDLOPS_V2R3_HPP +#define CK_GRIDWISE_GEMM_XDLOPS_V2R3_HPP #include "common_header.hpp" -#include "dynamic_multi_index_transform_helper.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "blockwise_gemm_xdlops.hpp" -#include "blockwise_dynamic_tensor_slice_transfer.hpp" -#include "threadwise_dynamic_tensor_slice_transfer.hpp" -#include "threadwise_dynamic_tensor_slice_set.hpp" +#include "blockwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_set.hpp" namespace ck { @@ -24,13 +24,13 @@ __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_dynamic_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const AK0MK1GridDesc a_k0_m_k1_grid_desc, - const BK0NK1GridDesc b_k0_n_k1_grid_desc, - const CM0M1M2NGridDesc c_m0_m1_m2_n_grid_desc, - const CBlockClusterAdaptor c_block_cluster_adaptor) + kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AK0MK1GridDesc a_k0_m_k1_grid_desc, + const BK0NK1GridDesc b_k0_n_k1_grid_desc, + const CM0M1M2NGridDesc c_m0_m1_m2_n_grid_desc, + const CBlockClusterAdaptor c_block_cluster_adaptor) { constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); @@ -58,13 +58,13 @@ __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_dynamic_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ 
p_c_grid, - const void CONSTANT* p_a_k0_m_k1_grid_desc, - const void CONSTANT* p_b_k0_n_k1_grid_desc, - const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, - const void CONSTANT* p_c_block_cluster_adaptor) + kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m_k1_grid_desc, + const void CONSTANT* p_b_k0_n_k1_grid_desc, + const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, + const void CONSTANT* p_c_block_cluster_adaptor) { constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); @@ -132,7 +132,7 @@ template -struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 +struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -148,12 +148,12 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 // A matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}, K1), max_lds_align); // B matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}, K1), max_lds_align); // LDS allocation for A and B: be careful of alignment @@ -216,7 +216,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 constexpr auto N1 = Number{}; - const auto c_m0_m1_m2_n_grid_desc = transform_dynamic_tensor_descriptor( + const auto c_m0_m1_m2_n_grid_desc = transform_tensor_descriptor( c_m_n_grid_desc, make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, M0, M1, M2)), make_unmerge_transform(make_tuple(NRepeat, NWaves, N1))), @@ -290,67 +290,65 @@ 
struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 // A matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}, K1), max_lds_align); // B matrix in LDS memory, dst of blockwise copy // be careful of LDS alignment - constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2( make_tuple(Number{}, Number{}, K1), max_lds_align); // A matrix blockwise copy auto a_blockwise_copy = - BlockwiseDynamicTensorSliceTransfer_v4, - ABlockTransferThreadSliceLengths_K0_M_K1, - ABlockTransferThreadClusterLengths_K0_M_K1, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_k0_m_k1_grid_desc), - decltype(a_k0_m_k1_block_desc), - ABlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - ABlockTransferSrcVectorDim, - 2, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_K1, - 1, - 1, - AThreadTransferSrcResetCoordinateAfterRun, - true>( - a_k0_m_k1_grid_desc, - make_multi_index(0, m_block_data_idx_on_grid, 0), - a_k0_m_k1_block_desc, - make_multi_index(0, 0, 0)); + BlockwiseTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_K0_M_K1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_k0_m_k1_grid_desc), + decltype(a_k0_m_k1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>(a_k0_m_k1_grid_desc, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_k0_m_k1_block_desc, + make_multi_index(0, 0, 0)); // B matrix blockwise copy auto b_blockwise_copy = - 
BlockwiseDynamicTensorSliceTransfer_v4, - BBlockTransferThreadSliceLengths_K0_N_K1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(b_k0_n_k1_grid_desc), - decltype(b_k0_n_k1_block_desc), - BBlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, - 1, - 1, - BThreadTransferSrcResetCoordinateAfterRun, - true>( - b_k0_n_k1_grid_desc, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_k0_n_k1_block_desc, - make_multi_index(0, 0, 0)); + BlockwiseTensorSliceTransfer_v4, + BBlockTransferThreadSliceLengths_K0_N_K1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_k0_n_k1_grid_desc), + decltype(b_k0_n_k1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>(b_k0_n_k1_grid_desc, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_k0_n_k1_block_desc, + make_multi_index(0, 0, 0)); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx @@ -364,7 +362,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 NPerBlock % (NPerWave * NRepeat) == 0, "wrong!"); - constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor( + constexpr auto a_k0_m0_m1_k1_block_desc = transform_tensor_descriptor( a_k0_m_k1_block_desc, make_tuple(make_pass_through_transform(Number{}), make_unmerge_transform( @@ -373,7 +371,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); - constexpr auto b_k0_n0_n1_k1_block_desc = transform_dynamic_tensor_descriptor( + constexpr auto b_k0_n0_n1_k1_block_desc = 
transform_tensor_descriptor( b_k0_n_k1_block_desc, make_tuple(make_pass_through_transform(Number{}), make_unmerge_transform( @@ -399,8 +397,8 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 static_assert(NumBlks == 1 && NumXdlops == 1, "K Reduction Mfma only"); - constexpr auto c_mr_nr_blk_desc = make_dynamic_naive_tensor_descriptor_packed_v2( - make_tuple(Number{}, Number{})); + constexpr auto c_mr_nr_blk_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{})); StaticBuffer, @@ -492,7 +490,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 constexpr index_t N1 = CLayout.N0(); constexpr auto c_m0_m1_m2_n_thread_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(Number{}, + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{}, Number<1>{}, Number<1>{}, @@ -533,7 +531,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat); constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat); - ThreadwiseDynamicTensorSliceTransfer_v1r3< + ThreadwiseTensorSliceTransfer_v1r3< FloatC, FloatC, decltype(c_m0_m1_m2_n_thread_desc), @@ -567,9 +565,8 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 constexpr index_t M1 = CLayout.N1(); constexpr index_t M2 = CLayout.M0(); - constexpr auto c_m0_m1_m2_n_thread_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( - I1, I1, I1, I1, Number{}, Number<1>{}, Number{}, Number<1>{})); + constexpr auto c_m0_m1_m2_n_thread_desc = make_naive_tensor_descriptor_packed( + make_tuple(I1, I1, I1, I1, Number{}, Number<1>{}, Number{}, Number<1>{})); // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index @@ -585,17 +582,17 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 constexpr auto c_m0_m1_m2_n_grid_tensor_iterator_hacks = CGridIteratorHacks{}; auto c_thread_copy = - ThreadwiseDynamicTensorSliceTransfer_v1r3, - CThreadTransferSrcDstAccessOrder, - 
CThreadTransferSrcDstVectorDim, - CThreadTransferDstScalarPerVector, - CGlobalMemoryDataOperation, - 1, - true>{ + ThreadwiseTensorSliceTransfer_v1r3, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{ c_m0_m1_m2_n_grid_desc, make_multi_index(0, 0, diff --git a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_set.hpp similarity index 83% rename from composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp rename to composable_kernel/include/tensor_operation/threadwise_tensor_slice_set.hpp index f1b632aa84..6eb058711e 100644 --- a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_set.hpp @@ -1,9 +1,9 @@ -#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SET_HPP -#define CK_THREADWISE_DYNAMIC_TENSOR_SET_HPP +#ifndef CK_THREADWISE_TENSOR_SET_HPP +#define CK_THREADWISE_TENSOR_SET_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -16,7 +16,7 @@ template ::type = false> -struct ThreadwiseDynamicTensorSliceSet_v1 +struct ThreadwiseTensorSliceSet_v1 { static constexpr index_t nDim = SliceLengths::Size(); @@ -40,7 +40,7 @@ struct ThreadwiseDynamicTensorSliceSet_v1 constexpr auto origin_idx = to_multi_index(OriginIdx{}); static_ford{}([&](auto access_idx) { - constexpr auto coord = make_dynamic_tensor_coordinate(desc, origin_idx + access_idx); + constexpr auto coord = make_tensor_coordinate(desc, origin_idx + access_idx); constexpr bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(desc, coord); diff --git 
a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp similarity index 89% rename from composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp rename to composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index 9626113686..66be04c335 100644 --- a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -1,9 +1,9 @@ -#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP -#define CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP +#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP +#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -58,19 +58,19 @@ template ::type = false> -struct ThreadwiseDynamicTensorSliceTransfer_v1r3 +struct ThreadwiseTensorSliceTransfer_v1r3 { static constexpr index_t nDim = SliceLengths::Size(); using Index = MultiIndex; - using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{})); + using DstCoordIterator = decltype(make_tensor_coordinate_iterator(DstDesc{}, Index{})); - __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3( - const DstDesc& dst_desc, const Index& dst_slice_origin_idx) - : dst_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx)) + __device__ constexpr ThreadwiseTensorSliceTransfer_v1r3(const DstDesc& dst_desc, + const Index& dst_slice_origin_idx) + : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)) { 
static_assert(SrcDesc::IsKnownAtCompileTime(), "wrong! SrcDesc need to known at compile-time"); @@ -78,7 +78,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3 __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) { - dst_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx); + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); } template {}); @@ -150,7 +150,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3 backward_step(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; }); - return make_dynamic_tensor_coordinate_iterator( + return make_tensor_coordinate_iterator( dst_desc, backward_step, dst_iterator_hacks[I1][i]); }, Number{}); @@ -235,12 +235,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3 { if constexpr(forward_sweep[i]) { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( dst_desc, dst_coord_, dst_forward_iterators[dim_access_order[i]]); } else { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( dst_desc, dst_coord_, dst_backward_iterators[dim_access_order[i]]); } } @@ -251,9 +251,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3 if constexpr(DstResetCoordinateAfterRun) { const auto dst_reset_iterator = - make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep()); + make_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep()); - move_dynamic_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator); + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator); } } @@ -345,10 +345,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3 : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); // is it OK to construct a new step every time? 
- const auto adjusted_step = - make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx); + const auto adjusted_step = make_tensor_coordinate_iterator(dst_desc, adjusted_step_idx); - move_dynamic_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); } private: @@ -375,19 +374,19 @@ template ::type = false> -struct ThreadwiseDynamicTensorSliceTransfer_v2 +struct ThreadwiseTensorSliceTransfer_v2 { static constexpr index_t nDim = SliceLengths::Size(); using Index = MultiIndex; - using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); + using SrcCoordIterator = decltype(make_tensor_coordinate_iterator(SrcDesc{}, Index{})); - __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v2(const SrcDesc& src_desc, - const Index& src_slice_origin_idx) - : src_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx)) + __device__ constexpr ThreadwiseTensorSliceTransfer_v2(const SrcDesc& src_desc, + const Index& src_slice_origin_idx) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin_idx)) { static_assert(DstDesc::IsKnownAtCompileTime(), "wrong! SrcDesc need to known at compile-time"); @@ -395,7 +394,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2 __device__ void SetDstSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) { - src_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx); + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); } template {}); @@ -465,7 +464,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2 backward_step(j) = (i.value == j.value) ? 
-src_scalar_per_access[i] : 0; }); - return make_dynamic_tensor_coordinate_iterator( + return make_tensor_coordinate_iterator( src_desc, backward_step, src_iterator_hacks[I1][i]); }, Number{}); @@ -548,12 +547,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2 { if constexpr(forward_sweep[i]) { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( src_desc, src_coord_, src_forward_iterators[dim_access_order[i]]); } else { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( src_desc, src_coord_, src_backward_iterators[dim_access_order[i]]); } } @@ -564,9 +563,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2 if constexpr(SrcResetCoordinateAfterRun) { const auto src_reset_iterator = - make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep()); + make_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep()); - move_dynamic_tensor_coordinate(src_desc, src_coord_, src_reset_iterator); + move_tensor_coordinate(src_desc, src_coord_, src_reset_iterator); } } @@ -658,10 +657,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2 : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); // is it OK to construct a new step every time? 
- const auto adjusted_step = - make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx); + const auto adjusted_step = make_tensor_coordinate_iterator(src_desc, adjusted_step_idx); - move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); } private: @@ -693,23 +691,23 @@ template // control whether to move back dst coordinate after each // RunWrite(), will be fused with MoveDstSliceWindow to // save addr computation -struct ThreadwiseDynamicTensorSliceTransfer_v3 +struct ThreadwiseTensorSliceTransfer_v3 { static constexpr index_t nDim = SliceLengths::Size(); using Index = MultiIndex; - using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); - using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{})); + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); - using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{})); + using SrcCoordIterator = decltype(make_tensor_coordinate_iterator(SrcDesc{}, Index{})); + using DstCoordIterator = decltype(make_tensor_coordinate_iterator(DstDesc{}, Index{})); - __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3(const SrcDesc& src_desc, - const Index& src_slice_origin, - const DstDesc& dst_desc, - const Index& dst_slice_origin) - : src_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)), - dst_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin)) + __device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) 
{ // TODO: fix this static_assert(is_same::value, @@ -718,12 +716,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) { - src_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx); + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); } __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) { - dst_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx); + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); } template @@ -766,7 +764,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 forward_step(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; }); - return make_dynamic_tensor_coordinate_iterator( + return make_tensor_coordinate_iterator( src_desc, forward_step, src_iterator_hacks[I0][i]); }, Number{}); @@ -780,7 +778,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 backward_step(j) = (i.value == j.value) ? 
-src_scalar_per_access[i] : 0; }); - return make_dynamic_tensor_coordinate_iterator( + return make_tensor_coordinate_iterator( src_desc, backward_step, src_iterator_hacks[I1][i]); }, Number{}); @@ -862,12 +860,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 { if constexpr(forward_sweep[i]) { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( src_desc, src_coord_, src_forward_iterators[src_dim_access_order[i]]); } else { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( src_desc, src_coord_, src_backward_iterators[src_dim_access_order[i]]); } } @@ -878,9 +876,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 if constexpr(SrcResetCoordinateAfterRun) { const auto src_reset_iterator = - make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep()); + make_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep()); - move_dynamic_tensor_coordinate(src_desc, src_coord_, src_reset_iterator); + move_tensor_coordinate(src_desc, src_coord_, src_reset_iterator); } } @@ -924,7 +922,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 forward_step(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; }); - const auto forward_iterator = make_dynamic_tensor_coordinate_iterator( + const auto forward_iterator = make_tensor_coordinate_iterator( dst_desc, forward_step, dst_iterator_hacks[I0][i]); return forward_iterator; @@ -940,7 +938,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 backward_step(j) = (i.value == j.value) ? 
-dst_scalar_per_access[i] : 0; }); - const auto backward_iterator = make_dynamic_tensor_coordinate_iterator( + const auto backward_iterator = make_tensor_coordinate_iterator( dst_desc, backward_step, dst_iterator_hacks[I1][i]); return backward_iterator; @@ -1026,12 +1024,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 { if constexpr(forward_sweep[i]) { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( dst_desc, dst_coord_, dst_forward_iterators[dst_dim_access_order[i]]); } else { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( dst_desc, dst_coord_, dst_backward_iterators[dst_dim_access_order[i]]); } } @@ -1042,9 +1040,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 if constexpr(DstResetCoordinateAfterRun) { const auto dst_reset_iterator = - make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep()); + make_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep()); - move_dynamic_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator); + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator); } } @@ -1206,10 +1204,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); // is it OK to construct a new step every time? - const auto adjusted_step = - make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx); + const auto adjusted_step = make_tensor_coordinate_iterator(src_desc, adjusted_step_idx); - move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); } // src_slice_origin_step_idx need to be known at compile-time, for performance reason @@ -1225,10 +1222,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); // is it OK to construct a new step every time? 
- const auto adjusted_step = make_dynamic_tensor_coordinate_iterator( + const auto adjusted_step = make_tensor_coordinate_iterator( src_desc, adjusted_step_idx, src_move_slice_window_iterator_hack); - move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); } // dst_slice_origin_step_idx need to be known at compile-time, for performance reason __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, @@ -1240,15 +1237,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); // is it OK to construct a new step every time? - const auto adjusted_step = - make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx); + const auto adjusted_step = make_tensor_coordinate_iterator(dst_desc, adjusted_step_idx); - move_dynamic_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); } private: static constexpr auto buffer_desc_ = - make_dynamic_naive_tensor_descriptor_packed_v2(sequence_to_tuple_of_number(SliceLengths{})); + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); @@ -1283,18 +1279,18 @@ template < index_t SrcScalarStrideInVector, typename std::enable_if::type = false> -struct ThreadwiseDynamicTensorSliceTransfer_v4 +struct ThreadwiseTensorSliceTransfer_v4 { static constexpr index_t nDim = SliceLengths::Size(); using Index = MultiIndex; - using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); + using SrcCoordIterator = decltype(make_tensor_coordinate_iterator(SrcDesc{}, Index{})); - __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v4(const Index& src_ref_idx) - : 
src_ref_coord_(make_dynamic_tensor_coordinate(SrcDesc{}, src_ref_idx)) + __device__ constexpr ThreadwiseTensorSliceTransfer_v4(const Index& src_ref_idx) + : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx)) { static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), "wrong! SrcDesc and DstDesc need to known at compile-time"); @@ -1391,12 +1387,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4 src_ref_to_origin_disp_idx + data_to_origin_disp_idx; constexpr auto src_ref_to_data_disp_coord_iterator = - make_dynamic_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx); + make_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx); auto src_data_coord = src_ref_coord_; - move_dynamic_tensor_coordinate( - src_desc, src_data_coord, src_ref_to_data_disp_coord_iterator); + move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_iterator); vector_type_maker_t src_tmp_vector; @@ -1435,10 +1430,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4 { constexpr auto src_desc = SrcDesc{}; - const auto src_slice_move_step_iter = make_dynamic_tensor_coordinate_iterator( - src_desc, to_multi_index(src_slice_move_step_idx)); + const auto src_slice_move_step_iter = + make_tensor_coordinate_iterator(src_desc, to_multi_index(src_slice_move_step_idx)); - move_dynamic_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); + move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); } private: diff --git a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer_v2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp similarity index 86% rename from composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer_v2.hpp rename to composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp index ba60e26c38..a2613f2e2d 100644 --- 
a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer_v2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp @@ -1,9 +1,9 @@ -#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP -#define CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP +#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V2_HPP +#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V2_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" namespace ck { @@ -30,7 +30,7 @@ template // control whether to move back dst coordinate after each // RunWrite(), will be fused with MoveDstSliceWindow to // save addr computation -struct ThreadwiseDynamicTensorSliceTransfer_v3r1 +struct ThreadwiseTensorSliceTransfer_v3r1 { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -38,18 +38,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 static constexpr index_t nDim = SliceLengths::Size(); using Index = MultiIndex; - using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); - using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{})); + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); - using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{})); + using SrcCoordIterator = decltype(make_tensor_coordinate_iterator(SrcDesc{}, Index{})); + using DstCoordIterator = decltype(make_tensor_coordinate_iterator(DstDesc{}, Index{})); - __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3r1(const SrcDesc& src_desc, - const Index& src_slice_origin, - const DstDesc& dst_desc, - const Index& dst_slice_origin) - : 
src_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)), - dst_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin)) + __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) { // TODO: fix this static_assert(is_same::value, @@ -64,12 +64,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) { - src_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx); + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); } __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) { - dst_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx); + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); } template @@ -96,9 +96,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 I1), SrcVectorTensorContiguousDimOrder{}); - constexpr auto src_vector_desc = make_dynamic_naive_tensor_descriptor_v2( - sequence_to_tuple_of_number(src_vector_tensor_lengths), - sequence_to_tuple_of_number(src_vector_tensor_strides)); + constexpr auto src_vector_desc = + make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(src_vector_tensor_lengths), + sequence_to_tuple_of_number(src_vector_tensor_strides)); // access order and lengths constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths; @@ -117,7 +117,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 forward_step(j) = (i.value == j.value) ? 
src_vector_tensor_lengths[i] : 0; }); - return make_dynamic_tensor_coordinate_iterator( + return make_tensor_coordinate_iterator( src_desc, forward_step, src_iterator_hacks[I0][i]); }, Number{}); @@ -131,7 +131,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 backward_step(j) = (i.value == j.value) ? -src_vector_tensor_lengths[i] : 0; }); - return make_dynamic_tensor_coordinate_iterator( + return make_tensor_coordinate_iterator( src_desc, backward_step, src_iterator_hacks[I1][i]); }, Number{}); @@ -219,12 +219,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 { if constexpr(forward_sweep[i]) { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( src_desc, src_coord_, src_forward_iterators[src_dim_access_order[i]]); } else { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( src_desc, src_coord_, src_backward_iterators[src_dim_access_order[i]]); } } @@ -235,9 +235,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 if constexpr(SrcResetCoordinateAfterRun) { const auto src_reset_iterator = - make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep()); + make_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep()); - move_dynamic_tensor_coordinate(src_desc, src_coord_, src_reset_iterator); + move_tensor_coordinate(src_desc, src_coord_, src_reset_iterator); } } @@ -265,9 +265,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 I1), DstVectorTensorContiguousDimOrder{}); - constexpr auto dst_vector_desc = make_dynamic_naive_tensor_descriptor_v2( - sequence_to_tuple_of_number(dst_vector_tensor_lengths), - sequence_to_tuple_of_number(dst_vector_tensor_strides)); + constexpr auto dst_vector_desc = + make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(dst_vector_tensor_lengths), + sequence_to_tuple_of_number(dst_vector_tensor_strides)); // dst access order and lengths constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths; @@ -286,7 +286,7 @@ struct 
ThreadwiseDynamicTensorSliceTransfer_v3r1 forward_step(j) = (i.value == j.value) ? dst_vector_tensor_lengths[i] : 0; }); - const auto forward_iterator = make_dynamic_tensor_coordinate_iterator( + const auto forward_iterator = make_tensor_coordinate_iterator( dst_desc, forward_step, dst_iterator_hacks[I0][i]); return forward_iterator; @@ -302,7 +302,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 backward_step(j) = (i.value == j.value) ? -dst_vector_tensor_lengths[i] : 0; }); - const auto backward_iterator = make_dynamic_tensor_coordinate_iterator( + const auto backward_iterator = make_tensor_coordinate_iterator( dst_desc, backward_step, dst_iterator_hacks[I1][i]); return backward_iterator; @@ -394,12 +394,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 { if constexpr(forward_sweep[i]) { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( dst_desc, dst_coord_, dst_forward_iterators[dst_dim_access_order[i]]); } else { - move_dynamic_tensor_coordinate( + move_tensor_coordinate( dst_desc, dst_coord_, dst_backward_iterators[dst_dim_access_order[i]]); } } @@ -410,9 +410,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 if constexpr(DstResetCoordinateAfterRun) { const auto dst_reset_iterator = - make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep()); + make_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep()); - move_dynamic_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator); + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator); } } @@ -564,10 +564,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); // is it OK to construct a new step every time? 
- const auto adjusted_step = - make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx); + const auto adjusted_step = make_tensor_coordinate_iterator(src_desc, adjusted_step_idx); - move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); } // src_slice_origin_step_idx need to be known at compile-time, for performance reason @@ -583,10 +582,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); // is it OK to construct a new step every time? - const auto adjusted_step = make_dynamic_tensor_coordinate_iterator( + const auto adjusted_step = make_tensor_coordinate_iterator( src_desc, adjusted_step_idx, src_move_slice_window_iterator_hack); - move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); } // dst_slice_origin_step_idx need to be known at compile-time, for performance reason __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, @@ -598,15 +597,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1 : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); // is it OK to construct a new step every time? 
- const auto adjusted_step = - make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx); + const auto adjusted_step = make_tensor_coordinate_iterator(dst_desc, adjusted_step_idx); - move_dynamic_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); } private: static constexpr auto buffer_desc_ = - make_dynamic_naive_tensor_descriptor_packed_v2(sequence_to_tuple_of_number(SliceLengths{})); + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); @@ -640,7 +638,7 @@ template < typename SrcVectorTensorContiguousDimOrder, typename std::enable_if::type = false> -struct ThreadwiseDynamicTensorSliceTransfer_v4r1 +struct ThreadwiseTensorSliceTransfer_v4r1 { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -649,12 +647,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4r1 using Index = MultiIndex; - using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); + using SrcCoordIterator = decltype(make_tensor_coordinate_iterator(SrcDesc{}, Index{})); - __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v4r1(const Index& src_ref_idx) - : src_ref_coord_(make_dynamic_tensor_coordinate(SrcDesc{}, src_ref_idx)) + __device__ constexpr ThreadwiseTensorSliceTransfer_v4r1(const Index& src_ref_idx) + : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx)) { static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), "wrong! 
SrcDesc and DstDesc need to known at compile-time"); @@ -712,9 +710,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4r1 I1), SrcVectorTensorContiguousDimOrder{}); - constexpr auto src_vector_desc = make_dynamic_naive_tensor_descriptor_v2( - sequence_to_tuple_of_number(src_vector_tensor_lengths), - sequence_to_tuple_of_number(src_vector_tensor_strides)); + constexpr auto src_vector_desc = + make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(src_vector_tensor_lengths), + sequence_to_tuple_of_number(src_vector_tensor_strides)); // access order and lengths constexpr auto access_lengths = SliceLengths{} / src_vector_tensor_lengths; @@ -735,12 +733,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4r1 src_ref_to_origin_disp_idx + data_to_origin_disp_idx; constexpr auto src_ref_to_data_disp_coord_iterator = - make_dynamic_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx); + make_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx); auto src_data_coord = src_ref_coord_; - move_dynamic_tensor_coordinate( - src_desc, src_data_coord, src_ref_to_data_disp_coord_iterator); + move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_iterator); vector_type_maker_t src_vector; @@ -775,10 +772,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4r1 { constexpr auto src_desc = SrcDesc{}; - const auto src_slice_move_step_iter = make_dynamic_tensor_coordinate_iterator( - src_desc, to_multi_index(src_slice_move_step_idx)); + const auto src_slice_move_step_iter = + make_tensor_coordinate_iterator(src_desc, to_multi_index(src_slice_move_step_idx)); - move_dynamic_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); + move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); } private: diff --git a/composable_kernel/include/utility/config.hpp b/composable_kernel/include/utility/config.hpp index 547d1fadbe..49f1bb7a5a 100644 --- a/composable_kernel/include/utility/config.hpp +++ 
b/composable_kernel/include/utility/config.hpp @@ -99,8 +99,8 @@ // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be // thread-invariant, otherwise it's a bug // TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread" -#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE -#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 +#ifndef CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE +#define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 #endif // workaround for compiler crash when compiling recursive lambda diff --git a/composable_kernel/include/utility/dynamic_buffer.hpp b/composable_kernel/include/utility/dynamic_buffer.hpp index b41639051f..920a716765 100644 --- a/composable_kernel/include/utility/dynamic_buffer.hpp +++ b/composable_kernel/include/utility/dynamic_buffer.hpp @@ -1,5 +1,5 @@ -#ifndef CK_DYNAMIC_BUFFER_HPP -#define CK_DYNAMIC_BUFFER_HPP +#ifndef CK_BUFFER_HPP +#define CK_BUFFER_HPP #include "amd_buffer_addressing.hpp" #include "c_style_pointer_cast.hpp" diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp similarity index 67% rename from composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp rename to composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp index 652ccdb926..1843a0ca64 100644 --- a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp +++ b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp @@ -1,7 +1,7 @@ #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" 
-#include "dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_gemm_dlops_v1r2.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_dlops_v1r2.hpp" #include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" using namespace ck; @@ -64,8 +64,7 @@ constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDs constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HAS_MAIN_KBLOCK_LOOP); constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP); -extern "C" __global__ void -dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare( +extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare( int n, int c, int hi, @@ -93,12 +92,9 @@ dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare( const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; - const auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi)); - const auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x)); - const auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo)); + const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi)); + const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x)); + const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo)); const auto descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad( wei_k_c_y_x_desc, @@ -151,48 +147,48 @@ dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare( using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; 
using GridwiseGemm = - GridwiseDynamicGemmDlops_km_kn_mn_v1r2; + GridwiseGemmDlops_km_kn_mn_v1r2; auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); @@ -216,7 +212,7 @@ extern "C" __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( + convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, @@ -230,11 +226,11 @@ extern "C" __global__ void constexpr auto I2 = Number<2>{}; constexpr auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); constexpr auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 3, 3)); + make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3)); constexpr auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); constexpr auto descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, @@ -287,48 +283,48 @@ extern "C" __global__ void using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; using GridwiseGemm = - GridwiseDynamicGemmDlops_km_kn_mn_v1r2; + GridwiseGemmDlops_km_kn_mn_v1r2; constexpr auto a_k_m0_m1_grid_desc_tmp = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp similarity index 64% rename from 
composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp rename to composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp index d33bc74aa6..d434dab6fe 100644 --- a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp +++ b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp @@ -1,7 +1,7 @@ #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_xdlops_v2r3.hpp" #include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp" using namespace ck; @@ -60,8 +60,7 @@ using CThreadTransferSrcDstAccessOrder = Sequence; using GridwiseGemm = - GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3; auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); @@ -212,7 +208,7 @@ extern "C" __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( + convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, @@ -227,11 +223,11 @@ extern "C" __global__ void constexpr auto I2 = Number<2>{}; constexpr auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); constexpr auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 3, 3)); + make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3)); constexpr auto 
out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); constexpr auto descs = transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, @@ -285,47 +281,47 @@ extern "C" __global__ void using CMNGridDesc = decltype(c_m_n_grid_desc); using GridwiseGemm = - GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3; constexpr auto c_m0_m1_m2_n_grid_desc_tmp = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp similarity index 64% rename from composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp rename to composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp index d946bc63ee..7678a69b12 100644 --- a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp +++ b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp @@ -1,7 +1,7 @@ #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_xdlops_v2r3.hpp" #include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp" using namespace ck; @@ -60,8 +60,7 @@ using CThreadTransferSrcDstAccessOrder = Sequence; using GridwiseGemm = - GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3; auto c_m0_m1_m2_n_grid_desc = 
GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); @@ -212,7 +208,7 @@ extern "C" __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( + convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, @@ -227,11 +223,11 @@ extern "C" __global__ void constexpr auto I2 = Number<2>{}; constexpr auto in_n_hi_wi_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 28, 28, 256)); + make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256)); constexpr auto wei_k_y_x_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 3, 3, 256)); + make_naive_tensor_descriptor_packed(make_tuple(256, 3, 3, 256)); constexpr auto out_n_ho_wo_k_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 28, 28, 256)); + make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256)); constexpr auto descs = transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc, @@ -285,47 +281,47 @@ extern "C" __global__ void using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; using GridwiseGemm = - GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3; constexpr auto c_m0_m1_m2_n_grid_desc_tmp = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp = diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp similarity index 93% rename from composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp rename to 
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp index 90c957bb0b..ac7e1dd6d4 100644 --- a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp +++ b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp @@ -1,7 +1,7 @@ #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_contraction_dlops_v1r2.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_contraction_dlops_v1r2.hpp" #include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp" using namespace ck; @@ -62,22 +62,22 @@ constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HasMainKBloc constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HasDoubleTailKBlockLoop); extern "C" __global__ void -dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(index_t N, - index_t C, - index_t Hi, - index_t Wi, - index_t K, - index_t Y, - index_t X, - index_t ConvStrideH, - index_t ConvStrideW, - index_t ConvDilationH, - index_t ConvDilationW, - index_t InLeftPadH, - index_t InLeftPadW, - index_t InRightPadH, - index_t InRightPadW, - void* p_desc_tuple) +convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(index_t N, + index_t C, + index_t Hi, + index_t Wi, + index_t K, + index_t Y, + index_t X, + index_t ConvStrideH, + index_t ConvStrideW, + index_t ConvDilationH, + index_t ConvDilationW, + index_t InLeftPadH, + index_t InLeftPadW, + index_t InRightPadH, + index_t InRightPadW, + void* p_desc_tuple) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -88,12 +88,9 @@ dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(inde const index_t Wo = (Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1; - const auto 
in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C, Hi, Wi)); - const auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C, Y, X)); - const auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo)); + const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C, Hi, Wi)); + const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C, Y, X)); + const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo)); const auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( wei_k_c_y_x_desc, @@ -160,7 +157,7 @@ dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(inde Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>; using GridwiseContraction = - GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< + GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< BlockSize, FloatAB, FloatAcc, @@ -220,7 +217,7 @@ extern "C" __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( + convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, @@ -232,11 +229,11 @@ extern "C" __global__ void constexpr auto I3 = Number<3>{}; constexpr auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); constexpr auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 3, 3)); + make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3)); constexpr auto out_n_k_ho_wo_desc = - 
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); constexpr auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, @@ -303,7 +300,7 @@ extern "C" __global__ void Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>; using GridwiseContraction = - GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< + GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< BlockSize, FloatAB, FloatAcc, diff --git a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp similarity index 96% rename from host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp rename to host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp index 187a05554b..5f162ec24b 100644 --- a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp @@ -2,7 +2,7 @@ #include "device.hpp" #include "host_tensor.hpp" #include "transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp" -#include "driver_dynamic_gemm_xdlops_v2r3.hpp" +#include "driver_gemm_xdlops_v2r3.hpp" template -void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk( +void device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk( const InLengths& in_n_hi_wi_c_lengths, const WeiLengths& wei_k_y_x_c_lengths, const OutLengths& out_n_ho_wo_k_lengths, @@ -44,12 +44,9 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyx 
wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); - const auto in_n_hi_wi_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); - const auto wei_k_y_x_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); - const auto out_n_ho_wo_k_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths); #if 1 // [M, N, K0, K1] = [128, 128, 4, 4] for fp32 @@ -254,7 +251,7 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyx for(index_t i = 0; i < 5; ++i) { - float ave_time = driver_dynamic_gemm_xdlops_v2r3< + float ave_time = driver_gemm_xdlops_v2r3< BlockSize, TInWei, TAcc, diff --git a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp similarity index 96% rename from host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp rename to host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp index 85c418c52f..82539fdd11 100644 --- a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp @@ -2,7 +2,7 @@ #include "device.hpp" #include "host_tensor.hpp" #include "transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp" -#include "driver_dynamic_gemm_xdlops_v2r3.hpp" 
+#include "driver_gemm_xdlops_v2r3.hpp" template -void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk( +void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk( const InLengths& in_n_hi_wi_c_lengths, const WeiLengths& wei_k_y_x_c_lengths, const OutLengths& out_n_ho_wo_k_lengths, @@ -44,12 +44,9 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); - const auto in_n_hi_wi_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); - const auto wei_k_y_x_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); - const auto out_n_ho_wo_k_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths); #if 0 // [M, N, K0, K1] = [256, 128, 4, 4] for fp32 @@ -226,7 +223,7 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k for(index_t i = 0; i < 5; ++i) { - float ave_time = driver_dynamic_gemm_xdlops_v2r3< + float ave_time = driver_gemm_xdlops_v2r3< BlockSize, TInWei, TAcc, diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp similarity index 94% rename from host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp rename to host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp index c044036a2c..a2af8eab28 100644 --- 
a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp @@ -2,7 +2,7 @@ #include "device.hpp" #include "host_tensor.hpp" #include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" -#include "driver_dynamic_gemm_dlops_v1r2.hpp" +#include "driver_gemm_dlops_v1r2.hpp" template -void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( +void device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( const InLengths& in_n_c_hi_wi_lengths, const WeiLengths& wei_k_c_y_x_lengths, const OutLengths& out_n_k_ho_wo_lengths, @@ -43,12 +43,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data()); out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); - const auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); - const auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); - const auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths); #if 1 // cdata = 64, BlockSize = 256, 128x128x8 @@ -136,7 +133,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( for(index_t i = 0; i < 5; ++i) { - float ave_time = driver_dynamic_gemm_dlops_v1r2< + float ave_time = driver_gemm_dlops_v1r2< BlockSize, TInWei, TAcc, diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp 
b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp similarity index 94% rename from host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp rename to host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp index b6b1cc8969..4a9d01081c 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp @@ -1,7 +1,7 @@ #include #include "device.hpp" #include "host_tensor.hpp" -#include "driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp" +#include "driver_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp" template -void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( +void device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( const InLengths& in_n_c_hi_wi_lengths, const WeiLengths& wei_k_c_y_x_lengths, const OutLengths& out_n_k_ho_wo_lengths, @@ -48,12 +48,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data()); out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); - const auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); - const auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); - const auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths); #if 0 constexpr index_t BlockSize = 256; 
@@ -212,9 +209,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw for(index_t i = 0; i < 5; ++i) { #if 0 - float ave_time = launch_kernel_dynamic_gemm_xdlops_v1 + float ave_time = launch_kernel_gemm_xdlops_v1 #else - float ave_time = launch_kernel_dynamic_gemm_xdlops_v2 + float ave_time = launch_kernel_gemm_xdlops_v2 #endif -void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk( +void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk( const InLengths& in_n_hi_wi_c_lengths, const WeiLengths& wei_k_y_x_c_lengths, const OutLengths& out_n_ho_wo_k_lengths, @@ -44,12 +44,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); - const auto in_n_hi_wi_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); - const auto wei_k_y_x_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); - const auto out_n_ho_wo_k_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths); #if 0 // [M, N, K0, K1] = [128, 128, 8, 1] for fp32 @@ -200,7 +197,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw for(index_t i = 0; i < 5; ++i) { - float ave_time = driver_dynamic_gemm_dlops_v1r3< + float ave_time = driver_gemm_dlops_v1r3< BlockSize, TInWei, TAcc, diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp similarity 
index 94% rename from host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp rename to host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp index 514ff6a3a9..d82fbf69d6 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp @@ -2,7 +2,7 @@ #include "device.hpp" #include "host_tensor.hpp" #include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp" -#include "driver_dynamic_gemm_xdlops_v2r3.hpp" +#include "driver_gemm_xdlops_v2r3.hpp" template -void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw( +void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw( const InLengths& in_n_c_hi_wi_lengths, const WeiLengths& wei_k_c_y_x_lengths, const OutLengths& out_n_k_ho_wo_lengths, @@ -43,12 +43,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data()); out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); - const auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); - const auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); - const auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths); #if 1 // [M, N, K0, K1] = [256, 128, 4, 8] for fp16 @@ -134,7 +131,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk 
for(index_t i = 0; i < 5; ++i) { - float ave_time = driver_dynamic_gemm_xdlops_v2r3< + float ave_time = driver_gemm_xdlops_v2r3< BlockSize, TInWei, TAcc, diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp similarity index 94% rename from host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp rename to host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp index 5310503318..37d89ec5a2 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp @@ -2,7 +2,7 @@ #include "device.hpp" #include "host_tensor.hpp" #include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp" -#include "driver_dynamic_gemm_xdlops_v2r2.hpp" +#include "driver_gemm_xdlops_v2r2.hpp" template -void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk( +void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk( const InLengths& in_n_hi_wi_c_lengths, const WeiLengths& wei_k_y_x_c_lengths, const OutLengths& out_n_ho_wo_k_lengths, @@ -44,12 +44,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nh wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); - const auto in_n_hi_wi_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); - const auto wei_k_y_x_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); - const auto out_n_ho_wo_k_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + const auto in_n_hi_wi_c_desc = 
make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths); #if 1 // [M, N, K0, K1] = [256, 128, 4, 4] for fp32 @@ -155,7 +152,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nh for(index_t i = 0; i < 5; ++i) { - float ave_time = driver_dynamic_gemm_xdlops_v2r2< + float ave_time = driver_gemm_xdlops_v2r2< BlockSize, TInWei, TAcc, diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp similarity index 96% rename from host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp rename to host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp index f2a30fb525..d1671bb87c 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp @@ -2,7 +2,7 @@ #include "device.hpp" #include "host_tensor.hpp" #include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp" -#include "driver_dynamic_gemm_xdlops_v2r3.hpp" +#include "driver_gemm_xdlops_v2r3.hpp" template -void device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk( +void device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk( const InLengths& in_n_hi_wi_c_lengths, const WeiLengths& wei_k_y_x_c_lengths, const OutLengths& out_n_ho_wo_k_lengths, @@ -49,12 +49,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nh wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); 
out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); - const auto in_n_hi_wi_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); - const auto wei_k_y_x_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); - const auto out_n_ho_wo_k_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths); #if 1 // [M, N, K0, K1] = [256, 128, 4, 4] for fp32 @@ -224,7 +221,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nh for(index_t i = 0; i < 5; ++i) { - float ave_time = driver_dynamic_gemm_xdlops_v2r3< + float ave_time = driver_gemm_xdlops_v2r3< BlockSize, TInWei, TAcc, diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp similarity index 96% rename from host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp rename to host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp index 0d49c417de..7a38b569c9 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp @@ -2,7 +2,7 @@ #include "device.hpp" #include "host_tensor.hpp" #include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp" -#include "driver_dynamic_gemm_xdlops_v2r3.hpp" +#include "driver_gemm_xdlops_v2r3.hpp" template -void 
device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( +void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( const InLengths& in_n_hi_wi_c_lengths, const WeiLengths& wei_k_y_x_c_lengths, const OutLengths& out_n_ho_wo_k_lengths, @@ -44,12 +44,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); - const auto in_n_hi_wi_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); - const auto wei_k_y_x_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); - const auto out_n_ho_wo_k_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths); #if 0 // [M, N, K0, K1] = [256, 128, 4, 4] for fp32 @@ -278,7 +275,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh for(index_t i = 0; i < 5; ++i) { - float ave_time = driver_dynamic_gemm_xdlops_v2r3< + float ave_time = driver_gemm_xdlops_v2r3< BlockSize, TInWei, TAcc, diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp similarity index 91% rename from host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp rename to host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp index 583c8a8a79..b5e5f91d59 100644 --- 
a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp @@ -1,8 +1,8 @@ #include #include "device.hpp" #include "host_tensor.hpp" -#include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" -#include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp" +#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" +#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp" template -void device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( +void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( const InLengths& in_n_c_hi_wi_lengths, const WeiLengths& wei_k_c_y_x_lengths, const OutLengths& out_n_k_ho_wo_lengths, @@ -85,12 +85,10 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); - const auto in_n_c0_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi)); - const auto wei_k_c0_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X)); + const auto in_n_c0_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi)); + const auto wei_k_c0_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X)); const auto out_n_k0_ho_wo_k1_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)); + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)); #if 1 // cdata = 64, BlockSize = 64, 16x8x32x4 diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp 
b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp similarity index 95% rename from host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp rename to host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp index 37ea6f1b17..f2a8a1a2b2 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "host_tensor.hpp" #include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp" -#include "driver_dynamic_contraction_dlops_v1r2.hpp" +#include "driver_contraction_dlops_v1r2.hpp" template -void device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( +void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( const InLengths& in_n_c_hi_wi_lengths, const WeiLengths& wei_k_c_y_x_lengths, const OutLengths& out_n_k_ho_wo_lengths, @@ -44,12 +44,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data()); out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); - const auto in_desc_n_c_hi_wi = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); - const auto wei_desc_k_c_y_x = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); - const auto out_desc_n_k_ho_wo = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + const auto in_desc_n_c_hi_wi = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths); + const auto wei_desc_k_c_y_x = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths); + const auto out_desc_n_k_ho_wo = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths); #if 0 // [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] 
for fp32 @@ -180,7 +177,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( for(index_t i = 0; i < 5; ++i) { - float ave_time = driver_dynamic_contraction_dlops_v1r2< + float ave_time = driver_contraction_dlops_v1r2< BlockSize, TInWei, TAcc, diff --git a/host/driver_offline/include/driver_dynamic_contraction_dlops_v1r2.hpp b/host/driver_offline/include/driver_contraction_dlops_v1r2.hpp similarity index 88% rename from host/driver_offline/include/driver_dynamic_contraction_dlops_v1r2.hpp rename to host/driver_offline/include/driver_contraction_dlops_v1r2.hpp index b520be5b6a..fbd1ce4e5e 100644 --- a/host/driver_offline/include/driver_dynamic_contraction_dlops_v1r2.hpp +++ b/host/driver_offline/include/driver_contraction_dlops_v1r2.hpp @@ -1,10 +1,10 @@ -#ifndef DRIVER_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP -#define DRIVER_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP +#ifndef DRIVER_CONTRACTION_DLOPS_V1R2_HPP +#define DRIVER_CONTRACTION_DLOPS_V1R2_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_contraction_dlops_v1r2.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_contraction_dlops_v1r2.hpp" template __host__ float -driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid, - const FloatAB* p_b_grid, - FloatC* p_c_grid, - const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1, - const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1, - const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1, - AGridIteratorHacks, - BGridIteratorHacks, - CGridIteratorHacks, - AGridMoveSliceWindowIteratorHacks, - BGridMoveSliceWindowIteratorHacks, - ck::index_t nrepeat) +driver_contraction_dlops_v1r2(const FloatAB* p_a_grid, + const FloatAB* p_b_grid, + FloatC* p_c_grid, + const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1, + const BGridDesc_GK0_GN0_GN1_GK1& 
b_grid_desc_gk0_gn0_gn1_gk1, + const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks, + ck::index_t nrepeat) { using namespace ck; @@ -70,7 +70,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid, // GEMM using GridwiseContraction = - GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< + GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< BlockSize, FloatAB, FloatAcc, @@ -116,7 +116,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid, a_grid_desc_gk0_gm0_gm1_gk1, b_grid_desc_gk0_gn0_gn1_gk1, c_grid_desc_gm0_gm1_gn0_gn1)) { throw std::runtime_error("wrong! " - "GridwiseDynamicContraction_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_" + "GridwiseContraction_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_" "GM0_GM1_GN0_GN1 has invalid setting"); } @@ -178,7 +178,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid, if(has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = kernel_dynamic_contraction_dlops_v1r2< + const auto kernel = kernel_contraction_dlops_v1r2< GridwiseContraction, FloatAB, FloatC, @@ -204,7 +204,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid, } else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { - const auto kernel = kernel_dynamic_contraction_dlops_v1r2< + const auto kernel = kernel_contraction_dlops_v1r2< GridwiseContraction, FloatAB, FloatC, @@ -230,7 +230,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid, } else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = kernel_dynamic_contraction_dlops_v1r2< + const auto kernel = kernel_contraction_dlops_v1r2< GridwiseContraction, FloatAB, FloatC, @@ -256,7 +256,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid, } else { - const auto kernel = 
kernel_dynamic_contraction_dlops_v1r2< + const auto kernel = kernel_contraction_dlops_v1r2< GridwiseContraction, FloatAB, FloatC, diff --git a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp similarity index 92% rename from host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp rename to host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp index 693045cd16..6f4db5ff7b 100644 --- a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp @@ -1,10 +1,10 @@ -#ifndef DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP -#define DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP +#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP +#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_gemm_dlops_v2.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_dlops_v2.hpp" #include "gridwise_operation_wrapper.hpp" template - __host__ void Run(const ck::DynamicTensorDescriptor& wei_k_c_y_x_global_desc, - const ck::DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, - const ck::DynamicTensorDescriptor& out_n_k0_ho_wo_k1_global_desc, + __host__ void Run(const ck::TensorDescriptor& wei_k_c_y_x_global_desc, + const ck::TensorDescriptor& in_n_c_hi_wi_global_desc, + const ck::TensorDescriptor& out_n_k0_ho_wo_k1_global_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const 
InLeftPads& in_left_pads, @@ -82,14 +82,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad const auto InRightPadW = in_right_pads[I1]; // weight tensor - const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + const auto wei_e_k_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); // input tensor - const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor( + const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( in_n_c_hi_wi_global_desc, make_tuple(make_pass_through_transform(N), make_pass_through_transform(C), @@ -98,7 +98,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor( + const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( in_n_c_hip_wip_global_desc, make_tuple( make_pass_through_transform(N), @@ -108,7 +108,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); - const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor( + const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor( in_n_c_y_ho_x_wo_global_desc, make_tuple(make_merge_transform(make_tuple(C, Y, X)), make_pass_through_transform(N), @@ -118,8 +118,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad 
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); // output tensor - const auto out_k_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)), + const auto out_k_n_ho_wo_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)), make_tuple(make_merge_transform(make_tuple(K0, K1)), make_pass_through_transform(N), make_pass_through_transform(Ho), @@ -169,7 +169,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad #if 1 // GEMM - using gridwise_gemm = GridwiseDynamicGemmDlops_km_kn_mn_v3< + using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3< BlockSize, FloatAB, FloatAcc, diff --git a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp similarity index 92% rename from host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp rename to host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp index 2238b355f9..1b7179173c 100644 --- a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp +++ b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp @@ -1,10 +1,10 @@ -#ifndef DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP -#define DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP +#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP +#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include 
"dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_gemm_dlops_v2.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_dlops_v2.hpp" #include "gridwise_operation_wrapper.hpp" template - __host__ void Run(const ck::DynamicTensorDescriptor& wei_k_c_y_x_global_desc, - const ck::DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, - const ck::DynamicTensorDescriptor& out_n_k0_ho_wo_k1_global_desc, + __host__ void Run(const ck::TensorDescriptor& wei_k_c_y_x_global_desc, + const ck::TensorDescriptor& in_n_c_hi_wi_global_desc, + const ck::TensorDescriptor& out_n_k0_ho_wo_k1_global_desc, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, @@ -93,14 +93,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp << std::endl; // weight tensor - const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + const auto wei_e_k_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); // input tensor - const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor( + const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( in_n_c_hi_wi_global_desc, make_tuple(make_pass_through_transform(N), make_pass_through_transform(C), @@ -109,7 +109,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor( + const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( 
in_n_c_hip_wip_global_desc, make_tuple( make_pass_through_transform(N), @@ -119,7 +119,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); - const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor( + const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor( in_n_c_y_ho_x_wo_global_desc, make_tuple(make_merge_transform(make_tuple(C, Y, X)), make_pass_through_transform(N), @@ -129,8 +129,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); // output tensor - const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor( - make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)), + const auto out_k_n_hop_wop_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)), make_tuple(make_merge_transform(make_tuple(K0, K1)), make_pass_through_transform(N), make_pad_transform(Ho, 0, OutRightPadH), @@ -181,7 +181,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp Sequence<0, 0, 0, 0, 0>{})); // GEMM - using gridwise_gemm = GridwiseDynamicGemmDlops_km_kn_mn_v3< + using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3< BlockSize, FloatAB, FloatAcc, diff --git a/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r2.hpp b/host/driver_offline/include/driver_gemm_dlops_v1r2.hpp similarity index 56% rename from host/driver_offline/include/driver_dynamic_gemm_dlops_v1r2.hpp rename to host/driver_offline/include/driver_gemm_dlops_v1r2.hpp index 29a72502d5..114f31e760 100644 --- a/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r2.hpp +++ b/host/driver_offline/include/driver_gemm_dlops_v1r2.hpp @@ -1,10 +1,10 @@ -#ifndef 
DRIVER_DYNAMIC_GEMM_DLOPS_V1R2 -#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R2 +#ifndef DRIVER_GEMM_DLOPS_V1R2 +#define DRIVER_GEMM_DLOPS_V1R2 #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_gemm_dlops_v1r2.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_dlops_v1r2.hpp" template -__host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, - const FloatAB* p_b_grid, - FloatC* p_c_grid, - const AKMGridDesc& a_k_m_grid_desc, - const BKNGridDesc& b_k_n_grid_desc, - const CMNGridDesc& c_m_n_grid_desc, - AGridIteratorHacks, - BGridIteratorHacks, - CGridIteratorHacks, - AGridMoveSliceWindowIteratorHacks, - BGridMoveSliceWindowIteratorHacks, - ck::index_t nrepeat) +__host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid, + const FloatAB* p_b_grid, + FloatC* p_c_grid, + const AKMGridDesc& a_k_m_grid_desc, + const BKNGridDesc& b_k_n_grid_desc, + const CMNGridDesc& c_m_n_grid_desc, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks, + ck::index_t nrepeat) { using namespace ck; @@ -72,49 +72,48 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, constexpr auto I5 = Number<5>{}; // GEMM - using GridwiseGemm = - GridwiseDynamicGemmDlops_km_kn_mn_v1r2; + using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v1r2; const auto M = a_k_m_grid_desc.GetLength(I1); const auto N = b_k_n_grid_desc.GetLength(I1); @@ -122,8 +121,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, if(!GridwiseGemm::CheckValidity(a_k_m_grid_desc, b_k_n_grid_desc, c_m_n_grid_desc)) { - throw std::runtime_error( - "wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r2 has invalid setting"); + throw std::runtime_error("wrong! 
GridwiseGemmDlops_km_kn_mn_v1r2 has invalid setting"); } const auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); @@ -174,15 +172,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, if(has_main_k_block_loop && has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r2, - remove_reference_t, - remove_reference_t, - remove_reference_t, - true, - true>; + kernel_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; ave_time = launch_and_time_kernel(kernel, nrepeat, @@ -200,15 +198,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r2, - remove_reference_t, - remove_reference_t, - remove_reference_t, - true, - false>; + kernel_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + false>; ave_time = launch_and_time_kernel(kernel, nrepeat, @@ -226,15 +224,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r2, - remove_reference_t, - remove_reference_t, - remove_reference_t, - false, - true>; + kernel_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; ave_time = launch_and_time_kernel(kernel, nrepeat, @@ -252,15 +250,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, else { const auto kernel = - kernel_dynamic_gemm_dlops_v1r2, - remove_reference_t, - remove_reference_t, - remove_reference_t, - false, - false>; + kernel_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; ave_time = launch_and_time_kernel(kernel, nrepeat, @@ -295,15 +293,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* 
p_a_grid, if(has_main_k_block_loop && has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r2, - remove_reference_t, - remove_reference_t, - remove_reference_t, - true, - true>; + kernel_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; ave_time = launch_and_time_kernel( kernel, @@ -324,15 +322,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r2, - remove_reference_t, - remove_reference_t, - remove_reference_t, - true, - false>; + kernel_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + false>; ave_time = launch_and_time_kernel( kernel, @@ -353,15 +351,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r2, - remove_reference_t, - remove_reference_t, - remove_reference_t, - false, - true>; + kernel_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; ave_time = launch_and_time_kernel( kernel, @@ -382,15 +380,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, else { const auto kernel = - kernel_dynamic_gemm_dlops_v1r2, - remove_reference_t, - remove_reference_t, - remove_reference_t, - false, - false>; + kernel_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; ave_time = launch_and_time_kernel( kernel, diff --git a/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r3.hpp b/host/driver_offline/include/driver_gemm_dlops_v1r3.hpp similarity index 57% rename from host/driver_offline/include/driver_dynamic_gemm_dlops_v1r3.hpp rename to host/driver_offline/include/driver_gemm_dlops_v1r3.hpp index 242bcfb28b..a9350bf0f8 100644 --- 
a/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r3.hpp +++ b/host/driver_offline/include/driver_gemm_dlops_v1r3.hpp @@ -1,10 +1,10 @@ -#ifndef DRIVER_DYNAMIC_GEMM_DLOPS_V1R3 -#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R3 +#ifndef DRIVER_GEMM_DLOPS_V1R3 +#define DRIVER_GEMM_DLOPS_V1R3 #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_gemm_dlops_v1r3.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_dlops_v1r3.hpp" template -__host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, - const FloatAB* p_b_grid, - FloatC* p_c_grid, - const AK0MK1GridDesc& a_k0_m_k1_grid_desc, - const BK0NK1GridDesc& b_k0_n_k1_grid_desc, - const CMNGridDesc& c_m_n_grid_desc, - AGridIteratorHacks, - BGridIteratorHacks, - CGridIteratorHacks, - AGridMoveSliceWindowIteratorHacks, - BGridMoveSliceWindowIteratorHacks, - ck::index_t nrepeat) +__host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid, + const FloatAB* p_b_grid, + FloatC* p_c_grid, + const AK0MK1GridDesc& a_k0_m_k1_grid_desc, + const BK0NK1GridDesc& b_k0_n_k1_grid_desc, + const CMNGridDesc& c_m_n_grid_desc, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks, + ck::index_t nrepeat) { using namespace ck; @@ -69,44 +69,44 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, // GEMM using GridwiseGemm = - GridwiseDynamicGemmDlops_km_kn_mn_v1r3; + GridwiseGemmDlops_km_kn_mn_v1r3; const auto M = a_k0_m_k1_grid_desc.GetLength(I1); const auto N = b_k0_n_k1_grid_desc.GetLength(I1); @@ -114,8 +114,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc)) { - throw std::runtime_error( - "wrong! 
GridwiseDynamicGemmDlops_km_kn_mn_v1r3 has invalid setting"); + throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r3 has invalid setting"); } const auto a_k0_m0_m1_k1_grid_desc = @@ -170,15 +169,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, if(has_main_k_block_loop && has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r3, - remove_reference_t, - remove_reference_t, - remove_reference_t, - true, - true>; + kernel_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; ave_time = launch_and_time_kernel(kernel, nrepeat, @@ -196,15 +195,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r3, - remove_reference_t, - remove_reference_t, - remove_reference_t, - true, - false>; + kernel_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + false>; ave_time = launch_and_time_kernel(kernel, nrepeat, @@ -222,15 +221,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r3, - remove_reference_t, - remove_reference_t, - remove_reference_t, - false, - true>; + kernel_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; ave_time = launch_and_time_kernel(kernel, nrepeat, @@ -248,15 +247,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, else { const auto kernel = - kernel_dynamic_gemm_dlops_v1r3, - remove_reference_t, - remove_reference_t, - remove_reference_t, - false, - false>; + kernel_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; ave_time = launch_and_time_kernel(kernel, nrepeat, @@ -291,15 +290,15 @@ __host__ float 
driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, if(has_main_k_block_loop && has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r3, - remove_reference_t, - remove_reference_t, - remove_reference_t, - true, - true>; + kernel_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; ave_time = launch_and_time_kernel( kernel, @@ -322,15 +321,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r3, - remove_reference_t, - remove_reference_t, - remove_reference_t, - true, - false>; + kernel_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + false>; ave_time = launch_and_time_kernel( kernel, @@ -353,15 +352,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { const auto kernel = - kernel_dynamic_gemm_dlops_v1r3, - remove_reference_t, - remove_reference_t, - remove_reference_t, - false, - true>; + kernel_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; ave_time = launch_and_time_kernel( kernel, @@ -384,15 +383,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, else { const auto kernel = - kernel_dynamic_gemm_dlops_v1r3, - remove_reference_t, - remove_reference_t, - remove_reference_t, - false, - false>; + kernel_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; ave_time = launch_and_time_kernel( kernel, diff --git a/host/driver_offline/include/driver_dynamic_gemm_xdlops_v2r3.hpp b/host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp similarity index 50% rename from host/driver_offline/include/driver_dynamic_gemm_xdlops_v2r3.hpp rename to host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp 
index 85f5e27b8d..c29dbdae69 100644 --- a/host/driver_offline/include/driver_dynamic_gemm_xdlops_v2r3.hpp +++ b/host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp @@ -1,10 +1,10 @@ -#ifndef DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3 -#define DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3 +#ifndef DRIVER_GEMM_XDLOPS_V2R3 +#define DRIVER_GEMM_XDLOPS_V2R3 #include "common_header.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_xdlops_v2r3.hpp" template -__host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid, - const FloatAB* p_b_grid, - FloatC* p_c_grid, - const AK0MK1GridDesc& a_k0_m_k1_grid_desc, - const BK0NK1GridDesc& b_k0_n_k1_grid_desc, - const CMNGridDesc& c_m_n_grid_desc, - AGridIteratorHacks, - BGridIteratorHacks, - CGridIteratorHacks, - AGridMoveSliceWindowIteratorHacks, - BGridMoveSliceWindowIteratorHacks, - ck::index_t nrepeat) +__host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid, + const FloatAB* p_b_grid, + FloatC* p_c_grid, + const AK0MK1GridDesc& a_k0_m_k1_grid_desc, + const BK0NK1GridDesc& b_k0_n_k1_grid_desc, + const CMNGridDesc& c_m_n_grid_desc, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks, + ck::index_t nrepeat) { using namespace ck; @@ -68,47 +68,47 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid, constexpr auto I2 = Number<2>{}; using GridwiseGemm = - GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3; { std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", " @@ -126,7 +126,7 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid, if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc)) { throw std::runtime_error( 
- "wrong! GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"); + "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"); } const auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); @@ -139,13 +139,13 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid, const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc); - const auto kernel = kernel_dynamic_gemm_xdlops_v2r3, - remove_reference_t, - remove_reference_t, - remove_reference_t>; + const auto kernel = kernel_gemm_xdlops_v2r3, + remove_reference_t, + remove_reference_t, + remove_reference_t>; #if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE float ave_time = launch_and_time_kernel(kernel, diff --git a/host/driver_offline/src/conv_bwd_driver_offline.cpp b/host/driver_offline/src/conv_bwd_driver_offline.cpp index 34fa7eb3fa..67cea94813 100644 --- a/host/driver_offline/src/conv_bwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_bwd_driver_offline.cpp @@ -12,10 +12,10 @@ #include "conv_common.hpp" #include "host_conv_bwd_data.hpp" #include "device_tensor.hpp" -#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp" -#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp" +#include "device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp" +#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp" -#define USE_DYNAMIC_MODE 1 +#define USE_MODE 1 #define USE_CONV_BWD_V4R1_XDL_NHWC 1 #define USE_CONV_BWD_V4R1R2_XDL_NHWC 1 @@ -37,7 +37,7 @@ int main(int argc, char* argv[]) constexpr auto I5 = Number<5>{}; constexpr auto I6 = Number<6>{}; -#if USE_DYNAMIC_MODE +#if USE_MODE // dynamic mode if(argc != 22) { @@ -212,7 +212,7 @@ int main(int argc, char* argv[]) } auto f_make_for_device_nhwc = [&]() { -#if USE_DYNAMIC_MODE +#if USE_MODE const auto in_lengths_dev = make_tuple(N, 
Hi, Wi, C); const auto wei_lengths_dev = make_tuple(K, Y, X, C); const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); @@ -253,20 +253,20 @@ int main(int argc, char* argv[]) const auto tmp = f_make_for_device_nhwc(); - device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk< - in_data_t, - acc_data_t, - out_data_t>(tmp[I0], - tmp[I1], - tmp[I2], - tmp[I3], - tmp[I4], - tmp[I5], - tmp[I6], - in_device, - wei, - out, - nrepeat); + device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk( + tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in_device, + wei, + out, + nrepeat); } #endif @@ -280,20 +280,20 @@ int main(int argc, char* argv[]) const auto tmp = f_make_for_device_nhwc(); - device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk< - in_data_t, - acc_data_t, - out_data_t>(tmp[I0], - tmp[I1], - tmp[I2], - tmp[I3], - tmp[I4], - tmp[I5], - tmp[I6], - in_device, - wei, - out, - nrepeat); + device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk( + tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in_device, + wei, + out, + nrepeat); } #endif diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp index 968501e947..2653929c32 100644 --- a/host/driver_offline/src/conv_fwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp @@ -12,14 +12,14 @@ #include "conv_common.hpp" #include "host_conv.hpp" #include "device_tensor.hpp" -#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp" -#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp" -#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp" -#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" -#include 
"device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp" -#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" +#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp" +#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp" +#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp" +#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" +#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp" +#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" -#define USE_DYNAMIC_MODE 1 +#define USE_MODE 1 #define USE_CONV_FWD_V4R4_NCHW 0 #define USE_CONV_FWD_V4R4R2_NHWC 1 #define USE_CONV_FWD_V6R1_NCHW 1 @@ -49,7 +49,7 @@ int main(int argc, char* argv[]) constexpr auto I5 = Number<5>{}; constexpr auto I6 = Number<6>{}; -#if USE_DYNAMIC_MODE +#if USE_MODE // dynamic mode if(argc != 22) { @@ -228,7 +228,7 @@ int main(int argc, char* argv[]) } auto f_make_for_device_nchw = [&]() { -#if USE_DYNAMIC_MODE +#if USE_MODE const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); const auto wei_lengths_dev = make_tuple(K, C, Y, X); const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); @@ -260,7 +260,7 @@ int main(int argc, char* argv[]) }; auto f_make_for_device_nhwc = [&]() { -#if USE_DYNAMIC_MODE +#if USE_MODE const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); const auto wei_lengths_dev = make_tuple(K, Y, X, C); const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); @@ -301,20 +301,19 @@ int main(int argc, char* argv[]) const auto tmp = f_make_for_device_nchw(); - device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( - tmp[I0], - tmp[I1], - tmp[I2], - tmp[I3], - tmp[I4], - tmp[I5], - tmp[I6], - in, - wei, - out_device, - nrepeat); + device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(tmp[I0], + tmp[I1], + tmp[I2], + 
tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); } #endif @@ -328,20 +327,19 @@ int main(int argc, char* argv[]) const auto tmp = f_make_for_device_nhwc(); - device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk( - tmp[I0], - tmp[I1], - tmp[I2], - tmp[I3], - tmp[I4], - tmp[I5], - tmp[I6], - in, - wei, - out_device, - nrepeat); + device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); } #endif @@ -355,20 +353,19 @@ int main(int argc, char* argv[]) const auto tmp = f_make_for_device_nchw(); - device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( - tmp[I0], - tmp[I1], - tmp[I2], - tmp[I3], - tmp[I4], - tmp[I5], - tmp[I6], - in, - wei, - out_device, - nrepeat); + device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); } #endif @@ -382,21 +379,20 @@ int main(int argc, char* argv[]) const auto tmp = f_make_for_device_nchw(); - device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( - tmp[I0], - tmp[I1], - tmp[I2], - tmp[I3], - tmp[I4], - tmp[I5], - tmp[I6], - in, - wei, - out_device, - nrepeat); + device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); } #endif @@ -410,9 +406,9 @@ int main(int argc, char* argv[]) const auto tmp = f_make_for_device_nchw(); - device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw( + device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw( tmp[I0], tmp[I1], tmp[I2], @@ -437,9 +433,9 @@ int main(int argc, char* argv[]) const auto tmp = f_make_for_device_nhwc(); - device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( 
+ device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( tmp[I0], tmp[I1], tmp[I2], diff --git a/host/host_tensor/include/conv_common.hpp b/host/host_tensor/include/conv_common.hpp index ca95c1f138..4bf2c23494 100644 --- a/host/host_tensor/include/conv_common.hpp +++ b/host/host_tensor/include/conv_common.hpp @@ -1,7 +1,7 @@ #ifndef CONV_COMMON_HPP #define CONV_COMMON_HPP -#include "dynamic_tensor_descriptor.hpp" +#include "tensor_descriptor.hpp" enum ConvTensorLayout { @@ -19,8 +19,8 @@ template constexpr auto get_convolution_output_default_4d_tensor_descriptor( - const ck::DynamicTensorDescriptor& in_desc, - const ck::DynamicTensorDescriptor& wei_desc, + const ck::TensorDescriptor& in_desc, + const ck::TensorDescriptor& wei_desc, const ConvStrides& conv_strides, const ConvDilations conv_dilations, const LeftPads& left_pads, @@ -57,7 +57,7 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor( const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1; const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1; - return make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo)); + return make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo)); } template