From 970fa3e92ec4e67cfbfe1b0428e84870663ab8cd Mon Sep 17 00:00:00 2001 From: zjing14 Date: Thu, 18 Nov 2021 08:34:07 -0600 Subject: [PATCH] v5r1 fusion kernels for inference (#49) * init * refactor for 1x1 * rename e0_e1 * add e1 with bugs * debug * fixed * fixed e1 * add timer * imprve threadwise gemm with dot2 * add e2 * tuning * seperate c2 * add nhwc * restore nchwc * clean * opt * fixed; tuning * add BGlobalMoveSliceWindowStepHacks{} * tuning * repeat running * adjust * merge v5r1 nchwc * add adaptors * split k0 k1 in c_thread_grid * split h and w * remove v5r1 nhwc * clean for pr * remove host_conv_add * clean code * clean * add dynamic support * static mode * test static * add conv+add fusion * fixed validation * naming fix * use activ_enum * make static * refactor conv_add for InMem::add * add bias * add conv_out * add configurable makeddesc * add maxpool fusion * add maxpool host for validation * enable static desc * conv-only use v5r1_add * test * test * for binary dumps * fixed incorrect results due to typo * clean * debugging maxpool * workaround with offset trick * clean code * modularize ops of fusion * add gridwise_gemm_v3 * create seperate fusion fun * enable dynamic mode of conv and conv+resize_add * add dynamic mode of maxpool * add pass by point * add activ_type as arguments * merge develop * clean * reset config to old default Co-authored-by: Chao Liu --- .../blockwise_gemm_dlops_v3.hpp | 186 +- .../gridwise_gemm_dlops_v3.hpp | 1920 +++++++++++++++++ .../threadwise_gemm_dlops_v3.hpp | 186 +- .../threadwise_tensor_slice_transfer.hpp | 35 + .../include/utility/amd_buffer_addressing.hpp | 8 + composable_kernel/include/utility/config.hpp | 11 +- host/driver_offline/CMakeLists.txt | 9 + ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 220 ++ ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 196 ++ ...mplicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp | 190 -- ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 212 ++ ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 
565 +++++ ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 500 +++++ ...mplicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp | 349 --- ..._gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp | 364 ---- ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 569 +++++ .../src/conv_add_fwd_driver_offline_nchwc.cpp | 414 ++++ .../src/conv_fwd_driver_offline.cpp | 48 +- .../src/conv_fwd_driver_offline_nchwc.cpp | 391 ++++ .../conv_maxpool_fwd_driver_offline_nchwc.cpp | 413 ++++ host/host_tensor/include/conv_common.hpp | 13 + host/host_tensor/include/host_tensor.hpp | 12 + 22 files changed, 5682 insertions(+), 1129 deletions(-) create mode 100644 composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v3.hpp create mode 100644 host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp create mode 100644 host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp delete mode 100644 host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_offline/include/device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp create mode 100644 host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp create mode 100644 host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp delete mode 100644 host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp delete mode 100644 host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp create mode 100644 host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp create mode 100644 host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp create mode 100644 host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp 
create mode 100644 host/driver_offline/src/conv_maxpool_fwd_driver_offline_nchwc.cpp diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp index 5cc2f2393e..3df0497f61 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp @@ -10,99 +10,99 @@ template + index_t KPerThreadLoop> struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 { - struct MatrixIndex - { - index_t k; - index_t h; - index_t w; - }; + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; - // HACK: fix this @Jing Zhang - static constexpr index_t KPerThreadSubC = 4; + using AIndex = MultiIndex<3>; + using BIndex = MultiIndex<3>; + using CIndex = MultiIndex<4>; + + static constexpr auto E1 = ABlockDesc_E1_K1_E2{}.GetLength(I0); + static constexpr auto KPerBlock = ABlockDesc_E1_K1_E2{}.GetLength(I1); + static constexpr auto E2 = ABlockDesc_E1_K1_E2{}.GetLength(I2); + + static constexpr auto HoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2); + static constexpr auto WoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3); + + static constexpr auto KPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I0); + static constexpr auto HoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I2); + static constexpr auto WoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I3); static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); + make_tuple(Number{}, Number{}, Number{})); - static constexpr auto b_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple( - Number{}, Number<1>{}, Number{}, Number{})); + static constexpr auto b_thread_mtx_ = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number<1>{}, + 
Number{}, + Number{}, + Number{})); static constexpr auto c_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple( - Number{}, Number<1>{}, Number{}, Number{})); - - using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1>, - 1, - ThreadGemmADataPerRead_K, - 1>; + Number{}, Number<1>{}, Number{}, Number{})); __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3() - : c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())}, - a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)} + : c_thread_origin_data_idx_{GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id())}, + a_thread_copy_{make_tuple(0, c_thread_origin_data_idx_[I0] * KPerThread, 0)} { - static_assert(BlockMatrixA::IsKnownAtCompileTime() && - BlockMatrixB::IsKnownAtCompileTime() && - ThreadMatrixC::IsKnownAtCompileTime(), + static_assert(ABlockDesc_E1_K1_E2::IsKnownAtCompileTime() && + BBlockDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() && + CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time"); - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; + static_assert( + ABlockDesc_E1_K1_E2{}.GetLength(I0) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I0) && + ABlockDesc_E1_K1_E2{}.GetLength(I2) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I4), + "wrong! E dimension not consistent\n"); - static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0), - "wrong! 
K dimension not consistent\n"); + static_assert(E1 % EPerThreadLoop == 0, ""); + static_assert(KPerThread % KPerThreadLoop == 0, ""); - constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed - constexpr index_t H = BlockMatrixB{}.GetLength(I2); - constexpr index_t W = BlockMatrixB{}.GetLength(I3); - - static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0, + static_assert(KPerBlock % KPerThread == 0 && HoPerBlock % HoPerThread == 0 && + WoPerBlock % WoPerThread == 0, "wrong! Cannot evenly divide work among\n"); - constexpr auto KThreadCluster = K / KPerThread; - constexpr auto HThreadCluster = H / HPerThread; - constexpr auto WThreadCluster = W / WPerThread; + constexpr auto KThreadCluster = KPerBlock / KPerThread; + constexpr auto HThreadCluster = HoPerBlock / HoPerThread; + constexpr auto WThreadCluster = WoPerBlock / WoPerThread; static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster, "wrong! wrong blocksize\n"); } - __device__ static constexpr auto GetThreadMatrixCLengths() + __device__ static constexpr auto GetCThreadDesc_K_N_Ho_WoLengths() { - return Sequence{}; + return Sequence{}; } - __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) + __device__ static CIndex GetBeginOfCThreadDesc_K_N_Ho_Wo(index_t thread_id) { - constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{}); - constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{}); + constexpr auto K0 = KPerBlock / KPerThread; + constexpr auto N0 = I1; + constexpr auto H0 = HoPerBlock / HoPerThread; + constexpr auto W0 = WoPerBlock / WoPerThread; - constexpr auto num_w_threads = W / WPerThread; - constexpr auto num_h_threads = H / HPerThread; - constexpr auto num_hw_threads = num_w_threads * num_h_threads; + constexpr auto c_threadid_to_k_n_h_w_thread_cluster_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))), + make_tuple(Sequence<0, 1, 2, 3>{}), + 
make_tuple(Sequence<0>{})); - index_t k_thread_id = thread_id / num_hw_threads; - index_t hw_thread_id = thread_id % num_hw_threads; + const auto c_k_n_h_w_thread_cluster_idx = + c_threadid_to_k_n_h_w_thread_cluster_adaptor.CalculateBottomIndex( + make_multi_index(thread_id)); - index_t h_thread_id = hw_thread_id / num_w_threads; - index_t w_thread_id = hw_thread_id % num_w_threads; - - return MatrixIndex{k_thread_id, h_thread_id, w_thread_id}; + return c_k_n_h_w_thread_cluster_idx; } template @@ -116,19 +116,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 is_same, remove_cvref_t>::value && "wrong! inconsistent type"); - constexpr auto I0 = Number<0>{}; - - constexpr auto a_block_mtx = BlockMatrixA{}; - - constexpr auto EPerBlock = a_block_mtx.GetLength(I0); - - // HACK: fix this @Jing Zhang - constexpr auto HoPerThreadSubC = 2; - constexpr auto WoPerThreadSubC = 2; - - static_assert(KPerThread % KPerThreadSubC == 0, ""); - static_assert(HPerThread % HoPerThreadSubC == 0, ""); - static_assert(WPerThread % WoPerThreadSubC == 0, ""); + constexpr auto a_block_mtx = ABlockDesc_E1_K1_E2{}; // thread A buffer for GEMM StaticBuffer @@ -139,42 +127,46 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 FloatC, decltype(a_thread_mtx_), decltype(b_thread_mtx_), - decltype(c_thread_mtx_), - HoPerThreadSubC, - WoPerThreadSubC>{}; + decltype(c_thread_mtx_)>{}; - static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) { - static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) { + static_for<0, E1, EPerThreadLoop>{}([&](auto e_begin) { + static_for<0, KPerThread, KPerThreadLoop>{}([&](auto k_begin) { a_thread_copy_.Run(a_block_mtx, - make_tuple(e_begin, k_begin), + make_tuple(e_begin, k_begin, I0), a_block_buf, a_thread_mtx_, - make_tuple(I0, I0), + make_tuple(I0, I0, I0), a_thread_buf); - static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) { - static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) { - threadwise_gemm.Run(a_thread_buf, - make_tuple(I0, 
I0), - b_thread_buf, - make_tuple(e_begin, I0, h_begin, w_begin), - c_thread_buf, - make_tuple(k_begin, I0, h_begin, w_begin)); - }); - }); + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(e_begin, I0, I0, I0, I0), + c_thread_buf, + make_tuple(k_begin, I0, I0, I0)); }); }); } template - __device__ void MoveASliceWindow(const BlockMatrixA&, - const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx) + __device__ void MoveABlockSliceWindow(const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx) { - a_thread_copy_.MoveSrcSliceWindow(BlockMatrixA{}, a_block_slice_move_step_idx); + a_thread_copy_.MoveSrcSliceWindow(ABlockDesc_E1_K1_E2{}, a_block_slice_move_step_idx); } private: - MatrixIndex c_thread_begin_mtx_idx_; + using AThreadCopy = + ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2>, + 2, + E2, + E2>; + + CIndex c_thread_origin_data_idx_; AThreadCopy a_thread_copy_; }; diff --git a/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v3.hpp new file mode 100644 index 0000000000..1d8a110e22 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v3.hpp @@ -0,0 +1,1920 @@ +#ifndef CK_GRIDWISE_GEMM_V3_HPP +#define CK_GRIDWISE_GEMM_V3_HPP + +#include "common_header.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "blockwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_set.hpp" +#include "blockwise_gemm_dlops_v3.hpp" + +namespace ck { + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + 
FloatC* __restrict__ p_c_grid, + const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W c_blockid_to_k_n_h_w_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::ConvBiasActiv(p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3_resize_add( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_d_grid, + const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W c_blockid_to_k_n_h_w_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid, + p_b_grid, + p_bias_grid, + p_d_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + 
c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3_maxpool( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_c_grid, + FloatC* __restrict__ p_d_grid, + const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W c_blockid_to_k_n_h_w_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::ConvBiasActivMaxpool(p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_d_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER +// pass tensor descriptor by CONSTANT void pointer +// CONSTANT is needed to inform compiler void pointers in the kernel signature are pointing to +// non-modifiable parameter address space, so compiler can enable corresponding optimization +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_c_grid, + const 
void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc, + const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor) +{ + // first cast void CONSTANT void* to void* + // second cast void* to Desc* + // the copy constructor of tensor descriptor doesn't take address_space(4) + const auto a_e0_e1_k0_k1_e2_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_a_e0_e1_k0_k1_e2_grid_desc)); + const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc)); + const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc)); + const auto c_blockid_to_k_n_h_w_block_cluster_adaptor = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor)); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::ConvBiasActiv(p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +// pass tensor descriptor by CONSTANT void pointer +// CONSTANT is needed to inform compiler void pointers in the kernel signature are pointing to +// non-modifiable parameter address space, so compiler can enable corresponding optimization +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3_resize_add( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* 
__restrict__ p_bias_grid, + FloatC* __restrict__ p_d_grid, + const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc, + const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const void CONSTANT* p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor) +{ + // first cast void CONSTANT void* to void* + // second cast void* to Desc* + // the copy constructor of tensor descriptor doesn't take address_space(4) + const auto a_e0_e1_k0_k1_e2_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_a_e0_e1_k0_k1_e2_grid_desc)); + const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc)); + const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc)); + const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc)); + const auto c_blockid_to_k_n_h_w_block_cluster_adaptor = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor)); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid, + p_b_grid, + p_bias_grid, + p_d_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3_maxpool( + 
const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_c_grid, + FloatC* __restrict__ p_d_grid, + const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc, + const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const void CONSTANT* p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor) +{ + // first cast void CONSTANT void* to void* + // second cast void* to Desc* + // the copy constructor of tensor descriptor doesn't take address_space(4) + const auto a_e0_e1_k0_k1_e2_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_a_e0_e1_k0_k1_e2_grid_desc)); + const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc)); + const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc)); + const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc)); + const auto c_blockid_to_k_n_h_w_block_cluster_adaptor = + *reinterpret_cast( + cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor)); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::ConvBiasActivMaxpool(p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_d_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} +#elif 
CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3_resize_add(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_d_grid) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{}; + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{}; + constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{}; + constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx{}; + constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor = + CBlockIdToBlockClusterAdaptor_K_N_H_W{}; + + GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid, + p_b_grid, + p_bias_grid, + p_d_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3_maxpool(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_c_grid, + FloatC* __restrict__ p_d_grid) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{}; + constexpr auto 
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{}; + constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{}; + constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx{}; + constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor = + CBlockIdToBlockClusterAdaptor_K_N_H_W{}; + + GridwiseGemm::ConvBiasActivMaxpool(p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_d_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_c_grid) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{}; + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{}; + constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{}; + constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor = + CBlockIdToBlockClusterAdaptor_K_N_H_W{}; + + GridwiseGemm::ConvBiasActiv(p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} +#endif + +template +struct GridwiseGemmDlops_km_kn_mn_v3 +{ + + static constexpr 
auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + static constexpr auto E1 = Number{}; + static constexpr auto E2 = Number{}; + static constexpr auto K2 = Number{}; + + static constexpr auto NPerBlock = I1; + + static constexpr FloatAcc alpha = 0.3; + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto max_lds_align = Number{}; + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_e0_e1_k1_e2_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(I1, Number{}, Number{}, Number{}), max_lds_align); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = math::integer_least_multiple( + a_e0_e1_k1_e2_block_desc.GetElementSpaceSize(), max_lds_align); + + return a_block_space_size * sizeof(FloatAB); + } + + __host__ __device__ static constexpr index_t + CalculateGridSize(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc) + { + const auto K = c_k_n_ho_wo_grid_desc.GetLength(I0); + const auto N = c_k_n_ho_wo_grid_desc.GetLength(I1); + const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2); + const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3); + + const auto K0 = K / KPerBlock; + const auto N0 = N / NPerBlock; + const auto H0 = Ho / HoPerBlock; + const auto W0 = Wo / WoPerBlock; + + const index_t grid_size = K0 * N0 * H0 * W0; + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainE0BlockLoop(const index_t E0) + { + const bool has_main_e0_block_loop = E0 > 1; + + return has_main_e0_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasMainE1BlockLoop() + { + const bool has_main_e1_block_loop = ((E1 + E1PerBlock) / (2 * E1PerBlock)) > 1; + + return has_main_e1_block_loop; + } + + __host__ 
__device__ static constexpr bool CalculateHasDoubleTailE1BlockLoop() + { + const bool has_double_tail_e1_block_loop = (E1 / E1PerBlock) % 2 == 0; + + return has_double_tail_e1_block_loop; + } + + __host__ __device__ static constexpr auto + MakeAE0E1K0K1E2GridDescriptor(const AGridDesc_E0_E1_K_E2& a_e0_e1_k_e2_grid_desc) + { + const auto E0 = a_e0_e1_k_e2_grid_desc.GetLength(I0); + const auto K = a_e0_e1_k_e2_grid_desc.GetLength(I2); + + const auto K1 = Number{}; + const auto K0 = K / K1; + + const auto a_e0_e1_k0_k1_e2_grid_desc = transform_tensor_descriptor( + a_e0_e1_k_e2_grid_desc, + make_tuple(make_pass_through_transform(E0), + make_pass_through_transform(E1), + make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{})); + + return a_e0_e1_k0_k1_e2_grid_desc; + } + + __host__ __device__ static constexpr auto MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor( + const BGridDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_grid_desc) + { + const auto E0 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I0); + // const auto E1 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I1); + const auto N = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I2); + const auto Ho = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I3); + const auto Wo = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I4); + // const auto E2 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I5); + + const auto H2 = Number{}; + const auto H1 = Number{}; + const auto H0 = Ho / (H1 * H2); + + const auto W2 = Number{}; + const auto W1 = Number{}; + const auto W0 = Wo / (W1 * W2); + + const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + transform_tensor_descriptor(b_e0_e1_n_ho_wo_e2_grid_desc, + make_tuple(make_pass_through_transform(E0), + make_pass_through_transform(E1), + make_pass_through_transform(N), + make_unmerge_transform(make_tuple(H0, H1, H2)), + make_unmerge_transform(make_tuple(W0, W1, W2)), 
+ make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3, 4, 5>{}, + Sequence<6, 7, 8>{}, + Sequence<9>{})); + + return b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCK0K1NH0H1H2W0W1W2GridDescriptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc) + { + const auto K = c_k_n_ho_wo_grid_desc.GetLength(I0); + const auto N = c_k_n_ho_wo_grid_desc.GetLength(I1); + const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2); + const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3); + + const auto K1 = Number{}; + const auto K0 = K / K1; + + const auto H2 = Number{}; + const auto H1 = Number{}; + const auto H0 = Ho / (H1 * H2); + + const auto W2 = Number{}; + const auto W1 = Number{}; + const auto W0 = Wo / (W1 * W2); + + const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = transform_tensor_descriptor( + c_k_n_ho_wo_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_unmerge_transform(make_tuple(H0, H1, H2)), + make_unmerge_transform(make_tuple(W0, W1, W2))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{})); + + return c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc) + { + const auto K = d_k_n_hx_wx_grid_desc.GetLength(I0); + const auto N = d_k_n_hx_wx_grid_desc.GetLength(I1); + const auto Hx = d_k_n_hx_wx_grid_desc.GetLength(I2); + const auto Wx = d_k_n_hx_wx_grid_desc.GetLength(I3); + + const auto K1 = Number{}; + const auto K0 = K / K1; + +#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + const auto H2 = Number{}; + const auto H1 = Number{}; + const auto H0 = 
Number{}; + + const auto W2 = Number{}; + const auto W1 = Number{}; + const auto W0 = Number{}; +#else + const auto H2 = HoPerThread / 2; + const auto H1 = HoPerBlock / HoPerThread; + const auto H0 = Hx / (H1 * H2); + + const auto W2 = WoPerThread / 2; + const auto W1 = WoPerBlock / WoPerThread; + const auto W0 = Wx / (W1 * W2); +#endif + + const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor( + d_k_n_hx_wx_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_unmerge_transform(make_tuple(H0, H1, H2)), + make_unmerge_transform(make_tuple(W0, W1, W2))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{})); + + return d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc) + { + const auto K = d_k_n_hx_wx_grid_desc.GetLength(I0); + const auto N = d_k_n_hx_wx_grid_desc.GetLength(I1); + const auto Hx = d_k_n_hx_wx_grid_desc.GetLength(I2); + const auto Wx = d_k_n_hx_wx_grid_desc.GetLength(I3); + + const auto K1 = Number{}; + const auto K0 = K / K1; + + const auto H2 = Number{}; + const auto H1 = Number{}; + + const auto W2 = Number{}; + const auto W1 = Number{}; + +#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + const auto H0 = Number{}; + const auto W0 = Number{}; +#else + const auto H0 = Hx / (H1 * H2); + const auto W0 = Wx / (W1 * W2); +#endif + + const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor( + d_k_n_hx_wx_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_unmerge_transform(make_tuple(H0, H1, H2)), + make_unmerge_transform(make_tuple(W0, W1, W2))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, 
Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{})); + + return d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCBlockIdToKNHoWoBlockClusterAdaptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc) + { + const auto K = c_k_n_ho_wo_grid_desc.GetLength(I0); + const auto N = c_k_n_ho_wo_grid_desc.GetLength(I1); + const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2); + const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3); + +#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + const auto K0 = Number{}; + const auto N0 = Number{}; + const auto H0 = Number{}; + const auto W0 = Number{}; +#else + const auto K0 = K / KPerBlock; + const auto N0 = N / NPerBlock; + const auto H0 = Ho / HoPerBlock; + const auto W0 = Wo / WoPerBlock; +#endif + + const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + return c_blockid_to_k_n_ho_wo_block_cluster_adaptor; + } + + // using AGridDesc_E0_E1_K0_K1_E2 = + // decltype(MakeAE0E1K0K1E2GridDescriptor(AGridDesc_E0_E1_K_E2{})); + // using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 = + // decltype(MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(BGridDesc_E0_E1_N_Ho_Wo_E2{})); + // using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = + // decltype(MakeCK0K1NH0H1H2W0W1W2GridDescriptor(CGridDesc_K_N_Ho_Wo{})); + // using DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx = + // decltype(MakeDK0K1NH0H1HxW0W1WxGridDescriptor(DGridDesc_K_N_Hx_Wx{})); + + using CBlockIdToBlockClusterAdaptor_K_N_H_W = + decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{})); + + template + __host__ __device__ static constexpr auto MakeBiasK0K1GridDescriptor( + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc) + { + const auto K0 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I0); + const auto K1 = 
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I1); + + return make_naive_tensor_descriptor_packed(make_tuple(K0, K1)); + } + + __host__ __device__ static constexpr auto MakeCK1NH2W2ThreadDescriptor() + { + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, Number{}, Number{})); + return c_k1_n_h2_w2_thread_gemm_desc; + } + + // using CThreadDesc_K1_N_H2_W2 = decltype(MakeCK1NH2W2ThreadDescriptor()); + + __host__ __device__ static constexpr auto GetBlockWiseGemm() + { + constexpr auto max_lds_align = Number{}; + + constexpr auto a_e1_k1_e2_block_gemm_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, Number{}), max_lds_align); + + constexpr auto b_e1_n_h_w_e2_block_gemm_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + I1, + Number{}, + Number{}, + Number{})); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + auto blockwise_gemm = + BlockwiseGemmDlops_km_kn_m0m1n0n1_v3{}; + + return blockwise_gemm; + } + + __device__ static constexpr auto GetCThreadIndex() + { + auto blockwise_gemm = GetBlockWiseGemm(); + auto c_thread_mtx_index = + blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id()); + + return c_thread_mtx_index; + }; + + __device__ static constexpr auto GetCBlockIndex( + const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor) + { + const auto c_k_n_h_w_block_cluster_idx = + c_blockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex( + make_multi_index(get_block_1d_id())); + return c_k_n_h_w_block_cluster_idx; + } + + template + __device__ static void BiasOp(BiasGlobalBuff& bias_global_buf, + CThreadBuff& c_thread_buf, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const BiasGridDesc_K0_K1& bias_k0_k1_grid_desc, + const CThreadDesc_K1_N_H2_W2&) + + { + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + + 
const auto k_thread_id = c_thread_idx[I0]; + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + constexpr auto bias_k0_k1_thread_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, Number{})); + + StaticBuffer + bias_thread_buf; + + const index_t k_thread_data_on_global = k_thread_id * KPerThread; + + auto bias_threadwise_transfer = + ThreadwiseTensorSliceTransfer_v2{}>, + Sequence<0, 1>, + 1, + CThreadTransferDstScalarPerVector, + false, + true>( + bias_k0_k1_grid_desc, make_multi_index(k_block_work_id, k_thread_data_on_global)); + + constexpr auto bias_k0_k1_global_tensor_step_hacks = make_tuple( + make_tuple(Sequence<0>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<0>{})); + + bias_threadwise_transfer.Run(bias_k0_k1_grid_desc, + bias_global_buf, + bias_k0_k1_thread_desc, + make_tuple(I0, I0), + bias_thread_buf, + bias_k0_k1_global_tensor_step_hacks); + + static_for<0, KPerThread, 1>{}([&](auto ki) { + static_for<0, HoPerThread, 1>{}([&](auto hi) { + static_for<0, WoPerThread, 1>{}([&](auto wi) { + constexpr index_t c_offset = + c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(make_tuple(ki, 0, hi, wi)); + c_thread_buf(Number{}) = + c_thread_buf[Number{}] + bias_thread_buf[ki]; + }); + }); + }); + } + + template + __device__ static void Activation(CThreadBuff& c_thread_buf, + const CThreadDesc_K1_N_H2_W2&, + integral_constant) + { + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) { + if constexpr(activ_type_ == 1) + { + c_thread_buf(i) = c_thread_buf[i] >= 0 ? 
c_thread_buf[i] : alpha * c_thread_buf[i]; + } + else if constexpr(activ_type_ == 2) + { + FloatAcc x = 1.0 + exp(-c_thread_buf[i]); + + asm volatile("\n \ + v_rcp_f32 %0, %1 \n" + : "=v"(x) + : "0"(x)); + + c_thread_buf(i) = x; + } + }); + } + + template + __device__ static void + WriteOut(const CThreadBuff& c_thread_buf, + CGlobalBuff& c_global_buf, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc) + { + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]); + const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]); + const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]); + + const auto k_thread_id = c_thread_idx[I0]; + const auto ho_thread_id = c_thread_idx[I2]; + const auto wo_thread_id = c_thread_idx[I3]; + + // hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global + // tensor + constexpr auto c_k_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks = CGlobalStepHacks{}; + + constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + I1, + I1, + I1, + Number{}, + I1, + I1, + Number{})); + + const index_t k_thread_data_on_global = k_thread_id * KPerThread; + + ThreadwiseTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc), + decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc), + Sequence, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + make_multi_index(k_block_work_id, + k_thread_data_on_global, + n_block_work_id, + ho_block_work_id, + ho_thread_id, + 0, + wo_block_work_id, + wo_thread_id, + 0)) + 
.Run(c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + c_global_buf, + c_k_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks); + } + + template + __device__ static void + MaxPool(const CThreadBuff& c_thread_buf, + DGlobalBuff& d_global_buf, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const CThreadDesc_K1_N_H2_W2&, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc) + { + + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]); + const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]); + const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]); + + const auto k_thread_id = c_thread_idx[I0]; + const auto ho_thread_id = c_thread_idx[I2]; + const auto wo_thread_id = c_thread_idx[I3]; + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + static_assert(HoPerThread % 2 == 0 && WoPerThread % 2 == 0, ""); + + constexpr auto HoPerThread_2 = HoPerThread / 2; + constexpr auto WoPerThread_2 = WoPerThread / 2; + + constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + I1, + I1, + I1, + Number{}, + I1, + I1, + Number{})); + + StaticBuffer + d_thread_buf; + + static_for<0, KPerThread, 1>{}([&](auto ki) { + static_for<0, HoPerThread_2, 1>{}([&](auto hi) { + static_for<0, WoPerThread_2, 1>{}([&](auto wi) { + constexpr index_t d_offset = + d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.CalculateOffset( + make_tuple(0, ki, 0, 0, 0, hi, 0, 0, wi)); + + constexpr index_t c_offset_0 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset( + make_tuple(ki, 0, hi * 2, wi * 2)); + constexpr index_t c_offset_1 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset( + make_tuple(ki, 0, hi * 2, wi * 2 + 1)); 
+ constexpr index_t c_offset_2 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset( + make_tuple(ki, 0, hi * 2 + 1, wi * 2)); + constexpr index_t c_offset_3 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset( + make_tuple(ki, 0, hi * 2 + 1, wi * 2 + 1)); + /* max over the 2x2 window addressed by c_offset_0..c_offset_3; every fold uses fmaxf so the last one no longer differs from the other two (avoids picking the double-precision fmax overload) */ + d_thread_buf(Number{}) = c_thread_buf[Number{}]; + d_thread_buf(Number{}) = + fmaxf(c_thread_buf[Number{}], d_thread_buf(Number{})); + d_thread_buf(Number{}) = + fmaxf(c_thread_buf[Number{}], d_thread_buf(Number{})); + d_thread_buf(Number{}) = + fmaxf(c_thread_buf[Number{}], d_thread_buf(Number{})); + }); + }); + }); + + const index_t k_thread_data_on_global = k_thread_id * KPerThread; + + constexpr auto d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = DGlobalStepHacks{}; + + ThreadwiseTensorSliceTransfer_v1r3< + FloatC, + FloatC, + decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc), + decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc), + Sequence, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + InMemoryDataOperationEnum_t::Set, + 1, + true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + make_multi_index(k_block_work_id, + k_thread_data_on_global, + n_block_work_id, + ho_block_work_id, + ho_thread_id, + 0, + wo_block_work_id, + wo_thread_id, + 0)) + .Run(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0), + d_thread_buf, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + d_global_buf, + d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks); + } + + template + __device__ static void + ResizeAdd(const CThreadBuff& c_thread_buf, + DGlobalBuff& d_global_buf, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const CThreadDesc_K1_N_H2_W2&, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc) + { + + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]); + const index_t ho_block_work_id =
__builtin_amdgcn_readfirstlane(c_block_idx[I2]); + const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]); + + const auto k_thread_id = c_thread_idx[I0]; + const auto ho_thread_id = c_thread_idx[I2]; + const auto wo_thread_id = c_thread_idx[I3]; + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + constexpr auto HoPerThreadx2 = HoPerThread * 2; + constexpr auto WoPerThreadx2 = WoPerThread * 2; + + constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + I1, + I1, + I1, + Number{}, + I1, + I1, + Number{})); + + StaticBuffer + d_thread_buf; + + static_for<0, KPerThread, 1>{}([&](auto k_i) { + static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) { + static_for<0, WoPerThreadx2, 1>{}([&](auto w_i) { + d_thread_buf(Number{}) = + c_thread_buf[Number{}]; + }); + }); + }); + + // hack to control index calculation when iterating over d_k_n_ho_wo_global tensor + constexpr auto d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = DGlobalStepHacks{}; + + const index_t k_thread_data_on_global = k_thread_id * KPerThread; + + ThreadwiseTensorSliceTransfer_v1r3< + FloatC, + FloatC, + decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc), + decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc), + Sequence, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + InMemoryDataOperationEnum_t::Add, + 1, + true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + make_multi_index(k_block_work_id, + k_thread_data_on_global, + n_block_work_id, + ho_block_work_id, + ho_thread_id, + 0, + wo_block_work_id, + wo_thread_id, + 0)) + .Run(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0), + d_thread_buf, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + d_global_buf, + d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks); + } + + template + __device__ static void + GemmOp(const AGlobalBuff& a_global_buf, + const 
BGlobalBuff& b_global_buf, + CThreadBuff& c_thread_buf, + FloatAB* __restrict__ p_shared_block, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CThreadDesc_K1_N_H2_W2&, + integral_constant) + { + constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop(); + constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop(); + + // const auto c_k_n_h_w_block_cluster_idx = + // GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor); + // c_blockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex( + // make_multi_index(get_block_1d_id())); + + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]); + const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]); + const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]); + + constexpr auto max_lds_align = Number{}; + + constexpr auto a_e1_k1_e2_block_gemm_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, Number{}), max_lds_align); + + constexpr auto b_e1_n_h_w_e2_block_gemm_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + I1, + Number{}, + Number{}, + Number{})); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + auto blockwise_gemm = + BlockwiseGemmDlops_km_kn_m0m1n0n1_v3{}; + // blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id()); + + const auto ho_thread_id = c_thread_idx[I2]; + const auto wo_thread_id = c_thread_idx[I3]; + + constexpr auto a_e0_e1_k0_k1_e2_block_copy_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, I1, Number{}, Number{}), + max_lds_align); + + // A matrix blockwise copy + auto a_blockwise_copy = + 
BlockwiseTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_e0_e1_k0_k1_e2_grid_desc), + decltype(a_e0_e1_k0_k1_e2_block_copy_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3, 4>, + ABlockTransferSrcVectorDim, + 4, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_E2, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + false>(a_e0_e1_k0_k1_e2_grid_desc, + make_multi_index(0, 0, k_block_work_id, 0, 0), + a_e0_e1_k0_k1_e2_block_copy_desc, + make_multi_index(0, 0, 0, 0, 0)); + + constexpr auto a_block_slice_copy_step = make_multi_index(I1, 0, 0, 0, 0); + + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + I1, + I1, + I1, + Number{}, + I1, + I1, + Number{}, + Number{})); + + auto b_threadwise_transfer = ThreadwiseTensorSliceTransfer_v2< + FloatAB, + FloatAB, + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc), + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc), + Sequence, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BThreadTransferSrcResetCoordinateAfterRun, + true>(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + make_multi_index(0, + 0, + n_block_work_id, + ho_block_work_id, + ho_thread_id, + 0, + wo_block_work_id, + wo_thread_id, + 0, + 0)); + + auto a_block_buf = make_dynamic_buffer( + p_shared_block, a_e0_e1_k0_k1_e2_block_copy_desc.GetElementSpaceSize()); + + //// register allocation for output + // StaticBuffer + // c_thread_buf; + + // initialize output thread tensor + ThreadwiseTensorSliceSet_v1>{} + .Run(c_k1_n_h2_w2_thread_gemm_desc, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + FloatAcc{0}); + + constexpr auto b_thread_slice_copy_step = + make_multi_index(0, E1PerBlock, 0, 0, 0, 0, 0, 0, 0, 0); + + // hack to control index 
calculation when iterating over A and B matrix for threadwise copy + constexpr auto a_e0_e1_k_e2_global_step_hacks = AGlobalStepHacks{}; + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = BGlobalStepHacks{}; + + // double register buffer for b + StaticBuffer + b_thread_even_buf, b_thread_odd_buf; + + if constexpr(HasMainE0BlockLoop) + { + const auto E0 = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I0); + + index_t e0_block_data_begin = 0; + + do + { + // LDS double buffer: preload data + { + a_blockwise_copy.RunRead( + a_e0_e1_k0_k1_e2_grid_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_even_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + a_blockwise_copy.RunWrite(a_e0_e1_k0_k1_e2_block_copy_desc, a_block_buf); + } + + __syncthreads(); + + if constexpr(HasMainE1BlockLoop) + { + index_t e1_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_odd_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run(
+ b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_even_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + e1_block_data_begin += 2 * E1PerBlock; + + } while(e1_block_data_begin < E1 - 2 * E1PerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left + { + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_odd_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + } + + a_blockwise_copy.MoveSrcSliceWindow(a_e0_e1_k0_k1_e2_grid_desc, + a_block_slice_copy_step, + AGlobalMoveSliceWindowStepHacks{}); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - E1PerBlock), 0, 0)); + + b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + e0_block_data_begin += 1; + + } while(e0_block_data_begin < E0); + } + else + { + // LDS double buffer: preload data + { + 
a_blockwise_copy.RunRead( + a_e0_e1_k0_k1_e2_grid_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_even_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + a_blockwise_copy.RunWrite(a_e0_e1_k0_k1_e2_block_copy_desc, a_block_buf); + } + + __syncthreads(); + + if constexpr(HasMainE1BlockLoop) + { + index_t e1_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_odd_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_even_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + e1_block_data_begin += 2 * E1PerBlock; + + } while(e1_block_data_begin < E1 - 2 * E1PerBlock); + 
} + + // LDS double buffer: tail + if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left + { + b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_odd_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + } + } + } + + template + __device__ static void + Conv(const FloatAB* __restrict__ p_a_global, + const FloatAB* __restrict__ p_b_global, + const FloatC* __restrict__ p_bias_global, + FloatC* __restrict__ p_c_global, + FloatC* __restrict__ p_d_global, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant) + { + const auto bias_k0_k1_grid_desc = + MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, 
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); + auto d_global_buf = make_dynamic_buffer( + p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize()); + auto bias_global_buf = make_dynamic_buffer( + p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + // register allocation for output + StaticBuffer + c_thread_buf; + + const auto c_k_n_h_w_block_cluster_idx = + GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor); + + const auto c_thread_mtx_index = GetCThreadIndex(); + + // GemmOp + GemmOp(a_global_buf, + b_global_buf, + c_thread_buf, + p_shared_block, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc, + integral_constant{}); + + // Output + WriteOut(c_thread_buf, + c_global_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + } + + template + __device__ static void ConvBiasActiv( + const FloatAB* __restrict__ p_a_global, + const FloatAB* __restrict__ p_b_global, + const FloatC* __restrict__ p_bias_global, + FloatC* __restrict__ p_c_global, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant, + integral_constant) + { + static constexpr auto activ_type = integral_constant{}; + + const auto bias_k0_k1_grid_desc = + MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + const auto a_global_buf = make_dynamic_buffer( + 
p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); + auto bias_global_buf = make_dynamic_buffer( + p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + // register allocation for output + StaticBuffer + c_thread_buf; + + const auto c_k_n_h_w_block_cluster_idx = + GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor); + + const auto c_thread_mtx_index = GetCThreadIndex(); + + // GemmOp + GemmOp(a_global_buf, + b_global_buf, + c_thread_buf, + p_shared_block, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc, + integral_constant{}); + + // Bias + BiasOp(bias_global_buf, + c_thread_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + bias_k0_k1_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc); + + // Activ + Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type); + + // Output + WriteOut(c_thread_buf, + c_global_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + } + + template + __device__ static void ConvBiasActivMaxpool( + const FloatAB* __restrict__ p_a_global, + const FloatAB* __restrict__ p_b_global, + const FloatC* __restrict__ p_bias_global, + FloatC* __restrict__ p_c_global, + FloatC* __restrict__ p_d_global, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& 
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant, + integral_constant) + { + static constexpr auto activ_type = integral_constant{}; + + const auto bias_k0_k1_grid_desc = + MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); + auto d_global_buf = make_dynamic_buffer( + p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize()); + auto bias_global_buf = make_dynamic_buffer( + p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + // register allocation for output + StaticBuffer + c_thread_buf; + + const auto c_k_n_h_w_block_cluster_idx = + GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor); + + const auto c_thread_mtx_index = GetCThreadIndex(); + + // GemmOp + GemmOp(a_global_buf, + b_global_buf, + c_thread_buf, + p_shared_block, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc, + integral_constant{}); + + // Bias + BiasOp(bias_global_buf, + c_thread_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + bias_k0_k1_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc); + + // Activ + Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type); + + // Output + WriteOut(c_thread_buf, + c_global_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + // MaxPool + MaxPool(c_thread_buf, + d_global_buf, + 
c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k1_n_h2_w2_thread_gemm_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc); + } + + template + __device__ static void ConvBiasActivResizeAdd( + const FloatAB* __restrict__ p_a_global, + const FloatAB* __restrict__ p_b_global, + const FloatC* __restrict__ p_bias_global, + FloatC* __restrict__ p_d_global, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant, + integral_constant) + { + static constexpr auto activ_type = integral_constant{}; + + const auto bias_k0_k1_grid_desc = + MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); + auto d_global_buf = make_dynamic_buffer( + p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize()); + auto bias_global_buf = make_dynamic_buffer( + p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + // register allocation for output + StaticBuffer + c_thread_buf; + + const auto c_k_n_h_w_block_cluster_idx = + GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor); + + const auto c_thread_mtx_index = GetCThreadIndex(); + + // GemmOp + GemmOp(a_global_buf, + b_global_buf, + c_thread_buf, + p_shared_block, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + a_e0_e1_k0_k1_e2_grid_desc, + 
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc, + integral_constant{}); + + // Bias + BiasOp(bias_global_buf, + c_thread_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + bias_k0_k1_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc); + + // Activ + Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type); + + // Resize_Add + ResizeAdd(c_thread_buf, + d_global_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k1_n_h2_w2_thread_gemm_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp index f6c15fd85a..360b115015 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp @@ -9,21 +9,22 @@ namespace ck { // C[M, N] += transpose(A[K, M]) * B[K, N] // Element of matrix can be vectorized data // Assume: -// 1. ADesc, BDesc, CDesc are known at compile-time +// 1. AThreadDesc_E1_K_E2, BThreadDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo are known at +// compile-time // 2. 
AOriginIdx, BOriginIdx, COriginIdx are known at compile-time template ::type = false> struct ThreadwiseGemmDlops_km_kn_mn_v3 { + template >::value && @@ -54,102 +57,107 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; - constexpr auto E = ADesc{}.GetLength(I0); - constexpr auto K = ADesc{}.GetLength(I1); + constexpr auto E1 = AThreadDesc_E1_K_E2{}.GetLength(I0); + constexpr auto K = AThreadDesc_E1_K_E2{}.GetLength(I1); + constexpr auto E2 = AThreadDesc_E1_K_E2{}.GetLength(I2); + + constexpr auto Ho = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I2); + constexpr auto Wo = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I3); constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); - static_for<0, E, 1>{}([&](auto e) { + if constexpr((Ho % 2 == 0) && (Wo % 2 == 0)) + { + constexpr auto SubHW = 2; + static_for<0, K, 1>{}([&](auto k) { - constexpr index_t a_offset = - ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k)); + static_for<0, Ho, SubHW>{}([&](auto h) { + static_for<0, Wo, SubHW>{}([&](auto w) { + static_for<0, E1, 1>{}([&](auto e1) { + static_for<0, E2, 1>{}([&](auto e2) { + constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset( + a_origin_idx + make_tuple(e1, k, e2)); - if constexpr(H == 2 && W == 2) - { - constexpr index_t b_offset_0 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0)); - constexpr index_t b_offset_1 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1)); - constexpr index_t b_offset_2 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0)); - constexpr index_t b_offset_3 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1)); + constexpr index_t b0_offset = + BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h, w, 
e2)); - constexpr index_t c_offset_0 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0)); - constexpr index_t c_offset_1 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1)); - constexpr index_t c_offset_2 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0)); - constexpr index_t c_offset_3 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1)); + constexpr index_t b1_offset = + BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h, w + 1, e2)); - amd_assembly_outer_product_1x4(a_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - c_buf(Number{}), - c_buf(Number{}), - c_buf(Number{}), - c_buf(Number{})); - } - else if constexpr(H == 4 && W == 1) - { - constexpr index_t b_offset_0 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0)); - constexpr index_t b_offset_1 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0)); - constexpr index_t b_offset_2 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0)); - constexpr index_t b_offset_3 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0)); + constexpr index_t b2_offset = + BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h + 1, w, e2)); - constexpr index_t c_offset_0 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0)); - constexpr index_t c_offset_1 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0)); - constexpr index_t c_offset_2 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0)); - constexpr index_t c_offset_3 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0)); + constexpr index_t b3_offset = + BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h + 1, w + 1, e2)); - amd_assembly_outer_product_1x4(a_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - c_buf(Number{}), - 
c_buf(Number{}), - c_buf(Number{}), - c_buf(Number{})); - } - else - { - static_for<0, H, 1>{}([&](auto h) { - static_for<0, W, 1>{}([&](auto w) { - constexpr index_t b_offset = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w)); + constexpr index_t c0_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + + make_tuple(k, 0, h, w)); - constexpr index_t c_offset = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w)); + constexpr index_t c1_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset( + c_origin_idx + make_tuple(k, 0, h, w + 1)); -#if 0 - c_buf(Number{}) += inner_product_with_conversion{}( - a_buf[Number{}], b_buf[Number{}]); -#else - amd_assembly_inner_product(a_buf[Number{}], - b_buf[Number{}], - c_buf(Number{})); -#endif + constexpr index_t c2_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset( + c_origin_idx + make_tuple(k, 0, h + 1, w)); + + constexpr index_t c3_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset( + c_origin_idx + make_tuple(k, 0, h + 1, w + 1)); + + amd_assembly_outer_product_1x4(a_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + c_buf(Number{}), + c_buf(Number{}), + c_buf(Number{}), + c_buf(Number{})); + }); }); }); - } + }); }); - }); + } + else + { + + static_for<0, K, 1>{}([&](auto k) { + static_for<0, Ho, 1>{}([&](auto h) { + static_for<0, Wo, 1>{}([&](auto w) { + static_for<0, E1, 1>{}([&](auto e1) { + static_for<0, E2, 1>{}([&](auto e2) { + constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset( + a_origin_idx + make_tuple(e1, k, e2)); + + constexpr index_t b_offset = + BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h, w, e2)); + + constexpr index_t c_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + + make_tuple(k, 0, h, w)); + + inner_product(a_buf[Number{}], + b_buf[Number{}], + c_buf(Number{})); + }); + }); + }); + }); + }); + } } }; diff --git 
a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index c02e959461..4b03ac04a4 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -217,6 +217,22 @@ struct ThreadwiseTensorSliceTransfer_v1r3 is_dst_valid, dst_vector.template AsType()[Number<0>{}]); } + else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add) + { + + typename vector_type_maker::type tmp; + tmp.template AsType()(Number<0>{}) = + dst_buf.template Get(dst_coord_.GetOffset(), is_dst_valid); + + static_for<0, DstScalarPerVector, 1>{}([&](auto t) { + dst_vector.template AsType()(t) += tmp.template AsType()[t]; + }); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector.template AsType()[Number<0>{}]); + } constexpr auto move_on_dim = [&]() constexpr { @@ -666,6 +682,25 @@ struct ThreadwiseTensorSliceTransfer_v2 move_tensor_coordinate(src_desc, src_coord_, adjusted_step); } + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? 
+ const auto adjusted_step = make_tensor_coordinate_step( + src_desc, adjusted_step_idx, src_move_slice_window_step_hack); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + private: SrcCoord src_coord_; }; // namespace ck diff --git a/composable_kernel/include/utility/amd_buffer_addressing.hpp b/composable_kernel/include/utility/amd_buffer_addressing.hpp index c481df180b..d40a302d69 100644 --- a/composable_kernel/include/utility/amd_buffer_addressing.hpp +++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp @@ -591,6 +591,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type::type src } else if constexpr(N == 8) { +#if 0 vector_type tmp{src_thread_data}; llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<0>{}], @@ -604,6 +605,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type::type src dst_thread_addr_offset, dst_wave_addr_offset + 4 * sizeof(half_t), 0); +#else + llvm_amdgcn_raw_buffer_store_fp32x4(as_type(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#endif } } else if constexpr(is_same::value) diff --git a/composable_kernel/include/utility/config.hpp b/composable_kernel/include/utility/config.hpp index f4181b29d4..e79c4d4f73 100644 --- a/composable_kernel/include/utility/config.hpp +++ b/composable_kernel/include/utility/config.hpp @@ -96,6 +96,7 @@ // pass tensor descriptor by value or void* #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 +#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0 // merge transformation use magic number division #define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1 @@ -128,7 +129,15 @@ namespace ck { enum InMemoryDataOperationEnum_t { Set, - AtomicAdd + AtomicAdd, + Add +}; + +enum ActivTypeEnum_t +{ + None = 0, + LeakyRelu, + Sigmoid }; // index type diff --git a/host/driver_offline/CMakeLists.txt b/host/driver_offline/CMakeLists.txt 
index c0ab70e4c3..54b1395327 100644 --- a/host/driver_offline/CMakeLists.txt +++ b/host/driver_offline/CMakeLists.txt @@ -13,16 +13,25 @@ include_directories(BEFORE ) set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp) +set(CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_fwd_driver_offline_nchwc.cpp) +set(CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_add_fwd_driver_offline_nchwc.cpp) +set(CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_maxpool_fwd_driver_offline_nchwc.cpp) set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp) set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp) set(GEMM_DRIVER_OFFLINE_SOURCE src/gemm_driver_offline.cpp) add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE}) +add_executable(conv_fwd_driver_offline_nchwc ${CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE}) +add_executable(conv_add_fwd_driver_offline_nchwc ${CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE}) +add_executable(conv_maxpool_fwd_driver_offline_nchwc ${CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE}) add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE}) add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE}) add_executable(gemm_driver_offline ${GEMM_DRIVER_OFFLINE_SOURCE}) target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor) +target_link_libraries(conv_fwd_driver_offline_nchwc PRIVATE host_tensor) +target_link_libraries(conv_add_fwd_driver_offline_nchwc PRIVATE host_tensor) +target_link_libraries(conv_maxpool_fwd_driver_offline_nchwc PRIVATE host_tensor) target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor) target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor) target_link_libraries(gemm_driver_offline PRIVATE host_tensor) diff --git a/host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp 
b/host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp new file mode 100644 index 0000000000..1463cebffc --- /dev/null +++ b/host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp @@ -0,0 +1,220 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" + +template +void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1( + const InLengths& in_n_c0_hi_wi_c1_lengths, + const WeiLengths& wei_k_c0_y_x_c1_lengths, + const AddLengths& add_n_k0_hox2_wox2_k1_lengths, + const OutLengths& out_n_k0_ho_wo_k1_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c0_hi_wi_c1, + const Tensor& wei_k_c0_y_x_c1, + const Tensor& bias_k0_k1, + const Tensor& add_n_k0_hox2_wox2_k1, + Tensor& add_n_k0_hox2_wox2_k1_out, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + + const auto N = out_n_k0_ho_wo_k1_lengths[I0]; + const auto K0 = out_n_k0_ho_wo_k1_lengths[I1]; + const auto Ho = out_n_k0_ho_wo_k1_lengths[I2]; + const auto Wo = out_n_k0_ho_wo_k1_lengths[I3]; + const auto K1 = out_n_k0_ho_wo_k1_lengths[I4]; + + const auto C0 = in_n_c0_hi_wi_c1_lengths[I1]; + const auto Hi = in_n_c0_hi_wi_c1_lengths[I2]; + const auto Wi = in_n_c0_hi_wi_c1_lengths[I3]; + const auto C1 = in_n_c0_hi_wi_c1_lengths[I4]; + + const auto K = wei_k_c0_y_x_c1_lengths[I0]; + const auto Y = wei_k_c0_y_x_c1_lengths[I2]; + const auto X = wei_k_c0_y_x_c1_lengths[I3]; + + const auto Hox2 = add_n_k0_hox2_wox2_k1_lengths[I2]; + const auto Wox2 = 
add_n_k0_hox2_wox2_k1_lengths[I3]; + + DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) * + in_n_c0_hi_wi_c1.mDesc.GetElementSpace()); + DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace()); + DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace()); + DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) * + add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace()); + + in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); + wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); + bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data()); + add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data()); + + constexpr index_t InWeiVectorSize = 8; + + if(C1 % InWeiVectorSize != 0) + { + throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize"); + } + +#if 0 + constexpr index_t BlockSize = 256; + + constexpr index_t KPerBlock = 32; + constexpr index_t HoPerBlock = 8; + constexpr index_t WoPerBlock = 64; + + constexpr index_t E1 = C0 * 9; + constexpr index_t E2 = 1; + constexpr index_t E1PerBlock = C0; + + constexpr index_t KPerThread = 16; + constexpr index_t HoPerThread = 2; + constexpr index_t WoPerThread = 2; + constexpr index_t EPerThread = 1; + + using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>; + using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>; + + constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; + constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; + + constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2; + + constexpr index_t CThreadTransferDstScalarPerVector_K = K1; +#elif 1 + constexpr auto BlockSize = 64; + + constexpr auto KPerBlock = 8; + constexpr auto HoPerBlock = 8; + constexpr auto WoPerBlock = 32; + + constexpr auto E1 = 2 * 9; + constexpr auto E2 = 1; + constexpr auto K2 = 2; + constexpr auto E1PerBlock = 2; + + constexpr auto 
KPerThread = KPerBlock; + constexpr auto HoPerThread = 2; + constexpr auto WoPerThread = 2; + constexpr auto EPerThread = 1; + + using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>; + using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = + Sequence<1, E1PerBlock, 1, KPerBlock, 1>; + + constexpr auto ABlockTransferSrcScalarPerVector_E2 = E2; + constexpr auto ABlockTransferDstScalarPerVector_E2 = E2; + constexpr auto BThreadTransferSrcScalarPerVector_E2 = E2; + constexpr auto CThreadTransferDstScalarPerVector_K = InWeiVectorSize; +#endif + + const auto in_n_c0_hi_wi_c1_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)); + const auto wei_k_c0_y_x_c1_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2)); + const auto add_n_k0_hox2_wox2_k1_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1)); + const auto out_n_k0_ho_wo_k1_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)); + + constexpr auto conv_driver = + DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add< + BlockSize, + typename vector_type::type, + TAcc, + TOut, + E1, + E2, + K2, + KPerBlock, + HoPerBlock, + WoPerBlock, + E1PerBlock, + KPerThread, + HoPerThread, + WoPerThread, + EPerThread, + ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2, + ABlockTransferSrcScalarPerVector_E2, + ABlockTransferDstScalarPerVector_E2, + BThreadTransferSrcScalarPerVector_E2, + CThreadTransferDstScalarPerVector_K, + activ_type>{}; + + /* Log the problem shape. The add/output extent is printed from the add tensor's own lengths (Hox2, Wox2), the same values used for add_n_k0_hox2_wox2_k1_desc, instead of assuming it is exactly twice Ho/Wo. */ + std::cerr << "conv_bias_activ_resize_add_input_" + << "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K + << "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_addout_n" << N << "k" << K0 + << "h" << Hox2 << "w" << Wox2 << "k" << K1 << std::endl; + + for(int i = 0; i < 5; i++) + { + + const auto ave_time = + conv_driver.Run(wei_k_c0_y_x_c1_desc, +
in_n_c0_hi_wi_c1_desc, + out_n_k0_ho_wo_k1_desc, + add_n_k0_hox2_wox2_k1_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + static_cast::type*>( + wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()), + static_cast::type*>( + in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()), + static_cast(bias_k0_k1_device_buf.GetDeviceBuffer()), + static_cast(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()), + nrepeat); + + { + float perf = static_cast(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } + + add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data()); + + conv_driver.Run(wei_k_c0_y_x_c1_desc, + in_n_c0_hi_wi_c1_desc, + out_n_k0_ho_wo_k1_desc, + add_n_k0_hox2_wox2_k1_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + static_cast::type*>( + wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()), + static_cast::type*>( + in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()), + static_cast(bias_k0_k1_device_buf.GetDeviceBuffer()), + static_cast(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()), + 0); + + add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data()); +} diff --git a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp new file mode 100644 index 0000000000..aed7368fb9 --- /dev/null +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp @@ -0,0 +1,196 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" + +template +void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1( + const InLengths& 
in_n_c0_hi_wi_c1_lengths, + const WeiLengths& wei_k_c0_y_x_c1_lengths, + const OutLengths& out_n_k0_ho_wo_k1_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c0_hi_wi_c1, + const Tensor& wei_k_c0_y_x_c1, + const Tensor& bias_k0_k1, + Tensor& out_n_k0_ho_wo_k1, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + + const auto N = out_n_k0_ho_wo_k1_lengths[I0]; + const auto K0 = out_n_k0_ho_wo_k1_lengths[I1]; + const auto Ho = out_n_k0_ho_wo_k1_lengths[I2]; + const auto Wo = out_n_k0_ho_wo_k1_lengths[I3]; + const auto K1 = out_n_k0_ho_wo_k1_lengths[I4]; + + const auto C0 = in_n_c0_hi_wi_c1_lengths[I1]; + const auto Hi = in_n_c0_hi_wi_c1_lengths[I2]; + const auto Wi = in_n_c0_hi_wi_c1_lengths[I3]; + const auto C1 = in_n_c0_hi_wi_c1_lengths[I4]; + + const auto K = wei_k_c0_y_x_c1_lengths[I0]; + const auto Y = wei_k_c0_y_x_c1_lengths[I2]; + const auto X = wei_k_c0_y_x_c1_lengths[I3]; + + DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) * + in_n_c0_hi_wi_c1.mDesc.GetElementSpace()); + DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace()); + DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace()); + DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) * + out_n_k0_ho_wo_k1.mDesc.GetElementSpace()); + in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); + wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); + bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data()); + + constexpr index_t InWeiVectorSize = 8; + + if(C1 % InWeiVectorSize != 0) + { + throw std::runtime_error("wrong! 
C1 cannot be divided by InWeiVectorSize"); + } + +#if 0 + constexpr index_t BlockSize = 256; + + constexpr index_t KPerBlock = 32; + constexpr index_t HoPerBlock = 8; + constexpr index_t WoPerBlock = 64; + + constexpr index_t E1 = C0 * 9; + constexpr index_t E2 = 1; + constexpr index_t E1PerBlock = C0; + + constexpr index_t KPerThread = 16; + constexpr index_t HoPerThread = 2; + constexpr index_t WoPerThread = 2; + constexpr index_t EPerThread = 1; + + using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>; + using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>; + + constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; + constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; + + constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2; + + constexpr index_t CThreadTransferDstScalarPerVector_K = K1; +#elif 1 + constexpr index_t BlockSize = 64; + + constexpr index_t KPerBlock = 8; + constexpr index_t HoPerBlock = 8; + constexpr index_t WoPerBlock = 32; + + constexpr index_t E1 = 2 * 9; + constexpr index_t E2 = 1; + constexpr index_t K2 = 2; + constexpr index_t E1PerBlock = 2; + + constexpr index_t KPerThread = KPerBlock; + constexpr index_t HoPerThread = 2; + constexpr index_t WoPerThread = 2; + constexpr index_t EPerThread = 1; + + using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>; + using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = + Sequence<1, E1PerBlock, 1, KPerBlock, 1>; + + constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; + constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; + constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2; + constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize; +#endif + + if(KPerThread % InWeiVectorSize != 0) + { + throw std::runtime_error("wrong! 
C1 cannot be divided by InWeiVectorSize"); + } + + const auto in_n_c0_hi_wi_c1_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)); + const auto wei_k_c0_y_x_c1_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2)); + const auto out_n_k0_ho_wo_k1_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)); + + constexpr auto conv_driver = + DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad< + BlockSize, + typename vector_type::type, + TAcc, + TOut, + E1, + E2, + K2, + KPerBlock, + HoPerBlock, + WoPerBlock, + E1PerBlock, + KPerThread, + HoPerThread, + WoPerThread, + EPerThread, + ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2, + ABlockTransferSrcScalarPerVector_E2, + ABlockTransferDstScalarPerVector_E2, + BThreadTransferSrcScalarPerVector_E2, + CThreadTransferDstScalarPerVector_K, + activ_type>{}; + + std::cerr << "conv_bias_activ_input_" + << "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K + << "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0 + << "h" << Ho << "w" << Wo << "k" << K1 << std::endl; + + for(int i = 0; i < 5; i++) + { + + const auto ave_time = + conv_driver.Run(wei_k_c0_y_x_c1_desc, + in_n_c0_hi_wi_c1_desc, + out_n_k0_ho_wo_k1_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + static_cast::type*>( + wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()), + static_cast::type*>( + in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()), + static_cast(bias_k0_k1_device_buf.GetDeviceBuffer()), + static_cast(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()), + nrepeat); + + { + float perf = static_cast(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } + + 
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data()); +} diff --git a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp deleted file mode 100644 index b5e5f91d59..0000000000 --- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp +++ /dev/null @@ -1,190 +0,0 @@ -#include -#include "device.hpp" -#include "host_tensor.hpp" -#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" -#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp" - -template -void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( - const InLengths& in_n_c_hi_wi_lengths, - const WeiLengths& wei_k_c_y_x_lengths, - const OutLengths& out_n_k_ho_wo_lengths, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const Tensor& in_n_c_hi_wi, - const Tensor& wei_k_c_y_x, - Tensor& out_n_k_ho_wo, - ck::index_t /* nrepeat */) -{ - using namespace ck; - - std::cout << __func__ << std::endl; - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - const auto N = out_n_k_ho_wo_lengths[I0]; - const auto K = out_n_k_ho_wo_lengths[I1]; - const auto C = wei_k_c_y_x_lengths[I1]; - - const auto Hi = in_n_c_hi_wi_lengths[I2]; - const auto Wi = in_n_c_hi_wi_lengths[I3]; - - const auto Ho = out_n_k_ho_wo_lengths[I2]; - const auto Wo = out_n_k_ho_wo_lengths[I3]; - - const auto Y = wei_k_c_y_x_lengths[I2]; - const auto X = wei_k_c_y_x_lengths[I3]; - - const auto C0 = C / Number{}; - const auto C1 = Number{}; - - const auto K0 = K / Number{}; - const auto K1 = Number{}; - - Tensor in_n_c0_hi_wi_c1( - HostTensorDescriptor(std::initializer_list{N, C0, Hi, 
Wi, C1})); - Tensor wei_k_c0_y_x_c1( - HostTensorDescriptor(std::initializer_list{K, C0, Y, X, C1})); - Tensor out_n_k0_ho_wo_k1( - HostTensorDescriptor(std::initializer_list{N, K0, Ho, Wo, K1})); - - auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) { - in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) = - in_n_c_hi_wi(n, c, hi, wi); - }; - - auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) { - wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) = - wei_k_c_y_x(k, c, y, x); - }; - - make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)(); - make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)(); - - DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) * - in_n_c0_hi_wi_c1.mDesc.GetElementSpace()); - DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace()); - DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) * - out_n_k0_ho_wo_k1.mDesc.GetElementSpace()); - - in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); - wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); - - const auto in_n_c0_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi)); - const auto wei_k_c0_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X)); - const auto out_n_k0_ho_wo_k1_desc = - make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)); - -#if 1 - // cdata = 64, BlockSize = 64, 16x8x32x4 - constexpr index_t BlockSize = 64; - - constexpr index_t KPerBlock = 16; - constexpr index_t HoPerBlock = 8; - constexpr index_t WoPerBlock = 32; - constexpr index_t EPerBlock = 1; - - constexpr index_t KPerThread = KPerBlock; - constexpr index_t HoPerThread = 2; - constexpr index_t WoPerThread = 2; - constexpr index_t EPerThread = EPerBlock; - - using ABlockTransferThreadSliceLengths_E_K = Sequence<3, 1>; - using ABlockTransferThreadClusterLengths_E_K = Sequence<3 * EPerBlock, KPerBlock>; - - constexpr index_t 
ABlockTransferSrcScalarPerVector_E = 1; - constexpr index_t ABlockTransferDstScalarPerVector_K = 1; - - constexpr index_t BThreadTransferSrcScalarPerVector_W = 1; - - constexpr index_t CThreadTransferDstScalarPerVector_W = 16; - - static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, ""); -#else - constexpr index_t BlockSize = 64; - - constexpr index_t KPerBlock = 16; - constexpr index_t HoPerBlock = 8; - constexpr index_t WoPerBlock = 32; - constexpr index_t EPerBlock = 1; - - constexpr index_t KPerThread = 16; - constexpr index_t HoPerThread = 2; - constexpr index_t WoPerThread = 2; - constexpr index_t EPerThread = EPerBlock; - - using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>; - using ABlockTransferThreadClusterLengths_E_K = Sequence; - - constexpr index_t ABlockTransferSrcScalarPerVector_E = 1; - constexpr index_t ABlockTransferDstScalarPerVector_K = 1; - - constexpr index_t BThreadTransferSrcScalarPerVector_W = 1; - - constexpr index_t CThreadTransferDstScalarPerVector_W = K1; - - static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, ""); -#endif - - constexpr auto conv_driver = -#if 0 - DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad -#else - DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad -#endif - ::type, - TAcc, - TOut, - KPerBlock, - HoPerBlock, - WoPerBlock, - EPerBlock, - KPerThread, - HoPerThread, - WoPerThread, - EPerThread, - ABlockTransferThreadSliceLengths_E_K, - ABlockTransferThreadClusterLengths_E_K, - ABlockTransferSrcScalarPerVector_E, - ABlockTransferDstScalarPerVector_K, - BThreadTransferSrcScalarPerVector_W, - CThreadTransferDstScalarPerVector_W>{}; - - conv_driver.Run(wei_k_c0_y_x_desc, - in_n_c0_hi_wi_desc, - out_n_k0_ho_wo_k1_desc, - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads, - static_cast::type*>( - wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()), - static_cast::type*>( - in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()), - 
static_cast(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer())); - - out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data()); - - auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) { - out_n_k_ho_wo(n, k, ho, wo) = - out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize); - }; - - make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)(); -} diff --git a/host/driver_offline/include/device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp b/host/driver_offline/include/device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp new file mode 100644 index 0000000000..cf610ae7a0 --- /dev/null +++ b/host/driver_offline/include/device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp @@ -0,0 +1,212 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" + +template +void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1( + const InLengths& in_n_c0_hi_wi_c1_lengths, + const WeiLengths& wei_k_c0_y_x_c1_lengths, + const MaxLengths& max_n_k0_hx_wx_k1_lengths, + const OutLengths& out_n_k0_ho_wo_k1_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c0_hi_wi_c1, + const Tensor& wei_k_c0_y_x_c1, + const Tensor& bias_k0_k1, + Tensor& out_n_k0_ho_wo_k1, + Tensor& max_n_k0_hx_wx_k1, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + + const auto N = out_n_k0_ho_wo_k1_lengths[I0]; + const auto K0 = out_n_k0_ho_wo_k1_lengths[I1]; + const auto Ho = out_n_k0_ho_wo_k1_lengths[I2]; + 
const auto Wo = out_n_k0_ho_wo_k1_lengths[I3]; + const auto K1 = out_n_k0_ho_wo_k1_lengths[I4]; + + const auto C0 = in_n_c0_hi_wi_c1_lengths[I1]; + const auto Hi = in_n_c0_hi_wi_c1_lengths[I2]; + const auto Wi = in_n_c0_hi_wi_c1_lengths[I3]; + const auto C1 = in_n_c0_hi_wi_c1_lengths[I4]; + + const auto K = wei_k_c0_y_x_c1_lengths[I0]; + const auto Y = wei_k_c0_y_x_c1_lengths[I2]; + const auto X = wei_k_c0_y_x_c1_lengths[I3]; + + const auto Hx = max_n_k0_hx_wx_k1_lengths[I2]; + const auto Wx = max_n_k0_hx_wx_k1_lengths[I3]; + + DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) * + in_n_c0_hi_wi_c1.mDesc.GetElementSpace()); + DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace()); + DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace()); + DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) * + out_n_k0_ho_wo_k1.mDesc.GetElementSpace()); + DeviceMem max_n_k0_hx_wx_k1_device_buf(sizeof(TOut) * + max_n_k0_hx_wx_k1.mDesc.GetElementSpace()); + + in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); + wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); + bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data()); + max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data()); + + constexpr index_t InWeiVectorSize = 8; + + if(C1 % InWeiVectorSize != 0) + { + throw std::runtime_error("wrong! 
C1 cannot be divided by InWeiVectorSize"); + } + +#if 0 + constexpr index_t BlockSize = 256; + + constexpr index_t KPerBlock = 32; + constexpr index_t HoPerBlock = 8; + constexpr index_t WoPerBlock = 64; + + constexpr index_t E1 = C0 * 9; + constexpr index_t E2 = 1; + constexpr index_t E1PerBlock = C0; + + constexpr index_t KPerThread = 16; + constexpr index_t HoPerThread = 2; + constexpr index_t WoPerThread = 2; + constexpr index_t EPerThread = 1; + + using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>; + using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>; + + constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; + constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; + + constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2; + + constexpr index_t CThreadTransferDstScalarPerVector_K = K1; +#elif 1 + constexpr index_t BlockSize = 64; + + constexpr index_t KPerBlock = 8; + constexpr index_t HoPerBlock = 8; + constexpr index_t WoPerBlock = 32; + + constexpr index_t E1 = 2 * 9; + constexpr index_t E2 = 1; + constexpr index_t K2 = 2; + constexpr index_t E1PerBlock = 2; + + constexpr index_t KPerThread = KPerBlock; + constexpr index_t HoPerThread = 2; + constexpr index_t WoPerThread = 2; + constexpr index_t EPerThread = 1; + + using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>; + using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = + Sequence<1, E1PerBlock, 1, KPerBlock, 1>; + + constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; + constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; + constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2; + constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize; +#endif + + if(KPerThread % InWeiVectorSize != 0) + { + throw std::runtime_error("wrong! 
C1 cannot be divided by InWeiVectorSize"); + } + + const auto in_n_c0_hi_wi_c1_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)); + const auto wei_k_c0_y_x_c1_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2)); + const auto max_n_k0_hx_wx_k1_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)); + const auto out_n_k0_ho_wo_k1_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)); + + constexpr auto conv_driver = + DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool< + BlockSize, + typename vector_type::type, + TAcc, + TOut, + E1, + E2, + K2, + KPerBlock, + HoPerBlock, + WoPerBlock, + E1PerBlock, + KPerThread, + HoPerThread, + WoPerThread, + EPerThread, + ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2, + ABlockTransferSrcScalarPerVector_E2, + ABlockTransferDstScalarPerVector_E2, + BThreadTransferSrcScalarPerVector_E2, + CThreadTransferDstScalarPerVector_K, + activ_type>{}; + + std::cerr << "conv_bias_activ_maxpool_input_" + << "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K + << "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0 + << "h" << Ho << "w" << Wo << "k" << K1 << "_maxpoolout_n" << N << "k" << K0 << "h" + << Ho / 2 << "w" << Wo / 2 << "k" << K1 << std::endl; + + for(int i = 0; i < 5; i++) + { + + const auto ave_time = + conv_driver.Run(wei_k_c0_y_x_c1_desc, + in_n_c0_hi_wi_c1_desc, + out_n_k0_ho_wo_k1_desc, + max_n_k0_hx_wx_k1_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + static_cast::type*>( + wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()), + static_cast::type*>( + in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()), + static_cast(bias_k0_k1_device_buf.GetDeviceBuffer()), + static_cast(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()), + 
static_cast(max_n_k0_hx_wx_k1_device_buf.GetDeviceBuffer()), + nrepeat); + + { + float perf = static_cast(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } + + out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data()); + max_n_k0_hx_wx_k1_device_buf.FromDevice(max_n_k0_hx_wx_k1.mData.data()); +} diff --git a/host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp b/host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp new file mode 100644 index 0000000000..bd2adcb3bd --- /dev/null +++ b/host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp @@ -0,0 +1,565 @@ +#ifndef DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP +#define DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_dlops_v3.hpp" + +template +struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add +{ + template + __host__ float Run(const ck::TensorDescriptor& wei_k_c0_y_x_c1_global_desc, + const ck::TensorDescriptor& in_n_c0_hi_wi_c1_global_desc, + const ck::TensorDescriptor& out_n_k0_ho_wo_k1_global_desc, + const ck::TensorDescriptor& add_n_k0_hox2_wox2_k1_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_d_grid, + const int nrepeat) const + { + using namespace ck; + + constexpr auto I0 = 
Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + + const auto N = in_n_c0_hi_wi_c1_global_desc.GetLength(I0); + const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1); + const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2); + const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3); + // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4); + + const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1); + const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2); + const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3); + const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4); + + const auto Hox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I2); + const auto Wox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I3); + + const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0); + const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2); + const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + +#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{}; + const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{}; + + const auto OutRightPadH = Hop - Ho; + const auto OutRightPadW = Wop - Wo; + + const auto OutRightPadHx = Number{}; + const auto OutRightPadWx = Number{}; +#else + const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock; + const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock; + + const auto OutRightPadH = Hop - Ho; + const auto OutRightPadW = Wop - Wo; + + const auto OutRightPadHx = OutRightPadH * 2; + const auto OutRightPadWx = OutRightPadW * 2; +#endif + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = 
in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH; + const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW; + + const auto E = C0 * Y * X; + + constexpr auto E1 = Number{}; + constexpr auto E2 = Number{}; + constexpr auto K2 = Number{}; + + const auto E0 = E / E1; + + // weight tensor + const auto a_e_k_e2_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)), + make_tuple(make_pass_through_transform(K), + make_pass_through_transform(C0 * Y * X), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{})); + + const auto a_e0_e1_k_e2_grid_desc = + transform_tensor_descriptor(a_e_k_e2_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(E0, E1)), + make_pass_through_transform(K), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{})); + + // input tensor + const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)), + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C0), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor( + in_n_c0_hip_wip_e2_global_desc, + make_tuple( + make_pass_through_transform(N), + make_pass_through_transform(C0), + make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)), + 
make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{})); + + const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor( + in_n_c0_y_ho_x_wo_e2_global_desc, + make_tuple(make_merge_transform(make_tuple(C0, Y, X)), + make_pass_through_transform(N), + make_pass_through_transform(Hop), + make_pass_through_transform(Wop), + make_pass_through_transform(E2)), + make_tuple( + Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor( + in_e_n_ho_wo_e2_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(E0, E1)), + make_pass_through_transform(N), + make_pass_through_transform(Hop), + make_pass_through_transform(Wop), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{})); + + // output tensor + const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)), + make_tuple(make_merge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_pad_transform(Ho, I0, OutRightPadH), + make_pad_transform(Wo, I0, OutRightPadW)), + make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + // add tensor + const auto d_k_n_hopx2_wopx2_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1)), + make_tuple(make_merge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_pad_transform(Hox2, I0, OutRightPadHx), + 
make_pad_transform(Wox2, I0, OutRightPadWx)), + make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl; + + if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 && + (E1 % E1PerBlock) == 0)) + { + throw std::runtime_error("wrong! GEMM size no divisible"); + } + + // clang-format off + + // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor + constexpr auto a_e0_e1_k_e2_global_step_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + + constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}; + + // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = + make_tuple( + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}) + ); + + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}; + + // hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor + constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks = + make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 
0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + + constexpr auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks = + make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + + // clang-format on + + // GEMM + using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperationEnum_t::Set, + decltype(a_e0_e1_k_e2_grid_desc), + decltype(b_e0_e1_n_ho_wo_e2_grid_desc), + decltype(c_k_n_hop_wop_grid_desc), + decltype(d_k_n_hopx2_wopx2_grid_desc), + E1, + E2, + K2, + KPerBlock, + HoPerBlock, + WoPerBlock, + E1PerBlock, + KPerThread, + HoPerThread, + WoPerThread, + EPerThread, + ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2, + Sequence<2, 3, 0, 1, 4>, + Sequence<0, 1, 2, 3, 4>, + 4, + 
ABlockTransferSrcScalarPerVector_E2, + ABlockTransferDstScalarPerVector_E2, + false, // don't move back src coordinate after threadwise copy + Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2 + 9, + BThreadTransferSrcScalarPerVector_E2, + false, // don't move back src coordinate after threadwise copy, which will be fused with + // MoveSrcSliceWindow() to save addr computation + Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2 + 1, + CThreadTransferDstScalarPerVector_K, + decltype(a_e0_e1_k_e2_global_step_hacks), + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks), + decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks), + decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks), + decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack), + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>; + + const auto a_e0_e1_k0_k1_e2_grid_desc = + GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc); + const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc); + const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = + GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc); + const auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc = + GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd( + d_k_n_hopx2_wopx2_grid_desc); + + using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc); + using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 = + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc); + using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + using DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2 = + decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc); + + const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N; + + const bool has_main_e0_block_loop = E0 > 1; + + std::cerr << 
"has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl; + + const auto c_blockid_to_k_n_h_w_block_cluster_adaptor = + GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc); + + using CBlockIdToBlockClusterAdaptor_K_N_H_W = + decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor); + + float ave_time = 0; + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE + + if(has_main_e0_block_loop) + { + const auto kernel = kernel_gemm_dlops_v3_resize_add< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + activ_type>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_d_grid, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor); + } + else + { + const auto kernel = kernel_gemm_dlops_v3_resize_add< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + activ_type>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_d_grid, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor); + } + +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER + DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2)); + DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf( + sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2)); + DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf( + sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2)); + DeviceMem 
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf( + sizeof(DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2)); + DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf( + sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W)); + + a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc); + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice( + &b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc); + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice( + &c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.ToDevice( + &d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc); + c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice( + &c_blockid_to_k_n_h_w_block_cluster_adaptor); + + if(has_main_e0_block_loop) + { + + const auto kernel = kernel_gemm_dlops_v3_resize_add< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + activ_type>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_d_grid, + cast_pointer_to_constant_address_space( + a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer())); + } + else + { + const auto kernel = kernel_gemm_dlops_v3_resize_add< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + activ_type>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + 
dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_d_grid, + cast_pointer_to_constant_address_space( + a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer())); + } +#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + { + static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), ""); + + const auto kernel = kernel_gemm_dlops_v3_resize_add< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + has_main_e0_block_loop, + activ_type>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_d_grid); + } +#endif + return ave_time; + } +}; +#endif diff --git a/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp new file mode 100644 index 0000000000..adb4cc79e7 --- /dev/null +++ b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp @@ -0,0 +1,500 @@ +#ifndef 
DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP +#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_dlops_v3.hpp" + +template +struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad +{ + template + __host__ float Run(const ck::TensorDescriptor& wei_k_c0_y_x_c1_global_desc, + const ck::TensorDescriptor& in_n_c0_hi_wi_c1_global_desc, + const ck::TensorDescriptor& out_n_k0_ho_wo_k1_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_c_grid, + const int nrepeat) const + { + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + + const auto N = in_n_c0_hi_wi_c1_global_desc.GetLength(I0); + const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1); + const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2); + const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3); + // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4); + + const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1); + const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2); + const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3); + const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4); + + const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0); + const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2); + const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = 
conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + +#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{}; + const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{}; +#else + const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock; + const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock; +#endif + + const auto OutRightPadH = Hop - Ho; + const auto OutRightPadW = Wop - Wo; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH; + const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW; + + const auto E = C0 * Y * X; + + constexpr auto E1 = Number{}; + constexpr auto E2 = Number{}; + constexpr auto K2 = Number{}; + + const auto E0 = E / E1; + + // weight tensor + const auto a_e_k_e2_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)), + make_tuple(make_pass_through_transform(K), + make_pass_through_transform(C0 * Y * X), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{})); + + const auto a_e0_e1_k_e2_grid_desc = + transform_tensor_descriptor(a_e_k_e2_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(E0, E1)), + make_pass_through_transform(K), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{})); + + // input tensor + const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)), + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C0), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + 
make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor( + in_n_c0_hip_wip_e2_global_desc, + make_tuple( + make_pass_through_transform(N), + make_pass_through_transform(C0), + make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{})); + + const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor( + in_n_c0_y_ho_x_wo_e2_global_desc, + make_tuple(make_merge_transform(make_tuple(C0, Y, X)), + make_pass_through_transform(N), + make_pass_through_transform(Hop), + make_pass_through_transform(Wop), + make_pass_through_transform(E2)), + make_tuple( + Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor( + in_e_n_ho_wo_e2_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(E0, E1)), + make_pass_through_transform(N), + make_pass_through_transform(Hop), + make_pass_through_transform(Wop), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{})); + + // output tensor + const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)), + 
make_tuple(make_merge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_pad_transform(Ho, I0, OutRightPadH), + make_pad_transform(Wo, I0, OutRightPadW)), + make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl; + + if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 && + (E1 % E1PerBlock) == 0)) + { + throw std::runtime_error("wrong! GEMM size no divisible"); + } + + // clang-format off + + // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor + constexpr auto a_e0_e1_k_e2_global_step_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + + constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}; + + // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = + make_tuple( + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}) + ); + + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}; + + // hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor + constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks = + make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 
0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + // clang-format on + + // GEMM + using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperationEnum_t::Set, + decltype(a_e0_e1_k_e2_grid_desc), + decltype(b_e0_e1_n_ho_wo_e2_grid_desc), + decltype(c_k_n_hop_wop_grid_desc), + decltype(c_k_n_hop_wop_grid_desc), + E1, + E2, + K2, + KPerBlock, + HoPerBlock, + WoPerBlock, + E1PerBlock, + KPerThread, + HoPerThread, + WoPerThread, + EPerThread, + ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2, + Sequence<2, 3, 0, 1, 4>, + Sequence<0, 1, 2, 3, 4>, + 4, + ABlockTransferSrcScalarPerVector_E2, + ABlockTransferDstScalarPerVector_E2, + false, // don't move back src coordinate after threadwise copy + Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2 + 9, + BThreadTransferSrcScalarPerVector_E2, + false, // don't move back src coordinate after threadwise copy, which will be fused with + // MoveSrcSliceWindow() to save addr computation + Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2 + 1, + CThreadTransferDstScalarPerVector_K, + decltype(a_e0_e1_k_e2_global_step_hacks), + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks), + decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks), + 
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks), + decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack), + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>; + + const auto a_e0_e1_k0_k1_e2_grid_desc = + GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc); + const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc); + const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = + GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc); + + using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc); + using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 = + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc); + using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N; + + const bool has_main_e0_block_loop = E0 > 1; + + std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl; + + const auto c_blockid_to_k_n_h_w_block_cluster_adaptor = + GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc); + + using CBlockIdToBlockClusterAdaptor_K_N_H_W = + decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor); + + float ave_time = 0; + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE + + if(has_main_e0_block_loop) + { + const auto kernel = + kernel_gemm_dlops_v3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + activ_type>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor); + } + else + { + const auto kernel = + kernel_gemm_dlops_v3, + remove_reference_t, + 
remove_reference_t, + remove_reference_t, + false, + activ_type>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor); + } + +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER + DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2)); + DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf( + sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2)); + DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf( + sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2)); + DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf( + sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W)); + + a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc); + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice( + &b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc); + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice( + &c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice( + &c_blockid_to_k_n_h_w_block_cluster_adaptor); + + if(has_main_e0_block_loop) + { + + const auto kernel = + kernel_gemm_dlops_v3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + activ_type>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + cast_pointer_to_constant_address_space( + a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + 
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer())); + } + else + { + + const auto kernel = + kernel_gemm_dlops_v3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + activ_type>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + cast_pointer_to_constant_address_space( + a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer())); + } +#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + { + static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), ""); + + const auto kernel = + kernel_gemm_dlops_v3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + has_main_e0_block_loop, + activ_type>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid); + } +#endif + return ave_time; + } +}; +#endif diff --git a/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp deleted file mode 100644 index efd4ce6a19..0000000000 --- a/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp +++ /dev/null @@ -1,349 +0,0 @@ -#ifndef 
DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP -#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "gridwise_gemm_dlops_v2.hpp" -#include "gridwise_operation_wrapper.hpp" - -template -struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad -{ - template - __host__ void Run(const ck::TensorDescriptor& wei_k_c_y_x_global_desc, - const ck::TensorDescriptor& in_n_c_hi_wi_global_desc, - const ck::TensorDescriptor& out_n_k0_ho_wo_k1_global_desc, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const FloatAB* __restrict__ p_wei_global, - const FloatAB* __restrict__ p_in_global, - FloatC* __restrict__ p_out_global) const - { - using namespace ck; - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); - const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); - const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1); - - const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2); - const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3); - - const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2); - const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3); - - const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4); - - const auto K = wei_k_c_y_x_global_desc.GetLength(I0); - const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); - const auto X = wei_k_c_y_x_global_desc.GetLength(I3); - - const auto ConvStrideH = conv_strides[I0]; - const auto ConvStrideW = conv_strides[I1]; - - const auto ConvDilationH = conv_dilations[I0]; - const auto ConvDilationW = conv_dilations[I1]; - - const auto InLeftPadH = in_left_pads[I0]; 
- const auto InLeftPadW = in_left_pads[I1]; - - const auto InRightPadH = in_right_pads[I0]; - const auto InRightPadW = in_right_pads[I1]; - - // weight tensor - const auto wei_e_k_global_desc = transform_tensor_descriptor( - make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), - make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<1>{}, Sequence<0>{})); - - // input tensor - const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( - in_n_c_hi_wi_global_desc, - make_tuple(make_pass_through_transform(N), - make_pass_through_transform(C), - make_pad_transform(Hi, InLeftPadH, InRightPadH), - make_pad_transform(Wi, InLeftPadW, InRightPadW)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( - in_n_c_hip_wip_global_desc, - make_tuple( - make_pass_through_transform(N), - make_pass_through_transform(C), - make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), - make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); - - const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor( - in_n_c_y_ho_x_wo_global_desc, - make_tuple(make_merge_transform(make_tuple(C, Y, X)), - make_pass_through_transform(N), - make_pass_through_transform(Ho), - make_pass_through_transform(Wo)), - make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - // output tensor - const auto out_k_n_ho_wo_global_desc = transform_tensor_descriptor( - make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, 
K1)), - make_tuple(make_merge_transform(make_tuple(K0, K1)), - make_pass_through_transform(N), - make_pass_through_transform(Ho), - make_pass_through_transform(Wo)), - make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto E = C * Y * X; - - if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 && - (E % EPerBlock) == 0)) - { - throw std::runtime_error("wrong! GEMM size no divisible"); - } - - // hack to control index calculation when iterating over a_k_m_global tensor - constexpr auto a_e_k_global_step_hacks = - make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{})); - - constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{}; - - constexpr auto b_e_n_ho_wo_global_step_hacks = - make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); - - constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack = - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}; - - // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor - // hack for NKHW format - constexpr auto c_k_n_ho_wo_global_tensor_step_hacks = - make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}), - make_tuple(Sequence<0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{})); - -#if 1 - // GEMM - using gridwise_gemm = 
GridwiseGemmDlops_km_kn_mn_v3< - BlockSize, - FloatAB, - FloatAcc, - FloatC, - InMemoryDataOperationEnum_t::Set, - decltype(wei_e_k_global_desc), - decltype(in_e_n_ho_wo_global_desc), - decltype(out_k_n_ho_wo_global_desc), - KPerBlock, - HoPerBlock, - WoPerBlock, - EPerBlock, - KPerThread, - HoPerThread, - WoPerThread, - EPerThread, - ABlockTransferThreadSliceLengths_E_K, - ABlockTransferThreadClusterLengths_E_K, - Sequence<1, 0>, - Sequence<1, 0>, - 0, - ABlockTransferSrcScalarPerVector_E, - ABlockTransferDstScalarPerVector_K, - false, // don't move back src coordinate after threadwise copy - Sequence<0, 2, 3, 1>, - 3, - BThreadTransferSrcScalarPerVector_W, - false, // don't move back src coordinate after threadwise copy, which will be fused with - // MoveSrcSliceWindow() to save addr computation - Sequence<0, 2, 3, 1>, - 0, - CThreadTransferDstScalarPerVector_W, - decltype(a_e_k_global_step_hacks), - decltype(b_e_n_ho_wo_global_step_hacks), - decltype(c_k_n_ho_wo_global_tensor_step_hacks), - decltype(a_e_k_global_move_slice_window_step_hack), - decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>; - - const auto GridSize = (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock) * N; - - const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1; - - const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0; - - index_t nrepeat = 100; - - for(index_t i = 0; i < 5; ++i) - { - std::cout << "Start running " << nrepeat << " times..." 
<< std::endl; - - KernelTimer timer; - timer.Start(); - std::cout << "has_main_k_block_loop: " << has_main_k_block_loop - << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop - << std::endl; - - for(index_t j = 0; j < nrepeat; ++j) - { - if(has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - wei_e_k_global_desc, - p_wei_global, - in_e_n_ho_wo_global_desc, - p_in_global, - out_k_n_ho_wo_global_desc, - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(has_main_k_block_loop && !has_double_tail_k_block_loop) - { - const auto kernel = run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - wei_e_k_global_desc, - p_wei_global, - in_e_n_ho_wo_global_desc, - p_in_global, - out_k_n_ho_wo_global_desc, - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(!has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - wei_e_k_global_desc, - p_wei_global, - in_e_n_ho_wo_global_desc, - p_in_global, - out_k_n_ho_wo_global_desc, - p_out_global, - integral_constant{}, - integral_constant{}); - } - else - { - const auto kernel = run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - wei_e_k_global_desc, - p_wei_global, - in_e_n_ho_wo_global_desc, - p_in_global, - out_k_n_ho_wo_global_desc, - p_out_global, - integral_constant{}, - integral_constant{}); - } - } - - timer.End(); - - float ave_time = timer.GetElapsedTime() / nrepeat; - - float perf = - static_cast(calculate_convolution_flops(in_n_c_hi_wi_global_desc, - wei_k_c_y_x_global_desc, - out_n_k0_ho_wo_k1_global_desc)) / - (std::size_t(1000) * 1000 * 1000) / ave_time; - - 
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" - << std::endl; - } -#endif - } -}; -#endif diff --git a/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp deleted file mode 100644 index 70f73cbf4a..0000000000 --- a/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp +++ /dev/null @@ -1,364 +0,0 @@ -#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP -#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "gridwise_gemm_dlops_v2.hpp" -#include "gridwise_operation_wrapper.hpp" - -template -struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad -{ - template - __host__ void Run(const ck::TensorDescriptor& wei_k_c_y_x_global_desc, - const ck::TensorDescriptor& in_n_c_hi_wi_global_desc, - const ck::TensorDescriptor& out_n_k0_ho_wo_k1_global_desc, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const FloatAB* __restrict__ p_wei_global, - const FloatAB* __restrict__ p_in_global, - FloatC* __restrict__ p_out_global) const - { - using namespace ck; - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); - const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); - const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1); - - const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2); - const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3); - - const auto Ho 
= out_n_k0_ho_wo_k1_global_desc.GetLength(I2); - const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3); - - const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4); - - const auto K = wei_k_c_y_x_global_desc.GetLength(I0); - const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); - const auto X = wei_k_c_y_x_global_desc.GetLength(I3); - - const auto ConvStrideH = conv_strides[I0]; - const auto ConvStrideW = conv_strides[I1]; - - const auto ConvDilationH = conv_dilations[I0]; - const auto ConvDilationW = conv_dilations[I1]; - - const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock; - const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock; - - const auto OutRightPadH = Hop - Ho; - const auto OutRightPadW = Wop - Wo; - - const auto InLeftPadH = in_left_pads[I0]; - const auto InLeftPadW = in_left_pads[I1]; - - const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH; - const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW; - - std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW - << std::endl; - std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW - << std::endl; - - // weight tensor - const auto wei_e_k_global_desc = transform_tensor_descriptor( - make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), - make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<1>{}, Sequence<0>{})); - - // input tensor - const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( - in_n_c_hi_wi_global_desc, - make_tuple(make_pass_through_transform(N), - make_pass_through_transform(C), - make_pad_transform(Hi, InLeftPadH, InRightPadH), - make_pad_transform(Wi, InLeftPadW, InRightPadW)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto 
in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( - in_n_c_hip_wip_global_desc, - make_tuple( - make_pass_through_transform(N), - make_pass_through_transform(C), - make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)), - make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW))), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); - - const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor( - in_n_c_y_ho_x_wo_global_desc, - make_tuple(make_merge_transform(make_tuple(C, Y, X)), - make_pass_through_transform(N), - make_pass_through_transform(Hop), - make_pass_through_transform(Wop)), - make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - // output tensor - const auto out_k_n_hop_wop_global_desc = transform_tensor_descriptor( - make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)), - make_tuple(make_merge_transform(make_tuple(K0, K1)), - make_pass_through_transform(N), - make_pad_transform(Ho, 0, OutRightPadH), - make_pad_transform(Wo, 0, OutRightPadW)), - make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto E = C * Y * X; - - std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl; - - if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 && - (E % EPerBlock) == 0)) - { - throw std::runtime_error("wrong! 
GEMM size no divisible"); - } - - // hack to control index calculation when iterating over a_k_m_global tensor - constexpr auto a_e_k_global_step_hacks = - make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{})); - - constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{}; - - constexpr auto b_e_n_ho_wo_global_step_hacks = - make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); - - constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack = - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}; - - // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor - // hack for NKHW format - constexpr auto c_k_n_ho_wo_global_tensor_step_hacks = - make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}), - make_tuple(Sequence<0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{})); - - // GEMM - using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3< - BlockSize, - FloatAB, - FloatAcc, - FloatC, - InMemoryDataOperationEnum_t::Set, - decltype(wei_e_k_global_desc), - decltype(in_e_n_ho_wo_global_desc), - decltype(out_k_n_hop_wop_global_desc), - KPerBlock, - HoPerBlock, - WoPerBlock, - EPerBlock, - KPerThread, - HoPerThread, - WoPerThread, - EPerThread, - ABlockTransferThreadSliceLengths_E_K, - ABlockTransferThreadClusterLengths_E_K, - Sequence<1, 0>, - Sequence<1, 0>, - 0, - ABlockTransferSrcScalarPerVector_E, - 
ABlockTransferDstScalarPerVector_K, - false, // don't move back src coordinate after threadwise copy - Sequence<0, 2, 3, 1>, - 3, - BThreadTransferSrcScalarPerVector_W, - false, // don't move back src coordinate after threadwise copy, which will be fused with - // MoveSrcSliceWindow() to save addr computation - Sequence<0, 2, 3, 1>, - 0, - CThreadTransferDstScalarPerVector_W, - decltype(a_e_k_global_step_hacks), - decltype(b_e_n_ho_wo_global_step_hacks), - decltype(c_k_n_ho_wo_global_tensor_step_hacks), - decltype(a_e_k_global_move_slice_window_step_hack), - decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>; - - const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N; - - const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1; - - const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0; - - index_t nrepeat = 100; - - for(index_t i = 0; i < 5; ++i) - { - std::cout << "Start running " << nrepeat << " times..." << std::endl; - - KernelTimer timer; - timer.Start(); - std::cout << "has_main_k_block_loop: " << has_main_k_block_loop - << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop - << std::endl; - - for(index_t j = 0; j < nrepeat; ++j) - { - if(has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - wei_e_k_global_desc, - p_wei_global, - in_e_n_ho_wo_global_desc, - p_in_global, - out_k_n_hop_wop_global_desc, - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(has_main_k_block_loop && !has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - wei_e_k_global_desc, - p_wei_global, - in_e_n_ho_wo_global_desc, - p_in_global, - out_k_n_hop_wop_global_desc, - p_out_global, - integral_constant{}, - 
integral_constant{}); - } - else if(!has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - wei_e_k_global_desc, - p_wei_global, - in_e_n_ho_wo_global_desc, - p_in_global, - out_k_n_hop_wop_global_desc, - p_out_global, - integral_constant{}, - integral_constant{}); - } - else - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - wei_e_k_global_desc, - p_wei_global, - in_e_n_ho_wo_global_desc, - p_in_global, - out_k_n_hop_wop_global_desc, - p_out_global, - integral_constant{}, - integral_constant{}); - } - } - - timer.End(); - - float ave_time = timer.GetElapsedTime() / nrepeat; - - float perf = - static_cast(calculate_convolution_flops(in_n_c_hi_wi_global_desc, - wei_k_c_y_x_global_desc, - out_n_k0_ho_wo_k1_global_desc)) / - (std::size_t(1000) * 1000 * 1000) / ave_time; - - std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" - << std::endl; - } - } -}; -#endif diff --git a/host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp b/host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp new file mode 100644 index 0000000000..3d3d54fa45 --- /dev/null +++ b/host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp @@ -0,0 +1,569 @@ +#ifndef DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP +#define DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_dlops_v3.hpp" + +template +struct 
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool +{ + template + __host__ float Run(const ck::TensorDescriptor& wei_k_c0_y_x_c1_global_desc, + const ck::TensorDescriptor& in_n_c0_hi_wi_c1_global_desc, + const ck::TensorDescriptor& out_n_k0_ho_wo_k1_global_desc, + const ck::TensorDescriptor& max_n_k0_hx_wx_k1_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_c_grid, + FloatC* __restrict__ p_d_grid, + const int nrepeat) const + { + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + + const auto N = in_n_c0_hi_wi_c1_global_desc.GetLength(I0); + const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1); + const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2); + const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3); + // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4); + + const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1); + const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2); + const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3); + const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4); + + const auto Hx = max_n_k0_hx_wx_k1_global_desc.GetLength(I2); + const auto Wx = max_n_k0_hx_wx_k1_global_desc.GetLength(I3); + + const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0); + const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2); + const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + +#if 
CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{}; + const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{}; + + const auto OutRightPadH = Hop - Ho; + const auto OutRightPadW = Wop - Wo; + + const auto OutRightPadHx = Number{}; + const auto OutRightPadWx = Number{}; +#else + const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock; + const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock; + + const auto OutRightPadH = Hop - Ho; + const auto OutRightPadW = Wop - Wo; + + const auto OutRightPadHx = OutRightPadH / 2; + const auto OutRightPadWx = OutRightPadW / 2; +#endif + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH; + const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW; + + const auto E = C0 * Y * X; + + constexpr auto E1 = Number{}; + constexpr auto E2 = Number{}; + constexpr auto K2 = Number{}; + + const auto E0 = E / E1; + + // weight tensor + const auto a_e_k_e2_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)), + make_tuple(make_pass_through_transform(K), + make_pass_through_transform(C0 * Y * X), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{})); + + const auto a_e0_e1_k_e2_grid_desc = + transform_tensor_descriptor(a_e_k_e2_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(E0, E1)), + make_pass_through_transform(K), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{})); + + // input tensor + const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)), + 
make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C0), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor( + in_n_c0_hip_wip_e2_global_desc, + make_tuple( + make_pass_through_transform(N), + make_pass_through_transform(C0), + make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{})); + + const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor( + in_n_c0_y_ho_x_wo_e2_global_desc, + make_tuple(make_merge_transform(make_tuple(C0, Y, X)), + make_pass_through_transform(N), + make_pass_through_transform(Hop), + make_pass_through_transform(Wop), + make_pass_through_transform(E2)), + make_tuple( + Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor( + in_e_n_ho_wo_e2_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(E0, E1)), + make_pass_through_transform(N), + make_pass_through_transform(Hop), + make_pass_through_transform(Wop), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{})); + + // output tensor + const auto 
c_k_n_hop_wop_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)), + make_tuple(make_merge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_pad_transform(Ho, I0, OutRightPadH), + make_pad_transform(Wo, I0, OutRightPadW)), + make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + // max tensor + const auto d_k_n_hx_wx_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)), + make_tuple(make_merge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_pad_transform(Hx, I0, OutRightPadHx), + make_pad_transform(Wx, I0, OutRightPadWx)), + make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl; + + if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 && + (E1 % E1PerBlock) == 0)) + { + throw std::runtime_error("wrong! 
GEMM size no divisible"); + } + + // clang-format off + + // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor + constexpr auto a_e0_e1_k_e2_global_step_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + + constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}; + + // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = + make_tuple( + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}) + ); + + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}; + + constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks = + make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + + constexpr auto 
d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = + make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + + // clang-format on + + // GEMM + using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperationEnum_t::Set, + decltype(a_e0_e1_k_e2_grid_desc), + decltype(b_e0_e1_n_ho_wo_e2_grid_desc), + decltype(c_k_n_hop_wop_grid_desc), + decltype(d_k_n_hx_wx_grid_desc), + E1, + E2, + K2, + KPerBlock, + HoPerBlock, + WoPerBlock, + E1PerBlock, + KPerThread, + HoPerThread, + WoPerThread, + EPerThread, + ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2, + Sequence<2, 3, 0, 1, 4>, + Sequence<0, 1, 2, 3, 4>, + 4, + ABlockTransferSrcScalarPerVector_E2, + ABlockTransferDstScalarPerVector_E2, + false, // don't move back src coordinate after threadwise copy + Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2 + 9, + BThreadTransferSrcScalarPerVector_E2, + false, // don't move back src coordinate after threadwise copy, which will be fused + // with MoveSrcSliceWindow() to save addr computation + Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2 + 1, + CThreadTransferDstScalarPerVector_K, + 
decltype(a_e0_e1_k_e2_global_step_hacks), + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks), + decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks), + decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks), + decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack), + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>; + + const auto a_e0_e1_k0_k1_e2_grid_desc = + GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc); + const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc); + const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = + GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc); + const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = + GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(d_k_n_hx_wx_grid_desc); + + using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc); + using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 = + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc); + using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + using DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx = decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc); + + const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N; + + const bool has_main_e0_block_loop = E0 > 1; + + std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl; + + const auto c_blockid_to_k_n_h_w_block_cluster_adaptor = + GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc); + + using CBlockIdToBlockClusterAdaptor_K_N_H_W = + decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor); + + float ave_time = 0; + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE + + if(has_main_e0_block_loop) + { + const auto kernel = kernel_gemm_dlops_v3_maxpool< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + 
remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + activ_type>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_d_grid, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor); + } + else + { + const auto kernel = kernel_gemm_dlops_v3_maxpool< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + activ_type>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_d_grid, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + c_blockid_to_k_n_h_w_block_cluster_adaptor); + } + +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER + DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2)); + DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf( + sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2)); + DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf( + sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2)); + DeviceMem d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf( + sizeof(DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx)); + DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf( + sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W)); + + a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc); + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice( + &b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc); + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice( + &c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.ToDevice( + 
&d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc); + c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice( + &c_blockid_to_k_n_h_w_block_cluster_adaptor); + + if(has_main_e0_block_loop) + { + + const auto kernel = kernel_gemm_dlops_v3_maxpool< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + activ_type>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_d_grid, + cast_pointer_to_constant_address_space( + a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer())); + } + else + { + + const auto kernel = kernel_gemm_dlops_v3_maxpool< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + activ_type>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_d_grid, + cast_pointer_to_constant_address_space( + a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.GetDeviceBuffer()), + 
cast_pointer_to_constant_address_space( + c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer())); + } +#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + { + static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), ""); + static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), ""); + + const auto kernel = kernel_gemm_dlops_v3_maxpool< + GridwiseGemm, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + has_main_e0_block_loop, + activ_type>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_d_grid); + } +#endif + return ave_time; + } +}; +#endif diff --git a/host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp b/host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp new file mode 100644 index 0000000000..d818f3c950 --- /dev/null +++ b/host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp @@ -0,0 +1,414 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "debug.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "conv_common.hpp" +#include "device_tensor.hpp" +#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" + +#define USE_DYNAMIC_MODE 0 +#define USE_CONV_FWD_V5R1_NCHWC 1 + +enum ConvForwardAlgo +{ + V5R1NCHWC // 0 +}; + +template +void host_direct_convolution_add_nchwc(const Tensor& in, + const Tensor& wei, + const Tensor& add, + const Tensor& bias, + Tensor& add_host, + Tensor& out_host, + const ConvStrides& 
conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads&, + const ck::ActivTypeEnum_t activ_type) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) { + double v = 0; + auto k = k0 * out_host.mDesc.GetLengths()[4] + k1; + + for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0) + { + for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) + { + int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0]; + for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) + { + int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1]; + if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && + wi < in.mDesc.GetLengths()[3]) + { + + for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1) + { + v += static_cast(in(n, c0, hi, wi, c1)) * + static_cast(wei(k, c0, y, x, c1)); + } + } + } + } + } + + v += bias(k0, k1); + v = activ(v, activ_type); + + const int hox2 = ho * 2; + const int wox2 = wo * 2; + + out_host(n, k0, ho, wo, k1) = v; + + add_host(n, k0, hox2, wox2, k1) = v + add(n, k0, hox2, wox2, k1); + add_host(n, k0, hox2, wox2 + 1, k1) = v + add(n, k0, hox2, wox2 + 1, k1); + add_host(n, k0, hox2 + 1, wox2, k1) = v + add(n, k0, hox2 + 1, wox2, k1); + add_host(n, k0, hox2 + 1, wox2 + 1, k1) = v + add(n, k0, hox2 + 1, wox2 + 1, k1); + }; + + make_ParallelTensorFunctor(f_nchw, + out_host.mDesc.GetLengths()[0], + out_host.mDesc.GetLengths()[1], + out_host.mDesc.GetLengths()[2], + out_host.mDesc.GetLengths()[3], + out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency()); +} + +int main(int argc, char* argv[]) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr 
auto I7 = Number<7>{}; + +#if USE_DYNAMIC_MODE + // dynamic mode + if(argc != 23) + { + printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n"); + printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(1); + } + + constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + + const ConvForwardAlgo algo = static_cast(std::stoi(argv[1])); + const bool do_verification = std::stoi(argv[2]); + const int init_method = std::stoi(argv[3]); + const bool do_log = std::stoi(argv[4]); + const int nrepeat = std::stoi(argv[5]); + + const index_t N = std::stoi(argv[6]); + const index_t K0 = std::stoi(argv[7]); + const index_t K1 = std::stoi(argv[8]); + const index_t C0 = std::stoi(argv[9]); + const index_t C1 = std::stoi(argv[10]); + const index_t Y = std::stoi(argv[11]); + const index_t X = std::stoi(argv[12]); + const index_t Hi = std::stoi(argv[13]); + const index_t Wi = std::stoi(argv[14]); + + const index_t conv_stride_h = std::stoi(argv[15]); + const index_t conv_stride_w = std::stoi(argv[16]); + const index_t conv_dilation_h = std::stoi(argv[17]); + const index_t conv_dilation_w = std::stoi(argv[18]); + const index_t in_left_pad_h = std::stoi(argv[19]); + const index_t in_left_pad_w = std::stoi(argv[20]); + const index_t in_right_pad_h = std::stoi(argv[21]); + const index_t in_right_pad_w = std::stoi(argv[22]); + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + + const auto Hox2 = Ho * 2; + const auto Wox2 = Wo * 2; +#else + // static mode + if(argc < 6) + { + printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n"); + exit(1); + } + + const ConvForwardAlgo algo = static_cast(std::stoi(argv[1])); + + const bool do_verification = 
std::stoi(argv[2]); + const int init_method = std::stoi(argv[3]); + const bool do_log = std::stoi(argv[4]); + const int nrepeat = std::stoi(argv[5]); + + constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + +#if 0 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<1080>{}; + constexpr auto Wi = Number<1920>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K1 = Number<8>{}; + constexpr auto K0 = Number<8>{}; +#elif 0 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<540>{}; + constexpr auto Wi = Number<960>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#elif 0 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<270>{}; + constexpr auto Wi = Number<480>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#elif 1 + constexpr auto N = Number<128>{}; + constexpr auto Hi = Number<135>{}; + constexpr auto Wi = Number<240>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#elif 1 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<32>{}; + constexpr auto Wi = Number<32>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K1 = Number<8>{}; + constexpr auto K0 = Number<8>{}; +#endif + + constexpr auto conv_stride_h = I1; + constexpr auto conv_stride_w = I1; + constexpr auto conv_dilation_h = I1; + constexpr auto 
conv_dilation_w = I1; + constexpr auto in_left_pad_h = I1; + constexpr auto in_left_pad_w = I1; + constexpr auto in_right_pad_h = I1; + constexpr auto in_right_pad_w = I1; + + constexpr auto YEff = (Y - I1) * conv_dilation_h + I1; + constexpr auto XEff = (X - I1) * conv_dilation_w + I1; + + constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1; + constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1; + + constexpr auto Hox2 = Number{}; + constexpr auto Wox2 = Number{}; + +#endif + +#if 0 + using in_data_t = float; + using acc_data_t = float; + using out_data_t = float; +#elif 1 + using in_data_t = half_t; + using acc_data_t = float; + using out_data_t = half_t; +#elif 1 + using in_data_t = int8_t; + using acc_data_t = int32_t; + using out_data_t = int8_t; +#endif + + std::vector in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5), + add_lengths_host(5), bias_lengths_host(2); + + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(C0); + in_lengths_host[2] = static_cast(Hi); + in_lengths_host[3] = static_cast(Wi); + in_lengths_host[4] = static_cast(C1); + + wei_lengths_host[0] = static_cast(K0 * K1); + wei_lengths_host[1] = static_cast(C0); + wei_lengths_host[2] = static_cast(Y); + wei_lengths_host[3] = static_cast(X); + wei_lengths_host[4] = static_cast(C1); + + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(K0); + out_lengths_host[2] = static_cast(Ho); + out_lengths_host[3] = static_cast(Wo); + out_lengths_host[4] = static_cast(K1); + + add_lengths_host[0] = static_cast(N); + add_lengths_host[1] = static_cast(K0); + add_lengths_host[2] = static_cast(Hox2); + add_lengths_host[3] = static_cast(Wox2); + add_lengths_host[4] = static_cast(K1); + + bias_lengths_host[0] = static_cast(K0); + bias_lengths_host[1] = static_cast(K1); + + Tensor in(in_lengths_host); + Tensor wei(wei_lengths_host); + Tensor add(add_lengths_host); + Tensor 
add_device(add_lengths_host); + Tensor add_host(add_lengths_host); + Tensor bias(bias_lengths_host); + Tensor out_host(out_lengths_host); + + ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); + ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); + ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: "); + + print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); + print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); + print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); + print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: + // no initialization + break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 3: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 4: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 5: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) 
* GeneratorTensor_Checkboard{}(is...); + }; + wei.GenerateTensorValue(gen_wei, num_thread); + } + + bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + add.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + + auto f_make_for_device_nchwc = [&]() { + const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1); + const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1); + const auto add_lengths_dev = make_tuple(N, K0, Hox2, Wox2, K1); + const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1); + const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w); + + return make_tuple(in_lengths_dev, + wei_lengths_dev, + add_lengths_dev, + out_lengths_dev, + conv_strides_dev, + conv_dilations_dev, + in_left_pads_dev, + in_right_pads_dev); + }; + +#if USE_CONV_FWD_V5R1_NCHWC + if(algo == ConvForwardAlgo::V5R1NCHWC) + { + const auto tmp = f_make_for_device_nchwc(); + + device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1( + tmp[I0], // in_lengths_dev + tmp[I1], // wei_lengths_dev + tmp[I2], // add_lengths_dev + tmp[I3], // out_lengths_dev + tmp[I4], // conv_strides_dev + tmp[I5], // conv_dilations_dev + tmp[I6], // in_left_pads_dev + tmp[I7], // in_right_pads_dev + in, + wei, + bias, + add, + add_device, + nrepeat); + } +#endif + + if(do_verification) + { + host_direct_convolution_add_nchwc(in, + wei, + add, + bias, + add_host, + out_host, + make_tuple(conv_stride_h, conv_stride_w), + make_tuple(conv_dilation_h, conv_dilation_w), + make_tuple(in_left_pad_h, in_left_pad_w), + make_tuple(in_right_pad_h, in_right_pad_w), + activ_type); + + check_error(add_host, add_device); + + if(do_log) + { + LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; + LogRangeAsType(std::cout << "wei: 
", wei.mData, ",") << std::endl; + LogRangeAsType(std::cout << "add_host: ", add_host.mData, ",") << std::endl; + LogRangeAsType(std::cout << "add_device: ", add_device.mData, ",") << std::endl; + } + } +} diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp index 30a72e3bbb..208f99098d 100644 --- a/host/driver_offline/src/conv_fwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp @@ -15,17 +15,15 @@ #include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp" -#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" -#define USE_DYNAMIC_MODE 1 +#define USE_DYNAMIC_MODE 0 #define USE_CONV_FWD_V4R4_NCHW 0 -#define USE_CONV_FWD_V4R4R2_NHWC 0 -#define USE_CONV_FWD_V6R1_NCHW 0 -#define USE_CONV_FWD_V5R1_NCHW 0 +#define USE_CONV_FWD_V4R4R2_NHWC 1 +#define USE_CONV_FWD_V6R1_NCHW 1 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0 -#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1 +#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0 enum ConvTensorLayout { @@ -41,9 +39,8 @@ enum ConvForwardAlgo V4R4NCHW, // 0 V4R4R2NHWC, // 1 V6R1NCHW, // 2 - V5R1NCHW, // 3 - V4R4R2XDLNCHW, // 4 - V4R4R4XDLNHWC // 5 + V4R4R2XDLNCHW, // 3 + V4R4R4XDLNHWC // 4 }; template {}; constexpr auto X = Number<3>{}; - constexpr auto conv_stride_h = I2; - constexpr auto conv_stride_w = I2; + constexpr auto conv_stride_h = I1; + constexpr auto conv_stride_w = I1; constexpr auto conv_dilation_h = I1; constexpr auto conv_dilation_w = I1; constexpr auto in_left_pad_h = I1; @@ -253,7 +250,7 @@ int main(int argc, char* argv[]) constexpr auto Wo = (Wi + in_left_pad_w + 
in_right_pad_w - XEff) / conv_stride_w + I1; #endif -#if 0 +#if 1 using in_data_t = float; using acc_data_t = float; using out_data_t = float; @@ -472,33 +469,6 @@ int main(int argc, char* argv[]) } #endif -#if USE_CONV_FWD_V5R1_NCHW - if(algo == ConvForwardAlgo::V5R1NCHW) - { - if(layout != ConvTensorLayout::NCHW) - { - throw std::runtime_error("wrong! layout"); - } - - const auto tmp = f_make_for_device_nchw(); - - device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(tmp[I0], - tmp[I1], - tmp[I2], - tmp[I3], - tmp[I4], - tmp[I5], - tmp[I6], - in, - wei, - out_device, - nrepeat); - } -#endif - #if USE_CONV_FWD_V4R4R2_XDL_NCHW if(algo == ConvForwardAlgo::V4R4R2XDLNCHW) { diff --git a/host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp b/host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp new file mode 100644 index 0000000000..6b34254c74 --- /dev/null +++ b/host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp @@ -0,0 +1,391 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "debug.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "conv_common.hpp" +#include "device_tensor.hpp" +#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" + +#define USE_DYNAMIC_MODE 0 +#define USE_CONV_FWD_V5R1_NCHWC 1 + +enum ConvForwardAlgo +{ + V5R1NCHWC // 0 +}; + +template +void host_direct_convolution_nchwc(const Tensor& in, + const Tensor& wei, + const Tensor& bias, + Tensor& out, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads&, + const ck::ActivTypeEnum_t activ_type) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) { + double v = 0; + const int k = k0 * out.mDesc.GetLengths()[4] + k1; + + for(int c0 = 0; 
c0 < wei.mDesc.GetLengths()[1]; ++c0) + { + for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) + { + int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0]; + for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) + { + int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1]; + if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && + wi < in.mDesc.GetLengths()[3]) + { + for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1) + { + v += static_cast(in(n, c0, hi, wi, c1)) * + static_cast(wei(k, c0, y, x, c1)); + } + } + } + } + } + v += bias(k0, k1); + out(n, k0, ho, wo, k1) = activ(v, activ_type); + }; + + make_ParallelTensorFunctor(f_nchw, + out.mDesc.GetLengths()[0], + out.mDesc.GetLengths()[1], + out.mDesc.GetLengths()[2], + out.mDesc.GetLengths()[3], + out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency()); +} + +int main(int argc, char* argv[]) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + +#if USE_DYNAMIC_MODE + // dynamic mode + if(argc != 23) + { + printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n"); + printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(1); + } + + constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + + const ConvForwardAlgo algo = static_cast(std::stoi(argv[1])); + const bool do_verification = std::stoi(argv[2]); + const int init_method = std::stoi(argv[3]); + const bool do_log = std::stoi(argv[4]); + const int nrepeat = std::stoi(argv[5]); + + const index_t N = std::stoi(argv[6]); + const index_t K0 = std::stoi(argv[7]); + const index_t K1 = std::stoi(argv[8]); + const index_t C0 = std::stoi(argv[9]); + const index_t C1 = std::stoi(argv[10]); + const index_t Y = 
std::stoi(argv[11]); + const index_t X = std::stoi(argv[12]); + const index_t Hi = std::stoi(argv[13]); + const index_t Wi = std::stoi(argv[14]); + + const index_t conv_stride_h = std::stoi(argv[15]); + const index_t conv_stride_w = std::stoi(argv[16]); + const index_t conv_dilation_h = std::stoi(argv[17]); + const index_t conv_dilation_w = std::stoi(argv[18]); + const index_t in_left_pad_h = std::stoi(argv[19]); + const index_t in_left_pad_w = std::stoi(argv[20]); + const index_t in_right_pad_h = std::stoi(argv[21]); + const index_t in_right_pad_w = std::stoi(argv[22]); + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; +#else + // static mode + if(argc < 6) + { + printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n"); + exit(1); + } + + const ConvForwardAlgo algo = static_cast(std::stoi(argv[1])); + + const bool do_verification = std::stoi(argv[2]); + const int init_method = std::stoi(argv[3]); + const bool do_log = std::stoi(argv[4]); + const int nrepeat = std::stoi(argv[5]); + + // constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid; + constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + +#if 0 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<1080>{}; + constexpr auto Wi = Number<1920>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<1>{}; + constexpr auto K1 = Number<4>{}; +#elif 1 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<1080>{}; + constexpr auto Wi = Number<1920>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto 
K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#elif 0 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<1080>{}; + constexpr auto Wi = Number<1920>{}; + constexpr auto Y = Number<1>{}; + constexpr auto X = Number<1>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#elif 0 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<540>{}; + constexpr auto Wi = Number<960>{}; + constexpr auto Y = Number<1>{}; + constexpr auto X = Number<1>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#elif 0 + constexpr auto N = Number<128>{}; + constexpr auto Hi = Number<270>{}; + constexpr auto Wi = Number<480>{}; + constexpr auto Y = Number<1>{}; + constexpr auto X = Number<1>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#endif + + constexpr auto conv_stride_h = I1; + constexpr auto conv_stride_w = I1; + constexpr auto conv_dilation_h = I1; + constexpr auto conv_dilation_w = I1; + +#if 1 + constexpr auto in_left_pad_h = I1; + constexpr auto in_left_pad_w = I1; + constexpr auto in_right_pad_h = I1; + constexpr auto in_right_pad_w = I1; +#else + constexpr auto in_left_pad_h = I0; + constexpr auto in_left_pad_w = I0; + constexpr auto in_right_pad_h = I0; + constexpr auto in_right_pad_w = I0; +#endif + + constexpr auto YEff = (Y - I1) * conv_dilation_h + I1; + constexpr auto XEff = (X - I1) * conv_dilation_w + I1; + + constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1; + constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1; +#endif + +#if 0 + using in_data_t = float; + using acc_data_t = float; + using out_data_t = float; +#elif 1 + using in_data_t = half_t; + using acc_data_t = float; + using out_data_t = 
half_t; +#elif 1 + using in_data_t = int8_t; + using acc_data_t = int32_t; + using out_data_t = int8_t; +#endif + + std::vector in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5), + bias_lengths_host(2); + + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(C0); + in_lengths_host[2] = static_cast(Hi); + in_lengths_host[3] = static_cast(Wi); + in_lengths_host[4] = static_cast(C1); + + wei_lengths_host[0] = static_cast(K0 * K1); + wei_lengths_host[1] = static_cast(C0); + wei_lengths_host[2] = static_cast(Y); + wei_lengths_host[3] = static_cast(X); + wei_lengths_host[4] = static_cast(C1); + + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(K0); + out_lengths_host[2] = static_cast(Ho); + out_lengths_host[3] = static_cast(Wo); + out_lengths_host[4] = static_cast(K1); + + bias_lengths_host[0] = static_cast(K0); + bias_lengths_host[1] = static_cast(K1); + + Tensor in(in_lengths_host); + Tensor wei(wei_lengths_host); + Tensor bias(bias_lengths_host); + Tensor out_host(out_lengths_host); + Tensor out_device(out_lengths_host); + + ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); + ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); + ostream_HostTensorDescriptor(bias.mDesc, std::cout << "bias: "); + ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: "); + + print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); + print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); + print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); + print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: + // no initialization + break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 2: 
+ in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 3: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 4: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 5: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + bias.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...); + }; + wei.GenerateTensorValue(gen_wei, num_thread); + } + + auto f_make_for_device_nchwc = [&]() { + const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1); + const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1); + const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1); + const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w); + + return make_tuple(in_lengths_dev, + wei_lengths_dev, + out_lengths_dev, + conv_strides_dev, + conv_dilations_dev, + in_left_pads_dev, + in_right_pads_dev); + }; + +#if USE_CONV_FWD_V5R1_NCHWC + if(algo == ConvForwardAlgo::V5R1NCHWC) + { + const auto tmp = f_make_for_device_nchwc(); + + device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1( + 
tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + bias, + out_device, + nrepeat); + } +#endif + + if(do_verification) + { + host_direct_convolution_nchwc(in, + wei, + bias, + out_host, + make_tuple(conv_stride_h, conv_stride_w), + make_tuple(conv_dilation_h, conv_dilation_w), + make_tuple(in_left_pad_h, in_left_pad_w), + make_tuple(in_right_pad_h, in_right_pad_w), + activ_type); + + check_error(out_host, out_device); + + if(do_log) + { + LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; + LogRangeAsType(std::cout << "wei: ", wei.mData, ",") << std::endl; + LogRangeAsType(std::cout << "bias: ", bias.mData, ",") << std::endl; + LogRangeAsType(std::cout << "out_host : ", out_host.mData, ",") << std::endl; + LogRangeAsType(std::cout << "out_device: ", out_device.mData, ",") << std::endl; + } + } +} diff --git a/host/driver_offline/src/conv_maxpool_fwd_driver_offline_nchwc.cpp b/host/driver_offline/src/conv_maxpool_fwd_driver_offline_nchwc.cpp new file mode 100644 index 0000000000..d8a22bda33 --- /dev/null +++ b/host/driver_offline/src/conv_maxpool_fwd_driver_offline_nchwc.cpp @@ -0,0 +1,413 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "debug.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "conv_common.hpp" +#include "device_tensor.hpp" +#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" + +#define USE_DYNAMIC_MODE 0 +#define USE_CONV_FWD_V5R1_NCHWC 1 + +enum ConvForwardAlgo +{ + V5R1NCHWC // 0 +}; + +template +void host_direct_convolution_maxpool_nchwc(const Tensor& in, + const Tensor& wei, + const Tensor& bias, + Tensor& out_host, + Tensor& max_host, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads&, + const ck::ActivTypeEnum_t activ_type) +{ + using namespace 
ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) { + double v = 0; + auto k = k0 * out_host.mDesc.GetLengths()[4] + k1; + + for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0) + { + for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) + { + int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0]; + for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) + { + int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1]; + if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && + wi < in.mDesc.GetLengths()[3]) + { + for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1) + { + v += static_cast(in(n, c0, hi, wi, c1)) * + static_cast(wei(k, c0, y, x, c1)); + } + } + } + } + } + + v += bias(k0, k1); + v = activ(v, activ_type); + + out_host(n, k0, ho, wo, k1) = v; + }; + + make_ParallelTensorFunctor(f_nchw, + out_host.mDesc.GetLengths()[0], + out_host.mDesc.GetLengths()[1], + out_host.mDesc.GetLengths()[2], + out_host.mDesc.GetLengths()[3], + out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency()); + + auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) { + auto hx = ho * 2; + auto wx = wo * 2; + + auto v0 = out_host(n, k0, hx, wx, k1); + auto v1 = out_host(n, k0, hx, wx + 1, k1); + auto v2 = out_host(n, k0, hx + 1, wx, k1); + auto v3 = out_host(n, k0, hx + 1, wx + 1, k1); + + max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3}); + }; + + make_ParallelTensorFunctor(maxpool_nchw, + max_host.mDesc.GetLengths()[0], + max_host.mDesc.GetLengths()[1], + max_host.mDesc.GetLengths()[2], + max_host.mDesc.GetLengths()[3], + max_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency()); +} + +int main(int argc, char* argv[]) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; 
+ constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + +#if USE_DYNAMIC_MODE + // dynamic mode + if(argc != 23) + { + printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n"); + printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(1); + } + + constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + + const ConvForwardAlgo algo = static_cast(std::stoi(argv[1])); + const bool do_verification = std::stoi(argv[2]); + const int init_method = std::stoi(argv[3]); + const bool do_log = std::stoi(argv[4]); + const int nrepeat = std::stoi(argv[5]); + + const index_t N = std::stoi(argv[6]); + const index_t K0 = std::stoi(argv[7]); + const index_t K1 = std::stoi(argv[8]); + const index_t C0 = std::stoi(argv[9]); + const index_t C1 = std::stoi(argv[10]); + const index_t Y = std::stoi(argv[11]); + const index_t X = std::stoi(argv[12]); + const index_t Hi = std::stoi(argv[13]); + const index_t Wi = std::stoi(argv[14]); + + const index_t conv_stride_h = std::stoi(argv[15]); + const index_t conv_stride_w = std::stoi(argv[16]); + const index_t conv_dilation_h = std::stoi(argv[17]); + const index_t conv_dilation_w = std::stoi(argv[18]); + const index_t in_left_pad_h = std::stoi(argv[19]); + const index_t in_left_pad_w = std::stoi(argv[20]); + const index_t in_right_pad_h = std::stoi(argv[21]); + const index_t in_right_pad_w = std::stoi(argv[22]); + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + + const index_t Ho_2 = Ho / 2; + const index_t Wo_2 = Wo / 2; +#else + // static mode + if(argc < 6) + { + printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n"); + exit(1); + } + + const 
ConvForwardAlgo algo = static_cast(std::stoi(argv[1])); + + const bool do_verification = std::stoi(argv[2]); + const int init_method = std::stoi(argv[3]); + const bool do_log = std::stoi(argv[4]); + const int nrepeat = std::stoi(argv[5]); + + constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + +#if 1 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<1080>{}; + constexpr auto Wi = Number<1920>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#elif 0 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<1080>{}; + constexpr auto Wi = Number<1920>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<3>{}; + constexpr auto C1 = Number<4>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#elif 0 + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<540>{}; + constexpr auto Wi = Number<960>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#elif 0 + constexpr auto N = Number<128>{}; + constexpr auto Hi = Number<270>{}; + constexpr auto Wi = Number<480>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; +#endif + + constexpr auto conv_stride_h = I1; + constexpr auto conv_stride_w = I1; + constexpr auto conv_dilation_h = I1; + constexpr auto conv_dilation_w = I1; + constexpr auto in_left_pad_h = I1; + constexpr auto in_left_pad_w = I1; + constexpr auto in_right_pad_h = I1; + constexpr auto in_right_pad_w = I1; + + constexpr auto YEff = (Y - I1) * conv_dilation_h + I1; + 
constexpr auto XEff = (X - I1) * conv_dilation_w + I1; + + constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1; + constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1; + + constexpr auto Ho_2 = Number{}; + constexpr auto Wo_2 = Number{}; + +#endif + +#if 0 + using in_data_t = float; + using acc_data_t = float; + using out_data_t = float; +#elif 1 + using in_data_t = half_t; + using acc_data_t = float; + using out_data_t = half_t; +#elif 1 + using in_data_t = int8_t; + using acc_data_t = int32_t; + using out_data_t = int8_t; +#endif + + std::vector in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5), + max_lengths_host(5), bias_lengths_host(2); + + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(C0); + in_lengths_host[2] = static_cast(Hi); + in_lengths_host[3] = static_cast(Wi); + in_lengths_host[4] = static_cast(C1); + + wei_lengths_host[0] = static_cast(K0 * K1); + wei_lengths_host[1] = static_cast(C0); + wei_lengths_host[2] = static_cast(Y); + wei_lengths_host[3] = static_cast(X); + wei_lengths_host[4] = static_cast(C1); + + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(K0); + out_lengths_host[2] = static_cast(Ho); + out_lengths_host[3] = static_cast(Wo); + out_lengths_host[4] = static_cast(K1); + + max_lengths_host[0] = static_cast(N); + max_lengths_host[1] = static_cast(K0); + max_lengths_host[2] = static_cast(Ho_2); + max_lengths_host[3] = static_cast(Wo_2); + max_lengths_host[4] = static_cast(K1); + + bias_lengths_host[0] = static_cast(K0); + bias_lengths_host[1] = static_cast(K1); + + Tensor in(in_lengths_host); + Tensor wei(wei_lengths_host); + Tensor bias(bias_lengths_host); + Tensor out_device(out_lengths_host); + Tensor out_host(out_lengths_host); + Tensor max_device(max_lengths_host); + Tensor max_host(max_lengths_host); + + ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); + ostream_HostTensorDescriptor(wei.mDesc, 
std::cout << "wei: "); + + print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); + print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); + print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); + print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: + // no initialization + break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 3: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 4: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 5: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) 
* GeneratorTensor_Checkboard{}(is...); + }; + wei.GenerateTensorValue(gen_wei, num_thread); + } + + bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + + auto f_make_for_device_nchwc = [&]() { + const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1); + const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1); + const auto max_lengths_dev = make_tuple(N, K0, Ho_2, Wo_2, K1); + const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1); + const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w); + + return make_tuple(in_lengths_dev, + wei_lengths_dev, + max_lengths_dev, + out_lengths_dev, + conv_strides_dev, + conv_dilations_dev, + in_left_pads_dev, + in_right_pads_dev); + }; + +#if USE_CONV_FWD_V5R1_NCHWC + if(algo == ConvForwardAlgo::V5R1NCHWC) + { + const auto tmp = f_make_for_device_nchwc(); + + device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1< + in_data_t, + acc_data_t, + out_data_t, + activ_type>(tmp[I0], // in_lengths_dev + tmp[I1], // wei_lengths_dev + tmp[I2], // max_lengths_dev + tmp[I3], // out_lengths_dev + tmp[I4], // conv_strides_dev + tmp[I5], // conv_dilations_dev + tmp[I6], // in_left_pads_dev + tmp[I7], // in_right_pads_dev + in, + wei, + bias, + out_device, + max_device, + nrepeat); + } +#endif + + if(do_verification) + { + host_direct_convolution_maxpool_nchwc(in, + wei, + bias, + out_host, + max_host, + make_tuple(conv_stride_h, conv_stride_w), + make_tuple(conv_dilation_h, conv_dilation_w), + make_tuple(in_left_pad_h, in_left_pad_w), + make_tuple(in_right_pad_h, in_right_pad_w), + activ_type); + + check_error(out_host, out_device); + check_error(max_host, max_device); + + if(do_log) + { + // LogRangeAsType(std::cout << "in : ", in.mData, ",") << 
std::endl; + // LogRangeAsType(std::cout << "wei: ", wei.mData, ",") << std::endl; + // LogRangeAsType(std::cout << "out_device: ", out_device.mData, ",") << + // std::endl; + LogRangeAsType(std::cout << "max_host: ", max_host.mData, ",") << std::endl; + LogRangeAsType(std::cout << "max_device: ", max_device.mData, ",") << std::endl; + } + } +} diff --git a/host/host_tensor/include/conv_common.hpp b/host/host_tensor/include/conv_common.hpp index bd336aae12..8c11abda49 100644 --- a/host/host_tensor/include/conv_common.hpp +++ b/host/host_tensor/include/conv_common.hpp @@ -74,4 +74,17 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes return std::size_t(2) * N * K * Ho * Wo * C * Y * X; } +template +inline auto activ(T v, const ck::ActivTypeEnum_t activ_type) +{ + const T alpha = 0.3; + switch(activ_type) + { + case ck::ActivTypeEnum_t::None: return v; + case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v); + case ck::ActivTypeEnum_t::Sigmoid: return (1 / (1 + exp(-v))); + default: throw std::runtime_error("unsupported activ type"); break; + } +} + #endif diff --git a/host/host_tensor/include/host_tensor.hpp b/host/host_tensor/include/host_tensor.hpp index ae30426913..180e724c2d 100644 --- a/host/host_tensor/include/host_tensor.hpp +++ b/host/host_tensor/include/host_tensor.hpp @@ -257,6 +257,18 @@ struct Tensor mDesc.GetLengths()[3])(num_thread); break; } + case 5: { + auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) { + (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4); + }; + make_ParallelTensorFunctor(f, + mDesc.GetLengths()[0], + mDesc.GetLengths()[1], + mDesc.GetLengths()[2], + mDesc.GetLengths()[3], + mDesc.GetLengths()[4])(num_thread); + break; + } default: throw std::runtime_error("unspported dimension"); } }