diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 2fc66819d3..abd771285d 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -140,7 +140,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_N = 2; -#elif 1 +#elif 0 // for 3x3, 34x34, v1r3, Pascal // for 3x3, 28x28, v1r3, Pascal // for 3x3, 14x14, v1r3, Pascal @@ -206,6 +206,8 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t OutThreadCopyDataPerWrite_N = 1; #elif 0 // for 3x3, 34x34, v1r1, Vega 20 + constexpr index_t BlockSize = 256; + constexpr index_t NPerBlock = 16; constexpr index_t KPerBlock = 128; constexpr index_t CPerBlock = 4; @@ -227,16 +229,43 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - constexpr index_t InBlockCopy_ThreadPerDimC = 4; - constexpr index_t InBlockCopy_ThreadPerDimH = 4; - constexpr index_t InBlockCopy_ThreadPerDimW = 2; - constexpr index_t InBlockCopy_ThreadPerDimN = 8; - constexpr index_t InBlockCopyDataPerRead_N = 2; + using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 8>; + constexpr index_t InBlockCopyDataPerRead_N = 2; + + constexpr index_t WeiBlockCopyDataPerRead_K = 2; - constexpr index_t WeiBlockCopyDataPerRead_K = 2; constexpr index_t OutThreadCopyDataPerWrite_N = 4; - +#elif 1 + // for 3x3, 34x34, v1r3, Vega 20 constexpr index_t BlockSize = 256; + + constexpr index_t NPerBlock = 16; + constexpr index_t KPerBlock = 128; + constexpr index_t CPerBlock = 8; + constexpr index_t HoPerBlock = 2; + constexpr index_t WoPerBlock = 4; + + constexpr index_t NPerThread = 4; + constexpr index_t KPerThread = 8; + constexpr index_t HoPerThread = 1; + constexpr index_t WoPerThread = 2; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 2; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 4, 4>; + constexpr index_t InBlockCopyDataPerRead_N = 4; + + constexpr index_t WeiBlockCopyDataPerRead_K = 4; + + constexpr index_t OutThreadCopyDataPerWrite_N = 4; #elif 0 // for 3x3, 56x56, v1r1, Pascal constexpr index_t NPerBlock = 32; @@ -448,7 +477,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn #elif 1 GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn -#elif 0 +#elif 1 GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn #endif & in_nchw, std::size_t ho = HoPerTile * htile + j; for(int i = 0; i < WoPerTile; ++i) { - std::size_t wo = WoPerTile * wtile + i; + std::size_t wo = WoPerTile * wtile + i; out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } @@ -425,13 +425,13 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 3x3, 56x56 - constexpr index_t N = 64; - constexpr index_t C = 64; + constexpr index_t N = 64; + constexpr index_t C = 64; constexpr index_t HI = 56; constexpr index_t WI = 56; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; constexpr index_t HPad = 0; constexpr index_t WPad = 0; @@ -603,9 +603,9 @@ int main(int argc, char* argv[]) device_direct_convolution_2_nchw_kcyx_nkhw #elif 0 device_direct_convolution_2_vectorized_nchw_kcyx_nkhw -#elif 0 +#elif 1 device_convolution_implicit_gemm_v1_chwn_cyxk_khwn -#elif 0 +#elif 1 device_convolution_implicit_gemm_v1_nchw_cyxk_khwn #elif 1 device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 248a4a8974..aad038020a 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -116,11 +116,7 @@ struct ConstantTensorDescriptor static_for<0, nDim, 1>{}([&](auto IDim) { constexpr index_t idim = IDim.Get(); -#if DEVICE_BACKEND_HIP - id += __mul24(multi_id[idim], GetStride(IDim)); -#else id += multi_id[idim] * GetStride(IDim); -#endif }); return id; diff --git a/src/include/blockwise_3d_tensor_op.hip.hpp b/src/include/blockwise_3d_tensor_op.hip.hpp index 9e73be106d..f82dec7f7b 100644 --- a/src/include/blockwise_3d_tensor_op.hip.hpp +++ b/src/include/blockwise_3d_tensor_op.hip.hpp @@ -213,7 +213,6 @@ struct Blockwise3dTensorCopy3 #pragma unroll for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2) { -#pragma unroll const index_t src_offset = SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, iloop_d1 * thread_per_d1, diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 2a52eeeac8..14a8c3d9de 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -341,10 +341,11 @@ struct BlockwiseChwnTensorCopyPadded constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; const Float* p_src_tmp = - p_src + src_desc.Get1dIndex(c_block_data_begin, - (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, - (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, - n_block_data_begin); + p_src + + src_desc.Get1dIndex(c_block_data_begin, + (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, + (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, + n_block_data_begin); #if 0 if(get_thread_local_1d_id() == 0) diff --git a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp index 3926e5905d..55c41c4417 100644 --- a/src/include/blockwise_batched_gemm.hip.hpp +++ b/src/include/blockwise_batched_gemm.hip.hpp @@ -404,8 +404,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 { threadwise_matrix_copy( c_thread_sub_mtx, - p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, - n_repeat * NPerLevel1Cluster), + p_c_thread + + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, + n_repeat * NPerLevel1Cluster), c_block_mtx, p_c_block + c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, diff --git a/src/include/blockwise_direct_convolution.hip.hpp b/src/include/blockwise_direct_convolution.hip.hpp index 5f1665aabd..c79833f17d 100644 --- a/src/include/blockwise_direct_convolution.hip.hpp +++ b/src/include/blockwise_direct_convolution.hip.hpp @@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, Float p_out_thread[out_thread_desc.GetElementSpace()]; threadwise_4d_tensor_copy(out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc, p_out_thread, out_thread_desc.GetLengths()); @@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, // threadwise convolution threadwise_direct_convolution_2( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data_begin, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data_begin, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), @@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, threadwise_4d_tensor_copy(out_thread_desc, p_out_thread, out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc.GetLengths()); } } diff --git a/src/include/blockwise_nd_tensor_op.hip.hpp b/src/include/blockwise_nd_tensor_op.hip.hpp index c2d7722454..0eeacfe940 100644 --- a/src/include/blockwise_nd_tensor_op.hip.hpp +++ b/src/include/blockwise_nd_tensor_op.hip.hpp @@ -56,7 +56,7 @@ struct BlockwiseNdTensorCopyReorder_v3 "wrong! BlockSize is not big enough for ThreadPerDims!"); // sanity check: work division - static_for<0, nDim, 1>{}([](auto IDim) { + static_for<0, nDim, 1>{}([&](auto IDim) { constexpr auto I = decltype(IDim){}; constexpr index_t src_len = src_lengths.Get(I); constexpr index_t src_sub_len = src_sub_lengths.Get(I); @@ -220,7 +220,7 @@ struct BlockwiseNdTensorCopyReorder_v3 constexpr index_t dst_offset = DstDesc{}.Get1dIndex(dst_data_multi_id); - // write in the order of dst +// write in the order of dst #if 1 threadwise_nd_tensor_copy_reorder_given_dst2src_v2(thread_tensor_desc, p_clipboard + clipboard_offset, diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp index 97b2984d1d..893343fb08 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp @@ -43,11 +43,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerBlock % NPerThread == 0 && (GemmNPerThreadSubC <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0), - "wrong!"); + static_assert( + NPerBlock % NPerThread == 0 && + ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || + (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0)), + "wrong!"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -219,8 +220,9 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); const Float* p_in_global_block_offset = - p_in_global + in_c_h_w_n_global_desc.Get1dIndex( - 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); + p_in_global + + in_c_h_w_n_global_desc.Get1dIndex( + 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -275,39 +277,40 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; - static_if{}( - [&](auto f_dummy) { // f_dummy do nothing but perfect forwarding. Using this trick to - // make this lambda a generic lambda, so it won't be compiled until - // instantiated - static_assert((f_dummy(GemmNPerThreadSubC) <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0), - "wrong!"); + static_if{}([&](auto f_dummy) { // f_dummy do nothing but + // perfect forwarding. + // Using this trick to + // make this lambda a generic lambda, so it won't be compiled until + // instantiated + static_assert( + (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), + "wrong!"); - // output is a 10d tensor - constexpr index_t N2 = GemmNPerThreadSubC; - constexpr index_t N1 = NPerBlock / N2; + // output is a 10d tensor + constexpr index_t N2 = GemmNPerThreadSubC; + constexpr index_t N1 = NPerBlock / N2; - constexpr index_t W2 = (GemmNLevel0Cluster * GemmNLevel1Cluster) / - f_dummy(NPerBlock / GemmNPerThreadSubC); - constexpr index_t W1 = WoPerBlock / W2; + constexpr index_t W2 = + (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC); + constexpr index_t W1 = WoPerBlock / W2; - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -322,47 +325,37 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }) - .else_([&](auto f_dummy) { - static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0, - "wrong!"); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }).else_([&](auto f_dummy) { + static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0, + "wrong!"); - // output is a 10d tensor - constexpr index_t N1 = NPerBlock; + // output is a 10d tensor + constexpr index_t N1 = NPerBlock; - constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; - constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); + constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; + constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; + constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( + Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -382,17 +375,17 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp index 2a6e985cdb..531514e311 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp @@ -44,11 +44,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerBlock % NPerThread == 0 && (GemmNPerThreadSubC <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0), - "wrong!"); + static_assert( + NPerBlock % NPerThread == 0 && + ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || + (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0)), + "wrong!"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -125,8 +126,8 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( Sequence{}); - // blockwise copy - // input: format is [C, Hi, Wi, N] +// blockwise copy +// input: format is [C, Hi, Wi, N] #if 1 const auto blockwise_in_copy = Blockwise4dTensorCopy1{}( - [&](auto f_dummy) { // f_dummy do nothing but perfect forwarding. Using this trick to - // make this lambda a generic lambda, so it won't be compiled until - // instantiated - static_assert((f_dummy(GemmNPerThreadSubC) <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0), - "wrong!"); + static_if{}([&](auto f_dummy) { // f_dummy do nothing but + // perfect forwarding. + // Using this trick to + // make this lambda a generic lambda, so it won't be compiled until + // instantiated + static_assert( + (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), + "wrong!"); - // output is a 10d tensor - constexpr index_t N2 = GemmNPerThreadSubC; - constexpr index_t N1 = NPerBlock / N2; + // output is a 10d tensor + constexpr index_t N2 = GemmNPerThreadSubC; + constexpr index_t N1 = NPerBlock / N2; - constexpr index_t W2 = (GemmNLevel0Cluster * GemmNLevel1Cluster) / - f_dummy(NPerBlock / GemmNPerThreadSubC); - constexpr index_t W1 = WoPerBlock / W2; + constexpr index_t W2 = + (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC); + constexpr index_t W1 = WoPerBlock / W2; - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -355,47 +358,37 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }) - .else_([&](auto f_dummy) { - static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0, - "wrong!"); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }).else_([&](auto f_dummy) { + static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0, + "wrong!"); - // output is a 10d tensor - constexpr index_t N1 = NPerBlock; + // output is a 10d tensor + constexpr index_t N1 = NPerBlock; - constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; - constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); + constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; + constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; + constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( + Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -415,17 +408,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp index bc71b6daba..5a9c29bfe2 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp @@ -49,8 +49,11 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn { // be careful of this assertion static_assert( - NPerThread <= NPerBlock && NPerBlock % NPerThread == 0, - "wrong! should satisfy: NPerThread <= NPerBlock && NPerBlock % NPerThread == 0"); + NPerBlock % NPerThread == 0 && + ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || + (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0)), + "wrong!"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -262,12 +265,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn const Float* p_wei_global_block_offset = p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, k_block_data_begin); - for(index_t c_block_data_begin = 0; c_block_data_begin < C; + for(index_t + c_block_data_begin = 0; + c_block_data_begin < C; c_block_data_begin += CPerBlock, - p_in_global_block_offset += - CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), - p_wei_global_block_offset += - CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) + p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), + p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) { Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; @@ -333,15 +336,16 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn } #endif - threadwise_10d_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_10d_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp index d71ee639a2..669ba69deb 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp @@ -43,11 +43,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerBlock % NPerThread == 0 && (GemmNPerThreadSubC <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0), - "wrong!"); + static_assert( + NPerBlock % NPerThread == 0 && + ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || + (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0)), + "wrong!"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -212,10 +213,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn // set threadwise output tensor to 0 threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); -#if 1 +#if 0 const Float* p_in_global_block_offset = - p_in_global + in_c_h_w_n_global_desc.Get1dIndex( - 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); + p_in_global + + in_c_h_w_n_global_desc.Get1dIndex( + 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -226,6 +228,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn { for(index_t y = 0; y < Y; ++y) { +#pragma unroll for(index_t x = 0; x < X; ++x) { blockwise_in_copy.Run(p_in_global_block_offset + @@ -287,39 +290,40 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; - static_if{}( - [&](auto f_dummy) { // f_dummy do nothing but perfect forwarding. Using this trick to - // make this lambda a generic lambda, so it won't be compiled until - // instantiated - static_assert((f_dummy(GemmNPerThreadSubC) <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0), - "wrong!"); + static_if{}([&](auto f_dummy) { // f_dummy do nothing but + // perfect forwarding. + // Using this trick to + // make this lambda a generic lambda, so it won't be compiled until + // instantiated + static_assert( + (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), + "wrong!"); - // output is a 10d tensor - constexpr index_t N2 = GemmNPerThreadSubC; - constexpr index_t N1 = NPerBlock / N2; + // output is a 10d tensor + constexpr index_t N2 = GemmNPerThreadSubC; + constexpr index_t N1 = NPerBlock / N2; - constexpr index_t W2 = (GemmNLevel0Cluster * GemmNLevel1Cluster) / - f_dummy(NPerBlock / GemmNPerThreadSubC); - constexpr index_t W1 = WoPerBlock / W2; + constexpr index_t W2 = + (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC); + constexpr index_t W1 = WoPerBlock / W2; - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -334,47 +338,37 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }) - .else_([&](auto f_dummy) { - static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0, - "wrong!"); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }).else_([&](auto f_dummy) { + static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0, + "wrong!"); - // output is a 10d tensor - constexpr index_t N1 = NPerBlock; + // output is a 10d tensor + constexpr index_t N1 = NPerBlock; - constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; - constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); + constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; + constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; + constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( + Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -394,17 +388,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp index 7a757a6673..640683fe1e 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp @@ -43,11 +43,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerBlock % NPerThread == 0 && (GemmNPerThreadSubC <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0), - "wrong!"); + static_assert( + NPerBlock % NPerThread == 0 && + ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || + (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0)), + "wrong!"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -127,8 +128,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( Sequence{}); - // blockwise copy - // input: format is [C, Hi, Wi, N] +// blockwise copy +// input: format is [C, Hi, Wi, N] #if 0 const auto blockwise_in_copy = Blockwise4dTensorCopy1{}( - [&](auto f_dummy) { // f_dummy do nothing but perfect forwarding. Using this trick to - // make this lambda a generic lambda, so it won't be compiled until - // instantiated - static_assert((f_dummy(GemmNPerThreadSubC) <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0), - "wrong!"); + static_if{}([&](auto f_dummy) { // f_dummy do nothing but + // perfect forwarding. + // Using this trick to + // make this lambda a generic lambda, so it won't be compiled until + // instantiated + static_assert( + (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), + "wrong!"); - // output is a 10d tensor - constexpr index_t N2 = GemmNPerThreadSubC; - constexpr index_t N1 = NPerBlock / N2; + // output is a 10d tensor + constexpr index_t N2 = GemmNPerThreadSubC; + constexpr index_t N1 = NPerBlock / N2; - constexpr index_t W2 = (GemmNLevel0Cluster * GemmNLevel1Cluster) / - f_dummy(NPerBlock / GemmNPerThreadSubC); - constexpr index_t W1 = WoPerBlock / W2; + constexpr index_t W2 = + (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC); + constexpr index_t W1 = WoPerBlock / W2; - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -396,47 +398,37 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }) - .else_([&](auto f_dummy) { - static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0, - "wrong!"); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }).else_([&](auto f_dummy) { + static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0, + "wrong!"); - // output is a 10d tensor - constexpr index_t N1 = NPerBlock; + // output is a 10d tensor + constexpr index_t N1 = NPerBlock; - constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; - constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); + constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; + constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; + constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( + Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -456,17 +448,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp index b83a61147a..c8bd2efc35 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp @@ -47,11 +47,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerBlock % NPerThread == 0 && (GemmNPerThreadSubC <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0), - "wrong!"); + static_assert( + NPerBlock % NPerThread == 0 && + ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || + (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0)), + "wrong!"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -349,39 +350,40 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; - static_if{}( - [&](auto f_dummy) { // f_dummy do nothing but perfect forwarding. Using this trick to - // make this lambda a generic lambda, so it won't be compiled until - // instantiated - static_assert((f_dummy(GemmNPerThreadSubC) <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0), - "wrong!"); + static_if{}([&](auto f_dummy) { // f_dummy do nothing but + // perfect forwarding. + // Using this trick to + // make this lambda a generic lambda, so it won't be compiled until + // instantiated + static_assert( + (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), + "wrong!"); - // output is a 10d tensor - constexpr index_t N2 = GemmNPerThreadSubC; - constexpr index_t N1 = NPerBlock / N2; + // output is a 10d tensor + constexpr index_t N2 = GemmNPerThreadSubC; + constexpr index_t N1 = NPerBlock / N2; - constexpr index_t W2 = (GemmNLevel0Cluster * GemmNLevel1Cluster) / - f_dummy(NPerBlock / GemmNPerThreadSubC); - constexpr index_t W1 = WoPerBlock / W2; + constexpr index_t W2 = + (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC); + constexpr index_t W1 = WoPerBlock / W2; - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -396,47 +398,37 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }) - .else_([&](auto f_dummy) { - static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0, - "wrong!"); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }).else_([&](auto f_dummy) { + static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0, + "wrong!"); - // output is a 10d tensor - constexpr index_t N1 = NPerBlock; + // output is a 10d tensor + constexpr index_t N1 = NPerBlock; - constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; - constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); + constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; + constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; + constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( + Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -456,17 +448,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp index 8337bebf69..062d3c4540 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp @@ -47,11 +47,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerBlock % NPerThread == 0 && (GemmNPerThreadSubC <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0), - "wrong!"); + static_assert( + NPerBlock % NPerThread == 0 && + ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || + (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0)), + "wrong!"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -223,8 +224,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn #if 1 const Float* p_in_global_block_offset = - p_in_global + in_n_c_h_w_global_desc.Get1dIndex( - n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); + p_in_global + + in_n_c_h_w_global_desc.Get1dIndex( + n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); const Float* p_wei_global_block_offset = p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -329,39 +331,40 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; - static_if{}( - [&](auto f_dummy) { // f_dummy do nothing but perfect forwarding. Using this trick to - // make this lambda a generic lambda, so it won't be compiled until - // instantiated - static_assert((f_dummy(GemmNPerThreadSubC) <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0), - "wrong!"); + static_if{}([&](auto f_dummy) { // f_dummy do nothing but + // perfect forwarding. + // Using this trick to + // make this lambda a generic lambda, so it won't be compiled until + // instantiated + static_assert( + (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), + "wrong!"); - // output is a 10d tensor - constexpr index_t N2 = GemmNPerThreadSubC; - constexpr index_t N1 = NPerBlock / N2; + // output is a 10d tensor + constexpr index_t N2 = GemmNPerThreadSubC; + constexpr index_t N1 = NPerBlock / N2; - constexpr index_t W2 = (GemmNLevel0Cluster * GemmNLevel1Cluster) / - f_dummy(NPerBlock / GemmNPerThreadSubC); - constexpr index_t W1 = WoPerBlock / W2; + constexpr index_t W2 = + (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC); + constexpr index_t W1 = WoPerBlock / W2; - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -376,47 +379,37 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }) - .else_([&](auto f_dummy) { - static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0, - "wrong!"); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }).else_([&](auto f_dummy) { + static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0, + "wrong!"); - // output is a 10d tensor - constexpr index_t N1 = NPerBlock; + // output is a 10d tensor + constexpr index_t N1 = NPerBlock; - constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; - constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); + constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; + constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; + constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( + Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -436,17 +429,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn } #endif - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }); + threadwise_nd_tensor_copy( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp index f5c6df8644..445512922a 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp @@ -47,11 +47,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerBlock % NPerThread == 0 && (GemmNPerThreadSubC <= NPerBlock && - NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0), - "wrong!"); + static_assert( + NPerBlock % NPerThread == 0 && + ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || + (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0)), + "wrong!"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -223,8 +224,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw #if 1 const Float* p_in_global_block_offset = - p_in_global + in_n_c_h_w_global_desc.Get1dIndex( - n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); + p_in_global + + in_n_c_h_w_global_desc.Get1dIndex( + n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); const Float* p_wei_global_block_offset = p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -409,14 +411,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw out_10d_thread_desc, p_out_thread, out_10d_global_desc, - p_out_global + out_n_k_h_w_global_desc.Get1dIndex( - n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_n_k_h_w_global_desc.Get1dIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), map_out_global2thread); - // Number{}); +// Number{}); #endif }) .else_([&](auto f_dummy) { @@ -500,14 +503,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw out_10d_thread_desc, p_out_thread, out_10d_global_desc, - p_out_global + out_n_k_h_w_global_desc.Get1dIndex( - n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_n_k_h_w_global_desc.Get1dIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), map_out_global2thread); - // Number{}); +// Number{}); #endif }); } diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 2e6fdcf049..90370b7b4f 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -365,13 +365,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer constexpr auto out_kb_global_desc = make_ConstantTensorDescriptor(Sequence{}); - threadwise_6d_tensor_copy(out_6d_thread_desc, - p_out_thread, - out_6d_global_desc, - p_out_global + out_kb_global_desc.Get1dIndex( - k_thread_data_begin, b_thread_data_begin), - out_6d_thread_desc.GetLengths(), - Number{}); + threadwise_6d_tensor_copy( + out_6d_thread_desc, + p_out_thread, + out_6d_global_desc, + p_out_global + + out_kb_global_desc.Get1dIndex(k_thread_data_begin, b_thread_data_begin), + out_6d_thread_desc.GetLengths(), + Number{}); } else { diff --git a/src/include/gridwise_direct_convolution_1.hip.hpp b/src/include/gridwise_direct_convolution_1.hip.hpp index 29c7e86b37..7723fb78b4 100644 --- a/src/include/gridwise_direct_convolution_1.hip.hpp +++ b/src/include/gridwise_direct_convolution_1.hip.hpp @@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ c_block_work_begin += CPerBlock) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, - c_block_work_begin, - hi_block_work_begin, - wi_block_work_begin), + blockwise_in_copy.Run(p_in_global + + in_global_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), p_in_block); // copy weight tensor to LDS @@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ } // copy output tensor from LDS to device mem - blockwise_out_copy.Run(p_out_block, - p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, - k_block_work_begin, - ho_block_work_begin, - wo_block_work_begin)); + blockwise_out_copy.Run( + p_out_block, + p_out_global + + out_global_desc.Get1dIndex( + n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); } diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index c585c4495b..cbebe28f17 100644 --- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -175,16 +175,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( - k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run( + p_wei_global + + wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -194,10 +196,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #if 1 threadwise_direct_convolution_2( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -206,10 +209,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #elif 0 threadwise_direct_convolution_3( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -224,9 +228,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index df8c53f107..0b83eccc3c 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -198,9 +198,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( p_in_vec_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex( - k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_vec_block); + blockwise_wei_copy.Run( + p_wei_vec_global + + wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_vec_block); __syncthreads(); @@ -210,10 +211,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #if 1 threadwise_direct_convolution_2( in_nchw_vec_thread_block_desc, - p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -222,10 +224,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #elif 0 threadwise_direct_convolution_3( in_nchw_vec_thread_block_desc, - p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -240,9 +243,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp index fe1ee2191f..f04a283fcf 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp @@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( out_hkwn_thread_desc, p_out_thread, out_khwn_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), + p_out_global + + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_khwn_from_hkwn); } diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index d0c785c16e..1ebfef0c5d 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim) return os; } -typedef enum -{ +typedef enum { Half = 0, Float = 1, } DataType_t;