diff --git a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index d91757dc8f..2fc2264f6d 100644 --- a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" +//#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" template @@ -47,6 +47,9 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; #endif @@ -59,7 +62,7 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { float time = launch_kernel( -#if 0 +#if 0 gridwise_direct_convolution_2_nchw_kcyx_nkhw #else gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw @@ -78,6 +81,8 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, CPerThread, HoPerThread, WoPerThread, + InBlockCopyDataPerRead, + WeiBlockCopyDataPerRead, BlockSize, GridSize>, dim3(GridSize), diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 6cd75afd79..a952b95380 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -7,11 +7,11 @@ #include "tensor.hpp" #include "ConstantTensorDescriptor.hip.hpp" #include "conv_common.hip.hpp" -#include "device_direct_convolution_1.hpp" +//#include "device_direct_convolution_1.hpp" #include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" -#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" -#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" -#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" +//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" +//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" +//#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" struct GeneratorTensor_1 { diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index b54c4d0c5f..761c32a370 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -383,8 +383,9 @@ struct Blockwise2dTensorCopy3 constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; - static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1, - "wrong! only support stride1 == 1!\n"); + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1), + "wrong! only support stride1 == 1 if DataPerRead > 1!\n"); static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, "wrong! only support DataPerRead == 1, 2 or 4!\n"); diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index cc50d9eecd..693b6fe9d5 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -131,11 +131,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); - f(p_src[aindex], p_dst[bindex]); + f(p_src[src_index], p_dst[dst_index]); } constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); @@ -162,11 +162,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); - f(p_src[aindex], p_dst[bindex]); + f(p_src[src_index], p_dst[dst_index]); } } } @@ -199,15 +199,112 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy); } -template +template struct Blockwise4dTensorCopy1 { + using vector_t = typename vector_type::type; + + __device__ void SanityCheck() const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1), + "wrong! only support stride3 == 1 if DataPerRead > 1!\n"); + + static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, + "wrong! only support DataPerRead == 1, 2 or 4!\n"); + + static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 && + DstDesc{}.GetStride(I2) % DataPerRead == 0, + "src and dst stride2 should be multiple of DataPerRead to keep alignment"); + + // we allow out-of-bound read from src in D3 dimension, + // but we need to make sure dst stride2 is big enough, + // so that the out-of-bound write won't contaminate next line in dst + constexpr unsigned L3 = CopyLengths{}.Get(I3); + constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead); + + static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2), + "wrong! out-of-bound write will contaminate next line!\n"); + } + __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; + SanityCheck(); - blockwise_4d_tensor_copy_reorder_by_get_dst_from_src( - SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder); + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + constexpr unsigned L0 = CopyLengths{}.Get(I0); + constexpr unsigned L1 = CopyLengths{}.Get(I1); + constexpr unsigned L2 = CopyLengths{}.Get(I2); + constexpr unsigned L3 = CopyLengths{}.Get(I3); + + constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead); + + constexpr auto ref_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; + + auto f_copy = [&](unsigned is) { + unsigned did[4]; + + did[0] = is / ref_desc.GetStride(I0); + + is -= did[0] * ref_desc.GetStride(I0); + + did[1] = is / ref_desc.GetStride(I1); + + is -= did[1] * ref_desc.GetStride(I1); + + did[2] = is / ref_desc.GetStride(I2); + + is -= did[2] * ref_desc.GetStride(I2); + + did[3] = is / ref_desc.GetStride(I3); + + const unsigned src_index = + src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + const unsigned dst_index = + dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + + *(reinterpret_cast(p_dst + dst_index)) = + *(reinterpret_cast(p_src + src_index)); + }; + + for(unsigned iloop = 0; iloop < NLoop; ++iloop) + { + unsigned is = threadIdx.x + iloop * BlockSize; + + f_copy(is); + } + + constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); + + if(has_tail) + { + unsigned is = threadIdx.x + NLoop * BlockSize; + + if(is < ref_desc.GetElementSize()) + { + f_copy(is); + } + } } }; @@ -361,8 +458,9 @@ struct Blockwise4dTensorCopy3 constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1, - "wrong! only support stride3 == 1!\n"); + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1), + "wrong! only support stride3 == 1 if DataPerRead > 1!\n"); static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, "wrong! only support DataPerRead == 1, 2 or 4!\n"); @@ -370,7 +468,7 @@ struct Blockwise4dTensorCopy3 static_assert( SrcDesc{}.GetStride(I2) % DataPerRead == 0 && DstDesc{}.GetStride(I2) % DataPerRead == 0, - "wrong! src and dst stride should be multiple of DataPerRead to keep alignment"); + "wrong! src and dst stride2 should be multiple of DataPerRead to keep alignment"); constexpr unsigned L0 = CopyLengths{}.Get(I0); constexpr unsigned L1 = CopyLengths{}.Get(I1); diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index cb2a8a5087..8ce097bdc5 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -1,6 +1,7 @@ #pragma once #include "common.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" #include "blockwise_4d_tensor_op.hip.hpp" #include "blockwise_direct_convolution.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp" @@ -20,6 +21,8 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( @@ -32,50 +35,72 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_global_desc = InGlobalDesc{}; - constexpr auto wei_global_desc = WeiGlobalDesc{}; - constexpr auto out_global_desc = OutGlobalDesc{}; + constexpr auto in_nchw_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; - constexpr unsigned Y = wei_global_desc.GetLength(I2); - constexpr unsigned X = wei_global_desc.GetLength(I3); + constexpr unsigned N = in_nchw_global_desc.GetLength(I0); + constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3); + + constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + Sequence{}); // 2d view of wei for blockwise copy constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; constexpr unsigned WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); - constexpr auto wei_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, + Number{}); // 2d view of wei for blockwise copy + + constexpr auto wei_kcyx_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + Sequence{}); // shared mem - constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); - constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); + constexpr unsigned in_block_size = + in_nchw_block_desc.GetElementSpace(Number{}); + constexpr unsigned wei_block_size = + wei_kcyx_block_desc.GetElementSpace(Number{}); - __shared__ Float p_in_block[in_block_size]; - __shared__ Float p_wei_block[wei_block_size]; + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; constexpr unsigned WiPerThread = WoPerThread + X - 1; - constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, in_block_desc.GetStrides()); + constexpr auto in_nchw_thread_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + in_nchw_block_desc.GetStrides()); - constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_block_desc.GetStrides()); + constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_kcyx_block_desc.GetStrides()); - constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor( - in_thread_block_desc, wei_thread_block_desc); + constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); // register - Float p_out_thread[out_thread_desc.GetElementSpace()]; + Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; // divide block work - constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; - constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; - constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; - constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + constexpr unsigned NBlockWork = + (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = + (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = + (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned WBlockWork = + (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; const unsigned block_id = blockIdx.x; @@ -122,34 +147,44 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; + decltype(in_nchw_global_desc), + decltype(in_nchw_block_desc), + decltype(in_nchw_block_desc.GetLengths()), + InBlockCopyDataPerRead>{}; +#if 0 constexpr auto blockwise_wei_copy = Blockwise4dTensorCopy1{}; + decltype(wei_kcyx_global_desc), + decltype(wei_kcyx_block_desc), + decltype(wei_kcyx_block_desc.GetLengths())>{}; +#elif 1 + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; +#endif // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); + threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread); - for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( + k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -158,25 +193,27 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( // threadwise convolution #if 1 threadwise_direct_convolution_2( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #endif } @@ -184,12 +221,12 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( // copy output tensor from register to global mem threadwise_4d_tensor_copy( - out_thread_desc, + out_nkhw_thread_desc, p_out_thread, - out_global_desc, - p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), - out_thread_desc.GetLengths()); + out_nkhw_global_desc, + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_nkhw_thread_desc.GetLengths()); }