From fd8de384170d6100a837b19e37139665c89e2054 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 16 Mar 2019 10:50:46 -0500 Subject: [PATCH 01/13] refactor --- ...e_direct_convolution_2_nchw_kcyx_nkhw.hpp} | 48 +-- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 71 +++-- driver/driver.hip.cpp | 19 +- ...rect_convolution_2_nchw_kcyx_nkhw.hip.hpp} | 7 +- ..._gemm_convolution_2_chwn_cyxk_khwn.hip.hpp | 281 ++++++++++++++++++ ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 4 +- 6 files changed, 359 insertions(+), 71 deletions(-) rename driver/{device_direct_convolution_2.hpp => device_direct_convolution_2_nchw_kcyx_nkhw.hpp} (77%) rename src/include/{gridwise_direct_convolution_2.hip.hpp => gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp} (96%) create mode 100644 src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp diff --git a/driver/device_direct_convolution_2.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp similarity index 77% rename from driver/device_direct_convolution_2.hpp rename to driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index 1baedafc46..602702949e 100644 --- a/driver/device_direct_convolution_2.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -1,16 +1,16 @@ #pragma once #include #include "device.hpp" -#include "gridwise_direct_convolution_2.hip.hpp" +#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" template -void device_direct_convolution_2(InDesc, - const Tensor& in, - WeiDesc, - const Tensor& wei, - OutDesc, - Tensor& out, - unsigned nrepeat) +void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, + const Tensor& in, + WeiDesc, + const Tensor& wei, + OutDesc, + Tensor& out, + unsigned nrepeat) { std::size_t data_sz = sizeof(T); DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); @@ -57,22 +57,22 @@ void device_direct_convolution_2(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = launch_kernel(gridwise_direct_convolution_2, + float time = 
launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw, dim3(GridSize), dim3(BlockSize), static_cast(in_device_buf.GetDeviceBuffer()), diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index a657949f35..c885894165 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -1,6 +1,7 @@ #pragma once #include #include "device.hpp" +#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp" #include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" template @@ -209,39 +210,43 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = - launch_kernel(gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer< - GridSize, - BlockSize, - T, - decltype(in_chwn_desc), - decltype(wei_cyxk_desc), - decltype(out_khwn_desc), - BPerBlock, - KPerBlock, - CPerBlock, - BPerThread, - KPerThread, - GemmThreadPerColumnPerCluster, - GemmThreadPerRowPerCluster, - GemmMPerThreadSubC, - GemmNPerThreadSubC, - GemmMLevel0Cluster, - GemmNLevel0Cluster, - GemmMLevel1Cluster, - GemmNLevel1Cluster, - GemmKPerThreadLoop, - InBlockCopyThreadPerDim0, - InBlockCopyThreadPerDim1, - WeiBlockCopyThreadPerDim0, - WeiBlockCopyThreadPerDim1, - InBlockCopyDataPerRead, - WeiBlockCopyDataPerRead>, - dim3(GridSize), - dim3(BlockSize), - static_cast(in_chwn_device_buf.GetDeviceBuffer()), - static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), - static_cast(out_khwn_device_buf.GetDeviceBuffer())); + float time = launch_kernel( +#if 1 + gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn +#else + gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer +#endif + , + dim3(GridSize), + dim3(BlockSize), + static_cast(in_chwn_device_buf.GetDeviceBuffer()), + static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), + 
static_cast(out_khwn_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index aca345acfd..6cd75afd79 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -8,7 +8,7 @@ #include "ConstantTensorDescriptor.hip.hpp" #include "conv_common.hip.hpp" #include "device_direct_convolution_1.hpp" -#include "device_direct_convolution_2.hpp" +#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" #include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" #include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" #include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" @@ -503,7 +503,7 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 1; constexpr unsigned WPad = 1; -#elif 0 +#elif 1 // 1x1 filter, 28x28 image constexpr unsigned N = 16; constexpr unsigned C = 256; @@ -577,10 +577,11 @@ int main(int argc, char* argv[]) ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); - Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); - Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); - Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); + using Float = float; + Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); + Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); + Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); + Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); std::size_t num_thread = std::thread::hardware_concurrency(); @@ -610,9 +611,9 @@ int main(int argc, char* argv[]) #if 1 #if 0 device_direct_convolution_1 -#elif 0 - device_direct_convolution_2 #elif 1 + device_direct_convolution_2_nchw_kcyx_nkhw +#elif 0 device_implicit_gemm_convolution_1_chwn_cyxk_khwn #elif 0 
device_implicit_gemm_convolution_2_chwn_cyxk_khwn @@ -633,7 +634,7 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 0 +#if 1 if(Y == 3 && X == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); diff --git a/src/include/gridwise_direct_convolution_2.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp similarity index 96% rename from src/include/gridwise_direct_convolution_2.hip.hpp rename to src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index 13f9e6cf1d..322d5fd9c2 100644 --- a/src/include/gridwise_direct_convolution_2.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -22,9 +22,10 @@ template -__global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) +__global__ void +gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp new file mode 100644 index 0000000000..afa3d3ee90 --- /dev/null +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp @@ -0,0 +1,281 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMatrixDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" +#include "threadwise_2d_tensor_op.hip.hpp" +#include "blockwise_gemm.hip.hpp" + +// define B = flatten(N, Hi, Wi) +template +__global__ void +gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ 
p_wei_global, + Float* const __restrict__ p_out_global) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_chwn_global_desc = InGlobalDesc{}; + constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; + constexpr auto out_khwn_global_desc = OutGlobalDesc{}; + + constexpr unsigned C = in_chwn_global_desc.GetLength(I0); + constexpr unsigned Hi = in_chwn_global_desc.GetLength(I1); + constexpr unsigned Wi = in_chwn_global_desc.GetLength(I2); + constexpr unsigned N = in_chwn_global_desc.GetLength(I3); + + constexpr unsigned K = out_khwn_global_desc.GetLength(I0); + constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1); + constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2); + + constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1); + constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2); + + constexpr unsigned B = N * Hi * Wi; + constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1); + + // divide block work by 2d: [K, B] + constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock; + constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock; + + const unsigned k_block_work_id = get_block_1d_id() / BBlockWork; + const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork; + + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned b_block_data_begin = b_block_work_id * BPerBlock; + + // flattend (2d) tensor view of gridwise input + constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + // tensor view of blockwise input and weight + // be careful of alignment + constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto 
wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + // tensor view of threadwise output in register + constexpr auto out_kb_thread_desc = + make_ConstantTensorDescriptor(Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); + print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); + print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); + + print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); + print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc"); + + print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); + print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); + print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); + print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); + + printf("KPerBlock %u\n", KPerBlock); + } +#endif + +// blockwise in copy +// formmat is [CPerBlock,BPerBlock + BGhostRead] +#if 0 + const auto blockwise_in_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; +#endif + +// blockwise wei copy +// format is [CPerBlock*Y*X,KPerBlock] +#if 0 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; +#endif + + // a series of blockwise GEMM + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx and b_mtx saved in LDS, c_mtx saved in register + // a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K] + // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] + // c_mtx[K,B] is out_block[K,B] + constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + 
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + constexpr auto c_kxb_thread_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, Number{}); + + const auto blockwise_gemm = + BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2{}; + + // LDS: be careful of alignment + constexpr unsigned in_block_size = + in_cb_block_desc.GetElementSpace(Number{}); + + constexpr unsigned wei_block_size = + wei_cyxk_block_desc.GetElementSpace(Number{}); + + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + // LDS + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + + const Float* p_in_global_block_offset = + p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + + const Float* p_wei_global_block_offset = + p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + + // register + Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; + + // set threadwise output tensor to 0 + threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), + p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0), + __syncthreads()) + { + // load data + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); + + __syncthreads(); + + // compute on current data + // a series of GEMM + for(unsigned y = 0; y < Y; ++y) + { + for(unsigned x = 0; x < X; ++x) + { + auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; +#if 1 + blockwise_gemm.Run +#else + blockwise_gemm.Run_RegisterDoubleBuffer +#endif + (p_wei_block + 
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block + y * Wi + x, + p_out_thread, + f_accum); + } + } + } + + // output: register to global mem, + const auto c_thread_mtx_begin = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; + const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; + + for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) + { + for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) + { + const auto c_thread_mtx_distance = + blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b); + + unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row; + unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col; + + unsigned h_data = b_data / (Wi * N); + unsigned itmp = b_data - h_data * (Wi * N); + unsigned w_data = itmp / N; + unsigned n_data = itmp - w_data * N; + + if(n_data < N && h_data < Ho && w_data < Wo) + { + p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] = + p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; + } + } + } +} diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 7c802266d8..60d827293b 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -259,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b __syncthreads(); // load next data -#if 1 +#if 0 blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next); blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next); #elif 1 @@ -292,7 +292,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b } } -#if 0 +#if 1 
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next); #endif From a0584426ff5b6b8b448c971b97c9b1a4d86ba010 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Mar 2019 03:22:41 -0500 Subject: [PATCH 02/13] refactoring ConstantTensorDescriptor --- ...ce_direct_convolution_2_nchw_kcyx_nkhw.hpp | 49 +-- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 2 +- src/include/Array.hip.hpp | 18 + src/include/ConstantTensorDescriptor.hip.hpp | 323 +++--------------- src/include/Sequence.hip.hpp | 92 +++++ src/include/common.hip.hpp | 52 +-- src/include/constant_integral.hip.hpp | 12 + src/include/functional.hip.hpp | 49 +++ ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 195 +++++++++++ 9 files changed, 452 insertions(+), 340 deletions(-) create mode 100644 src/include/Array.hip.hpp create mode 100644 src/include/Sequence.hip.hpp create mode 100644 src/include/constant_integral.hip.hpp create mode 100644 src/include/functional.hip.hpp create mode 100644 src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp diff --git a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index 602702949e..d91757dc8f 100644 --- a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -2,6 +2,7 @@ #include #include "device.hpp" #include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" +#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" template void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, @@ -57,27 +58,33 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw, - dim3(GridSize), - dim3(BlockSize), - static_cast(in_device_buf.GetDeviceBuffer()), - 
static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer())); + float time = launch_kernel( +#if 0 + gridwise_direct_convolution_2_nchw_kcyx_nkhw +#else + gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw +#endif + , + dim3(GridSize), + dim3(BlockSize), + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index c885894165..3edd8253dd 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -211,7 +211,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { float time = launch_kernel( -#if 1 +#if 0 gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn #else gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp new file mode 100644 index 0000000000..1caab6a4c9 --- /dev/null +++ b/src/include/Array.hip.hpp @@ -0,0 +1,18 @@ +#pragma once + +template +struct Array +{ + using Type = Array; + + static constexpr unsigned nSize = NSize; + + unsigned mData[nSize]; + + template + __host__ __device__ Array(Xs... 
xs) : mData({static_cast(xs)...}) + { + } + + __host__ __device__ TData operator[](unsigned i) const { return mData[i]; } +}; diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 2352b0f50c..2e5d237e81 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -65,8 +65,8 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence struct ConstantTensorDescriptor { + using Type = ConstantTensorDescriptor; static constexpr unsigned nDim = Lengths::nDim; - using NDimConstant = Number; __host__ __device__ constexpr ConstantTensorDescriptor() { @@ -91,293 +91,70 @@ struct ConstantTensorDescriptor return Strides{}.Get(Number{}); } + // c++14 doesn't support constexpr lambdas, has to use this trick instead + struct GetElementSize_f + { + template + __host__ __device__ constexpr unsigned operator()(IDim idim) const + { + return Type{}.GetLength(idim); + } + }; + __host__ __device__ constexpr unsigned GetElementSize() const { - static_assert(nDim >= 2 && nDim <= 8, "nDim"); - - if(nDim == 2) + // c++14 doesn't support constexpr lambdas, has to use this trick instead + struct multiply { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; + __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const + { + return a * b; + } + }; - return GetLength(I0) * GetLength(I1); - } - else if(nDim == 3) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2); - } - else if(nDim == 4) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3); - } - else if(nDim == 5) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto 
I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4); - } - else if(nDim == 6) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) * - GetLength(I5); - } - else if(nDim == 7) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) * - GetLength(I5) * GetLength(I6); - } - else if(nDim == 8) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) * - GetLength(I5) * GetLength(I6) * GetLength(I7); - } - else - { - assert(false); - } + return static_const_reduce_n{}(GetElementSize_f{}, multiply{}); } + // c++14 doesn't support constexpr lambdas, has to use this trick instead + struct GetElementSpace_f + { + template + __host__ __device__ constexpr unsigned operator()(IDim idim) const + { + return (Type{}.GetLength(idim) - 1) * Type{}.GetStride(idim); + } + }; + template > __host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const { - static_assert(nDim >= 2 && nDim <= 8, "nDim"); - - constexpr unsigned align_size = align.Get(); - - if(nDim == 2) + // c++14 doesn't support constexpr 
lambdas, has to use this trick instead + struct add { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; + __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const + { + return a + b; + } + }; - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - align_size; - } - else if(nDim == 3) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + align_size; - } - else if(nDim == 4) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - align_size; - } - else if(nDim == 5) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + align_size; - } - else if(nDim == 6) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) + - align_size; - } - else if(nDim == 7) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = 
Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) + - (GetLength(I6) - 1) * GetStride(I6) + align_size; - } - else if(nDim == 8) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) + - (GetLength(I6) - 1) * GetStride(I6) + (GetLength(I7) - 1) * GetStride(I7) + - align_size; - } + return static_const_reduce_n{}(GetElementSpace_f{}, add{}) + align.Get(); } - // this is ugly, only for 2d - __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1) const + template + __host__ __device__ unsigned Get1dIndex(Is... 
is) const { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; + static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong"); - static_assert(nDim == 2, "nDim is not 2"); - return i0 * GetStride(I0) + i1 * GetStride(I1); - } + const auto multi_id = Array(is...); - // this is ugly, only for 3d - __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1, unsigned i2) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; + unsigned id = 0; - static_assert(nDim == 3, "nDim is not 3"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2); - } + static_loop_n{}([&](auto IDim) { + constexpr unsigned idim = IDim.Get(); + id += multi_id[idim] * GetStride(IDim); + }); - // this is ugly, only for 4d - __host__ __device__ unsigned - Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - static_assert(nDim == 4, "nDim is not 4"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3); - } - - // this is ugly, only for 5d - __host__ __device__ unsigned - Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - static_assert(nDim == 5, "nDim is not 5"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4); - } - - // this is ugly, only for 6d - __host__ __device__ unsigned - Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4, unsigned i5) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; 
- constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - static_assert(nDim == 6, "nDim is not 6"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4) + i5 * GetStride(I5); - } - - // this is ugly, only for 7d - __host__ __device__ unsigned Get1dIndex(unsigned i0, - unsigned i1, - unsigned i2, - unsigned i3, - unsigned i4, - unsigned i5, - unsigned i6) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - static_assert(nDim == 7, "nDim is not 7"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6); - } - - // this is ugly, only for 8d - __host__ __device__ unsigned Get1dIndex(unsigned i0, - unsigned i1, - unsigned i2, - unsigned i3, - unsigned i4, - unsigned i5, - unsigned i6, - unsigned i7) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - static_assert(nDim == 8, "nDim is not 8"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6) + i7 * GetStride(I7); + return id; } __host__ __device__ constexpr auto Condense() const @@ -385,6 +162,12 @@ struct ConstantTensorDescriptor constexpr auto default_strides = calculate_default_strides(Lengths{}); return ConstantTensorDescriptor{}; } + + template + __host__ __device__ constexpr auto Vectorize(Number, Number) const + { + assert(false); // not implemented + } }; template diff --git 
a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp new file mode 100644 index 0000000000..c8ca7a0f24 --- /dev/null +++ b/src/include/Sequence.hip.hpp @@ -0,0 +1,92 @@ +#pragma once +#include "constant_integral.hip.hpp" +#include "functional.hip.hpp" + +template +struct Sequence +{ + using Type = Sequence; + + static constexpr unsigned nDim = sizeof...(Is); + + const unsigned mData[nDim] = {Is...}; + + template + __host__ __device__ constexpr unsigned Get(Number) const + { + return mData[I]; + } + + // this is ugly, only for nDIm = 4 + template + __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence) const + { + static_assert(nDim == 4, "nDim != 4"); + + constexpr auto old_sequence = Type{}; + + constexpr unsigned NR0 = old_sequence.mData[I0]; + constexpr unsigned NR1 = old_sequence.mData[I1]; + constexpr unsigned NR2 = old_sequence.mData[I2]; + constexpr unsigned NR3 = old_sequence.mData[I3]; + + return Sequence{}; + } + + template + __host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence) const + { + // don't know how to implement this + printf("Sequence::ReorderByPutOldToNew not implemented"); + assert(false); + } + + template + __host__ __device__ constexpr auto PushBack(Number) const + { + return Sequence{}; + } + + __host__ __device__ constexpr auto PopBack() const; + + template + __host__ __device__ constexpr auto Transform(F f) const + { + return Sequence{}; + } +}; + +template +__host__ __device__ constexpr auto sequence_pop_back(Sequence) +{ + static_assert(sizeof...(Is) >= 1, "empty Sequence!"); + return Sequence{}; +} + +template +__host__ __device__ constexpr auto sequence_sequence_op(Sequence, Sequence, F f) +{ + static_assert(Sequence::nDim == Sequence::nDim, "Dim not the same"); + + return Sequence{}; +} + +template +__host__ __device__ constexpr auto sequence_sequence_add(Sequence, Sequence) +{ + struct add + { + __host__ __device__ constexpr unsigned operator()(unsigned x, unsigned y) const + { + return x + y; + } + 
}; + + return sequence_sequence_op(Sequence{}, Sequence{}, add{}); +} + +template +__host__ __device__ constexpr auto Sequence::PopBack() const +{ + return sequence_pop_back(Type{}); +} diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index 2df008fcad..f447fce784 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -1,4 +1,8 @@ #pragma once +#include "constant_integral.hip.hpp" +#include "Sequence.hip.hpp" +#include "Array.hip.hpp" +#include "functional.hip.hpp" __device__ unsigned get_thread_local_1d_id() { return threadIdx.x; } @@ -91,54 +95,6 @@ struct vector_type }; #endif -template -struct integral_constant -{ - static const T value = N; - - __host__ __device__ constexpr T Get() const { return value; } -}; - -template -using Number = integral_constant; - -template -struct Sequence -{ - using Type = Sequence; - - static constexpr unsigned nDim = sizeof...(Is); - - const unsigned mData[nDim] = {Is...}; - - template - __host__ __device__ constexpr unsigned Get(Number) const - { - return mData[I]; - } - - template - __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence) const - { - constexpr auto old_sequence = Type{}; - - constexpr unsigned NR0 = old_sequence.mData[I0]; - constexpr unsigned NR1 = old_sequence.mData[I1]; - constexpr unsigned NR2 = old_sequence.mData[I2]; - constexpr unsigned NR3 = old_sequence.mData[I3]; - - return Sequence{}; - } - - template - __host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence) const - { - // don't know how to implement this - printf("Sequence::ReorderByPutOldToNew not implemented"); - assert(false); - } -}; - template __host__ __device__ constexpr T max(T a, T b) { diff --git a/src/include/constant_integral.hip.hpp b/src/include/constant_integral.hip.hpp new file mode 100644 index 0000000000..70dc69d181 --- /dev/null +++ b/src/include/constant_integral.hip.hpp @@ -0,0 +1,12 @@ +#pragma once + +template +struct integral_constant +{ + static const T value = N; 
+ + __host__ __device__ constexpr T Get() const { return value; } +}; + +template +using Number = integral_constant; diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp new file mode 100644 index 0000000000..598d5c3c71 --- /dev/null +++ b/src/include/functional.hip.hpp @@ -0,0 +1,49 @@ +#pragma once +#include "constant_integral.hip.hpp" + +template +struct static_loop_n +{ + template + __host__ __device__ void operator()(F f) const + { + static_assert(NLoop > 1, "out-of-range"); + + f(Number{}); + static_loop_n{}(f); + } +}; + +template <> +struct static_loop_n<1> +{ + template + __host__ __device__ void operator()(F f) const + { + f(Number<0>{}); + } +}; + +template +struct static_const_reduce_n +{ + template + __host__ __device__ constexpr auto operator()(F f, Reduce r) const + { + static_assert(NLoop > 1, "out-of-range"); + + constexpr auto a = f(Number{}); + auto b = static_const_reduce_n{}(f, r); // cannot use constexpr here, weird + return r(a, b); + } +}; + +template <> +struct static_const_reduce_n<1> +{ + template + __host__ __device__ constexpr auto operator()(F f, Reduce) const + { + return f(Number<0>{}); + } +}; diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp new file mode 100644 index 0000000000..cb2a8a5087 --- /dev/null +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -0,0 +1,195 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_direct_convolution.hip.hpp" +#include "threadwise_4d_tensor_op.hip.hpp" +#include "threadwise_direct_convolution.hip.hpp" + +template +__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( + const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) +{ + 
constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_global_desc = InGlobalDesc{}; + constexpr auto wei_global_desc = WeiGlobalDesc{}; + constexpr auto out_global_desc = OutGlobalDesc{}; + + constexpr unsigned Y = wei_global_desc.GetLength(I2); + constexpr unsigned X = wei_global_desc.GetLength(I3); + + constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; + constexpr unsigned WiPerBlock = WoPerBlock + X - 1; + + constexpr auto in_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + // shared mem + constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); + constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); + + __shared__ Float p_in_block[in_block_size]; + __shared__ Float p_wei_block[wei_block_size]; + + // threadwise tensors + constexpr unsigned HiPerThread = HoPerThread + Y - 1; + constexpr unsigned WiPerThread = WoPerThread + X - 1; + + constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, in_block_desc.GetStrides()); + + constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_block_desc.GetStrides()); + + constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_thread_block_desc, wei_thread_block_desc); + + // register + Float p_out_thread[out_thread_desc.GetElementSpace()]; + + // divide block work + constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + + const unsigned block_id = blockIdx.x; + + unsigned itmp = block_id; 
+ const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork); + itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork); + const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork); + itmp -= k_block_work_id * (HBlockWork * WBlockWork); + const unsigned h_block_work_id = itmp / WBlockWork; + const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork; + + const unsigned n_block_data_begin = n_block_work_id * NPerBlock; + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; + const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock; + + const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding + const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding + + // divide thread work + constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; + constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread; + constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread; + constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread; + + const unsigned thread_id = threadIdx.x; + + itmp = thread_id; + const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork); + itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork); + const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork); + itmp -= k_thread_work_id * (HThreadWork * WThreadWork); + const unsigned h_thread_work_id = itmp / WThreadWork; + const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork; + + const unsigned n_thread_data_begin = n_thread_work_id * NPerThread; + const unsigned k_thread_data_begin = k_thread_work_id * KPerThread; + const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread; + const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread; + + const unsigned hi_thread_data_begin = 
ho_thread_data_begin; + const unsigned wi_thread_data_begin = wo_thread_data_begin; + + constexpr auto blockwise_in_copy = + Blockwise4dTensorCopy1{}; + + constexpr auto blockwise_wei_copy = + Blockwise4dTensorCopy1{}; + + // set threadwise output tensor to 0 + threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); + c_block_data_begin += CPerBlock, __syncthreads()) + { + // copy input tensor to LDS + blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), + p_in_block); + + // copy weight tensor to LDS + blockwise_wei_copy.Run( + p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); + + __syncthreads(); + + for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) + { +// threadwise convolution +#if 1 + threadwise_direct_convolution_2( + in_thread_block_desc, + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_thread_block_desc, + p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_thread_desc, + p_out_thread); +#elif 0 + threadwise_direct_convolution_3( + in_thread_block_desc, + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_thread_block_desc, + p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_thread_desc, + p_out_thread); +#endif + } + } + + // copy output tensor from register to global mem + threadwise_4d_tensor_copy( + out_thread_desc, + p_out_thread, + out_global_desc, + p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + 
wo_block_data_begin + wo_thread_data_begin), + out_thread_desc.GetLengths()); +} From 03eef73c5be07a1e02c090eacd24f0a9f6aa850e Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Mar 2019 15:36:38 -0500 Subject: [PATCH 03/13] refactoring block copy --- ...ce_direct_convolution_2_nchw_kcyx_nkhw.hpp | 9 +- driver/driver.hip.cpp | 8 +- src/include/blockwise_2d_tensor_op.hip.hpp | 5 +- src/include/blockwise_4d_tensor_op.hip.hpp | 124 ++++++++++++-- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 161 +++++++++++------- 5 files changed, 224 insertions(+), 83 deletions(-) diff --git a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index d91757dc8f..2fc2264f6d 100644 --- a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" +//#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" template @@ -47,6 +47,9 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; #endif @@ -59,7 +62,7 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { float time = launch_kernel( -#if 0 +#if 0 gridwise_direct_convolution_2_nchw_kcyx_nkhw #else gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw @@ -78,6 +81,8 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, CPerThread, HoPerThread, WoPerThread, + InBlockCopyDataPerRead, + WeiBlockCopyDataPerRead, BlockSize, GridSize>, dim3(GridSize), diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 6cd75afd79..a952b95380 100644 --- 
a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -7,11 +7,11 @@ #include "tensor.hpp" #include "ConstantTensorDescriptor.hip.hpp" #include "conv_common.hip.hpp" -#include "device_direct_convolution_1.hpp" +//#include "device_direct_convolution_1.hpp" #include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" -#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" -#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" -#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" +//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" +//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" +//#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" struct GeneratorTensor_1 { diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index b54c4d0c5f..761c32a370 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -383,8 +383,9 @@ struct Blockwise2dTensorCopy3 constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; - static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1, - "wrong! only support stride1 == 1!\n"); + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1), + "wrong! only support stride1 == 1 if DataPerRead > 1!\n"); static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, "wrong! 
only support DataPerRead == 1, 2 or 4!\n"); diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index cc50d9eecd..693b6fe9d5 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -131,11 +131,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); - f(p_src[aindex], p_dst[bindex]); + f(p_src[src_index], p_dst[dst_index]); } constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); @@ -162,11 +162,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); - f(p_src[aindex], p_dst[bindex]); + f(p_src[src_index], p_dst[dst_index]); } } } @@ -199,15 +199,112 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy); } -template +template struct Blockwise4dTensorCopy1 { + using vector_t = typename vector_type::type; + + __device__ void SanityCheck() const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I3) == 1 && 
DstDesc{}.GetStride(I3) == 1), + "wrong! only support stride3 == 1 if DataPerRead > 1!\n"); + + static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, + "wrong! only support DataPerRead == 1, 2 or 4!\n"); + + static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 && + DstDesc{}.GetStride(I2) % DataPerRead == 0, + "src and dst stride2 should be multiple of DataPerRead to keep alignment"); + + // we allow out-of-bound read from src in D3 dimension, + // but we need to make sure dst stride2 is big enough, + // so that the out-of-bound write won't contaminate next line in dst + constexpr unsigned L3 = CopyLengths{}.Get(I3); + constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead); + + static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2), + "wrong! out-of-bound write will contaminate next line!\n"); + } + __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; + SanityCheck(); - blockwise_4d_tensor_copy_reorder_by_get_dst_from_src( - SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder); + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + constexpr unsigned L0 = CopyLengths{}.Get(I0); + constexpr unsigned L1 = CopyLengths{}.Get(I1); + constexpr unsigned L2 = CopyLengths{}.Get(I2); + constexpr unsigned L3 = CopyLengths{}.Get(I3); + + constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead); + + constexpr auto ref_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; + + auto f_copy = [&](unsigned is) { + unsigned did[4]; + + did[0] = is / ref_desc.GetStride(I0); + + is -= did[0] * ref_desc.GetStride(I0); + + did[1] = is / ref_desc.GetStride(I1); + + is -= did[1] 
* ref_desc.GetStride(I1); + + did[2] = is / ref_desc.GetStride(I2); + + is -= did[2] * ref_desc.GetStride(I2); + + did[3] = is / ref_desc.GetStride(I3); + + const unsigned src_index = + src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + const unsigned dst_index = + dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + + *(reinterpret_cast(p_dst + dst_index)) = + *(reinterpret_cast(p_src + src_index)); + }; + + for(unsigned iloop = 0; iloop < NLoop; ++iloop) + { + unsigned is = threadIdx.x + iloop * BlockSize; + + f_copy(is); + } + + constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); + + if(has_tail) + { + unsigned is = threadIdx.x + NLoop * BlockSize; + + if(is < ref_desc.GetElementSize()) + { + f_copy(is); + } + } } }; @@ -361,8 +458,9 @@ struct Blockwise4dTensorCopy3 constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1, - "wrong! only support stride3 == 1!\n"); + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1), + "wrong! only support stride3 == 1 if DataPerRead > 1!\n"); static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, "wrong! only support DataPerRead == 1, 2 or 4!\n"); @@ -370,7 +468,7 @@ struct Blockwise4dTensorCopy3 static_assert( SrcDesc{}.GetStride(I2) % DataPerRead == 0 && DstDesc{}.GetStride(I2) % DataPerRead == 0, - "wrong! src and dst stride should be multiple of DataPerRead to keep alignment"); + "wrong! 
src and dst stride2 should be multiple of DataPerRead to keep alignment"); constexpr unsigned L0 = CopyLengths{}.Get(I0); constexpr unsigned L1 = CopyLengths{}.Get(I1); diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index cb2a8a5087..8ce097bdc5 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -1,6 +1,7 @@ #pragma once #include "common.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" #include "blockwise_4d_tensor_op.hip.hpp" #include "blockwise_direct_convolution.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp" @@ -20,6 +21,8 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( @@ -32,50 +35,72 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_global_desc = InGlobalDesc{}; - constexpr auto wei_global_desc = WeiGlobalDesc{}; - constexpr auto out_global_desc = OutGlobalDesc{}; + constexpr auto in_nchw_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; - constexpr unsigned Y = wei_global_desc.GetLength(I2); - constexpr unsigned X = wei_global_desc.GetLength(I3); + constexpr unsigned N = in_nchw_global_desc.GetLength(I0); + constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3); + + constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + Sequence{}); // 2d view of wei for blockwise copy constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; constexpr 
unsigned WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); - constexpr auto wei_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, + Number{}); // 2d view of wei for blockwise copy + + constexpr auto wei_kcyx_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + Sequence{}); // shared mem - constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); - constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); + constexpr unsigned in_block_size = + in_nchw_block_desc.GetElementSpace(Number{}); + constexpr unsigned wei_block_size = + wei_kcyx_block_desc.GetElementSpace(Number{}); - __shared__ Float p_in_block[in_block_size]; - __shared__ Float p_wei_block[wei_block_size]; + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? 
InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; constexpr unsigned WiPerThread = WoPerThread + X - 1; - constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, in_block_desc.GetStrides()); + constexpr auto in_nchw_thread_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + in_nchw_block_desc.GetStrides()); - constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_block_desc.GetStrides()); + constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_kcyx_block_desc.GetStrides()); - constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor( - in_thread_block_desc, wei_thread_block_desc); + constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); // register - Float p_out_thread[out_thread_desc.GetElementSpace()]; + Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; // divide block work - constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; - constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; - constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; - constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + constexpr unsigned NBlockWork = + (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = + (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = + (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned 
WBlockWork = + (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; const unsigned block_id = blockIdx.x; @@ -122,34 +147,44 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; + decltype(in_nchw_global_desc), + decltype(in_nchw_block_desc), + decltype(in_nchw_block_desc.GetLengths()), + InBlockCopyDataPerRead>{}; +#if 0 constexpr auto blockwise_wei_copy = Blockwise4dTensorCopy1{}; + decltype(wei_kcyx_global_desc), + decltype(wei_kcyx_block_desc), + decltype(wei_kcyx_block_desc.GetLengths())>{}; +#elif 1 + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; +#endif // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); + threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread); - for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( + k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -158,25 +193,27 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( // threadwise convolution #if 1 threadwise_direct_convolution_2( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - 
hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #endif } @@ -184,12 +221,12 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( // copy output tensor from register to global mem threadwise_4d_tensor_copy( - out_thread_desc, + out_nkhw_thread_desc, p_out_thread, - out_global_desc, - p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), - out_thread_desc.GetLengths()); + out_nkhw_global_desc, + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_nkhw_thread_desc.GetLengths()); } From 7faf269c995e5594935a16dfdae75a49f62f4991 Mon Sep 
17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Mar 2019 21:48:46 -0500 Subject: [PATCH 04/13] refactor --- ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp | 170 +++++++++++------- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 3 +- 2 files changed, 106 insertions(+), 67 deletions(-) diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index 322d5fd9c2..da4542d9cb 100644 --- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -1,6 +1,7 @@ #pragma once #include "common.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" #include "blockwise_4d_tensor_op.hip.hpp" #include "blockwise_direct_convolution.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp" @@ -20,62 +21,86 @@ template -__global__ void -gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) +__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( + const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_global_desc = InGlobalDesc{}; - constexpr auto wei_global_desc = WeiGlobalDesc{}; - constexpr auto out_global_desc = OutGlobalDesc{}; + constexpr auto in_nchw_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; - constexpr unsigned Y = wei_global_desc.GetLength(I2); - constexpr unsigned X = wei_global_desc.GetLength(I3); + constexpr unsigned N = in_nchw_global_desc.GetLength(I0); + constexpr unsigned K = 
wei_kcyx_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3); + + constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + Sequence{}); // 2d view of wei for blockwise copy constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; constexpr unsigned WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); - constexpr auto wei_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, + Number{}); // 2d view of wei for blockwise copy + + constexpr auto wei_kcyx_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + Sequence{}); // shared mem - constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); - constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); + constexpr unsigned in_block_size = + in_nchw_block_desc.GetElementSpace(Number{}); + constexpr unsigned wei_block_size = + wei_kcyx_block_desc.GetElementSpace(Number{}); - __shared__ Float p_in_block[in_block_size]; - __shared__ Float p_wei_block[wei_block_size]; + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? 
InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; constexpr unsigned WiPerThread = WoPerThread + X - 1; - constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, in_block_desc.GetStrides()); + constexpr auto in_nchw_thread_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + in_nchw_block_desc.GetStrides()); - constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_block_desc.GetStrides()); + constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_kcyx_block_desc.GetStrides()); - constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor( - in_thread_block_desc, wei_thread_block_desc); + constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); // register - Float p_out_thread[out_thread_desc.GetElementSpace()]; + Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; // divide block work - constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; - constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; - constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; - constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + constexpr unsigned NBlockWork = + (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = + (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = + (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned 
WBlockWork = + (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; const unsigned block_id = blockIdx.x; @@ -122,34 +147,45 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; + decltype(in_nchw_global_desc), + decltype(in_nchw_block_desc), + decltype(in_nchw_block_desc.GetLengths()), + InBlockCopyDataPerRead>{}; +#if 0 constexpr auto blockwise_wei_copy = Blockwise4dTensorCopy1{}; + decltype(wei_kcyx_global_desc), + decltype(wei_kcyx_block_desc), + decltype(wei_kcyx_block_desc.GetLengths()), + 1>{}; +#elif 1 + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; +#endif // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); + threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread); - for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( + k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -158,25 +194,27 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i // threadwise convolution #if 1 threadwise_direct_convolution_2( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - 
c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #endif } @@ -184,12 +222,12 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i // copy output tensor from register to global mem threadwise_4d_tensor_copy( - out_thread_desc, + out_nkhw_thread_desc, p_out_thread, - out_global_desc, - p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), - out_thread_desc.GetLengths()); + out_nkhw_global_desc, + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_nkhw_thread_desc.GetLengths()); } diff --git 
a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 8ce097bdc5..da4542d9cb 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -158,7 +158,8 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( Float, decltype(wei_kcyx_global_desc), decltype(wei_kcyx_block_desc), - decltype(wei_kcyx_block_desc.GetLengths())>{}; + decltype(wei_kcyx_block_desc.GetLengths()), + 1>{}; #elif 1 const auto blockwise_wei_copy = Blockwise2dTensorCopy3 Date: Mon, 18 Mar 2019 15:03:17 -0500 Subject: [PATCH 05/13] adding fp16 direct that reads pre-vectorized data --- ...ce_direct_convolution_2_nchw_kcyx_nkhw.hpp | 74 ++++---- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 160 ++++++++++++++++++ driver/driver.hip.cpp | 157 ++++++++--------- src/include/blockwise_2d_tensor_op.hip.hpp | 2 +- src/include/blockwise_4d_tensor_op.hip.hpp | 8 +- src/include/common.hip.hpp | 38 +++-- ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp | 8 +- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 97 ++++++----- 8 files changed, 369 insertions(+), 175 deletions(-) create mode 100644 driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp diff --git a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index 2fc2264f6d..4a49ff9fc8 100644 --- a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -1,8 +1,7 @@ #pragma once #include #include "device.hpp" -//#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" -#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" +#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" template void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, @@ -50,6 +49,24 @@ 
void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; #endif @@ -61,35 +78,30 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = launch_kernel( -#if 0 - gridwise_direct_convolution_2_nchw_kcyx_nkhw -#else - gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw -#endif - , - dim3(GridSize), - dim3(BlockSize), - static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer())); + float time = + launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..19633f0462 --- /dev/null +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -0,0 +1,160 @@ +#pragma once +#include +#include "device.hpp" +#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" + +template +void 
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + unsigned nrepeat) +{ + constexpr unsigned NVector = 1; + using vector_type_t = vector_type; + using vector_t = typename vector_type_t::VectorType; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_nchw_desc = InDesc{}; + constexpr auto wei_kcyx_desc = WeiDesc{}; + constexpr auto out_nkhw_desc = OutDesc{}; + + constexpr unsigned Hi = in_nchw_desc.GetLength(I2); + constexpr unsigned Wi = in_nchw_desc.GetLength(I3); + + constexpr unsigned N = out_nkhw_desc.GetLength(I0); + constexpr unsigned Ho = out_nkhw_desc.GetLength(I2); + constexpr unsigned Wo = out_nkhw_desc.GetLength(I3); + + constexpr unsigned K = wei_kcyx_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_desc.GetLength(I3); + + // vectorized input + auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: "); + + Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); + + auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { +#if 1 + in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); +#else + in_nchw_vec(n, c, h, w) = + vector_type_t::pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); +#endif + }; + + make_ParallelTensorFunctor(f_vectorized_nchw, N, C, Hi, Wi)( + std::thread::hardware_concurrency()); + + // vectorize weight + auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: "); + + Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); + + auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { +#if 1 + 
wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); +#else + wei_kcyx_vec(k, c, y, x) = + vector_type_t::pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); +#endif + }; + + make_ParallelTensorFunctor(f_vectorized_kcyx, K, C, Y, X)(std::thread::hardware_concurrency()); + + // + DeviceMem in_nchw_vec_device_buf(sizeof(vector_t) * in_nchw_vec.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_t) * wei_kcyx_vec.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace()); + + in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); + wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data()); + out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + +#if 1 + // 3x3, 34x34, 128 thread + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned BlockSize = 128; +#endif + + constexpr unsigned GridSize = + (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock); + + printf("%s: BlockSize %u, GridSize %u 
\n", __func__, BlockSize, GridSize); + + for(unsigned i = 0; i < nrepeat; ++i) + { + float time = launch_kernel( + gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_nchw_vec_device_buf.GetDeviceBuffer()), + static_cast(wei_kcyx_vec_device_buf.GetDeviceBuffer()), + static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); + } + + out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); +} diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index a952b95380..47ff1b6882 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -9,6 +9,7 @@ #include "conv_common.hip.hpp" //#include "device_direct_convolution_1.hpp" #include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" +#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" //#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" @@ -34,25 +35,6 @@ struct GeneratorTensor_2 } }; -struct GeneratorTensor_3 -{ - template - double operator()(Is... 
is) - { -#if 0 - std::initializer_list ls = {static_cast(is)...}; - return std::accumulate(ls.begin(), ls.end(), std::size_t(0)); -#elif 1 - assert(sizeof...(Is) > 0); - std::initializer_list ids = {static_cast(is)...}; - std::vector lens(sizeof...(Is), 100); - std::vector strides(sizeof...(Is), 1); - std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1); - return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1; -#endif - } -}; - struct GeneratorTensor_Checkboard { template @@ -129,7 +111,7 @@ void host_direct_convolution( if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 && wi < in_nchw.mDesc.GetLengths()[3]) { - v += in_nchw(n, c, hi, wi) * wei_kcyx(k, c, y, x); + v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x)); } } } @@ -177,11 +159,11 @@ void host_winograd_3x3_convolution( std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile; std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile; - Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); - Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); - Tensor wei_transform({K, C, HiPerTile, WiPerTile}); - Tensor out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile}); - Tensor out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile}); + Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor wei_transform({K, C, HiPerTile, WiPerTile}); + Tensor out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile}); + Tensor out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile}); auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) { for(int j = 0; j < HiPerTile; ++j) @@ -259,49 +241,61 @@ void host_winograd_3x3_convolution( }; auto f_wei_transform = [&](auto k, auto c) { - wei_transform(k, c, 0, 0) = wei_kcyx(k, c, 0, 0); - wei_transform(k, c, 0, 1) = - 0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2); - 
wei_transform(k, c, 0, 2) = - 0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2); - wei_transform(k, c, 0, 3) = wei_kcyx(k, c, 0, 2); + wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0)); + wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2)); - wei_transform(k, c, 1, 0) = - 0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 1, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) + - 0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 1, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) - - 0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 1, 3) = - 0.5 * wei_kcyx(k, c, 0, 2) + 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 1, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 2) = + 0.25 * 
double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) + + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); - wei_transform(k, c, 2, 0) = - 0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 2, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) - - 0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 2, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) + - 0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 2, 3) = - 0.5 * wei_kcyx(k, c, 0, 2) - 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 2, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 2) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 
0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) - + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); - wei_transform(k, c, 3, 0) = wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 3, 1) = - 0.5 * wei_kcyx(k, c, 2, 0) + 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 3, 2) = - 0.5 * wei_kcyx(k, c, 2, 0) - 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 3, 3) = wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) - + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2)); }; auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) { @@ -372,20 +366,25 @@ void host_winograd_3x3_convolution( template void check_error(const Tensor& ref, const Tensor& result) { + // printf("\n"); + float error = 0; float max_diff = -1; float ref_value = 0, result_value = 0; for(int i = 0; i < ref.mData.size(); ++i) { - error += std::abs(ref.mData[i] - result.mData[i]); - float diff = std::abs(ref.mData[i] - result.mData[i]); + error += std::abs(double(ref.mData[i]) - double(result.mData[i])); + float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); if(max_diff < diff) { max_diff = diff; ref_value = ref.mData[i]; result_value = result.mData[i]; } + + // printf("{%f, %f}", double(ref.mData[i]), double(result.mData[i])); } + // printf("\n"); std::cout << "error: " << error << std::endl; std::cout << "max_diff: " << 
max_diff << ", " << ref_value << ", " << result_value << std::endl; @@ -406,13 +405,13 @@ int main(int argc, char* argv[]) constexpr unsigned WPad = 0; #elif 1 // 3x3, 34x34 - constexpr unsigned N = 64; - constexpr unsigned C = 256; + constexpr unsigned N = 64; + constexpr unsigned C = 256; constexpr unsigned HI = 34; constexpr unsigned WI = 34; - constexpr unsigned K = 64; - constexpr unsigned Y = 3; - constexpr unsigned X = 3; + constexpr unsigned K = 64; + constexpr unsigned Y = 3; + constexpr unsigned X = 3; constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; @@ -603,16 +602,22 @@ int main(int argc, char* argv[]) in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #elif 1 - in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) 
* GeneratorTensor_Checkboard{}(is...); + }; + wei_kcyx.GenerateTensorValue(gen_wei, num_thread); #endif } #if 1 #if 0 device_direct_convolution_1 -#elif 1 +#elif 0 device_direct_convolution_2_nchw_kcyx_nkhw +#elif 1 + device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 0 device_implicit_gemm_convolution_1_chwn_cyxk_khwn #elif 0 @@ -634,7 +639,6 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 1 if(Y == 3 && X == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); @@ -644,7 +648,6 @@ int main(int argc, char* argv[]) host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); } check_error(out_nkhw_host, out_nkhw_device); -#endif #if 0 LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 761c32a370..969f18c4e8 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -373,7 +373,7 @@ template struct Blockwise2dTensorCopy3 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::VectorType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 693b6fe9d5..e8829c0cbf 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -207,9 +207,9 @@ template struct Blockwise4dTensorCopy1 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::VectorType; - __device__ void SanityCheck() const + __device__ constexpr Blockwise4dTensorCopy1() { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -239,8 +239,6 @@ struct Blockwise4dTensorCopy1 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - SanityCheck(); - constexpr auto I0 = Number<0>{}; 
constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -446,7 +444,7 @@ template struct Blockwise4dTensorCopy3 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::VectorType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index f447fce784..aa7e2269f6 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -28,44 +28,44 @@ struct vector_type template <> struct vector_type { - using type = float; + using VectorType = float; }; template <> struct vector_type { - using type = float2; + using VectorType = float2; }; template <> struct vector_type { - using type = float4; + using VectorType = float4; }; #if 0 template <> struct vector_type { - using type = half_float::half; + using VectorType = half_float::half; }; template <> struct vector_type { - using type = float; + using VectorType = float; }; template <> struct vector_type { - using type = float2; + using VectorType = float2; }; template <> struct vector_type { - using type = float4; + using VectorType = float4; }; #endif @@ -73,25 +73,41 @@ struct vector_type template <> struct vector_type { - using type = half; + using VectorType = half; + + __host__ __device__ static VectorType pack(half s) { return s; } }; template <> struct vector_type { - using type = half2; + using VectorType = half2; + + union Data + { + VectorType vector; + half scalar[2]; + }; + + __host__ __device__ static VectorType pack(half s0, half s1) + { + Data data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } }; template <> struct vector_type { - using type = float2; + using VectorType = float2; }; template <> struct vector_type { - using type = float4; + using VectorType = float4; }; #endif diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index da4542d9cb..5761a22c16 100644 
--- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -25,10 +25,10 @@ template -__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) +__global__ void +gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index da4542d9cb..2b3cb03b78 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -11,6 +11,7 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, + const typename vector_type::VectorType* const __restrict__ p_in_global, + const typename vector_type::VectorType* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) { + using scalar_t = Float; + using vector_t = typename vector_type::VectorType; + constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_nchw_global_desc = InGlobalDesc{}; - constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{}; - constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; + constexpr auto in_nchw_vec_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; - constexpr unsigned N = 
in_nchw_global_desc.GetLength(I0); - constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0); - constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1); - constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2); - constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3); + constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0); + constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3); - constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor( Sequence{}); // 2d view of wei for blockwise copy constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; constexpr unsigned WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_ke_vec_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // 2d view of wei for blockwise copy - constexpr auto wei_kcyx_block_desc = + constexpr auto wei_kcyx_vec_block_desc = make_ConstantTensorDescriptor(Sequence{}, - Sequence{}); + Sequence{}); // shared mem constexpr unsigned in_block_size = - in_nchw_block_desc.GetElementSpace(Number{}); + in_nchw_vec_block_desc.GetElementSpace(Number{}); constexpr unsigned wei_block_size = - wei_kcyx_block_desc.GetElementSpace(Number{}); + wei_kcyx_vec_block_desc.GetElementSpace(Number{}); constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead ? 
InBlockCopyDataPerRead @@ -81,10 +85,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor(Sequence{}, - in_nchw_block_desc.GetStrides()); + in_nchw_vec_block_desc.GetStrides()); constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_kcyx_block_desc.GetStrides()); + Sequence{}, wei_kcyx_vec_block_desc.GetStrides()); constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); @@ -147,26 +151,27 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; #if 0 constexpr auto blockwise_wei_copy = Blockwise4dTensorCopy1{}; #elif 1 - const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; + const auto blockwise_wei_copy = + Blockwise2dTensorCopy3{}; #endif // set threadwise output tensor to 0 @@ -176,14 +181,14 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( + blockwise_wei_copy.Run(p_wei_global + wei_kcyx_vec_global_desc.Get1dIndex( k_block_data_begin, c_block_data_begin, 0, 0), p_wei_block); @@ -195,25 +200,25 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #if 1 threadwise_direct_convolution_2( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - 
hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + - wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + - wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #endif From 2832520418fe81185b1a44f8421ec32f2efc1714 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Mar 2019 15:09:52 -0500 Subject: [PATCH 06/13] adding fp16 direct that reads pre-vectorized data --- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 2b3cb03b78..5901c42e55 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -27,8 +27,8 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const typename vector_type::VectorType* const __restrict__ p_in_global, - const typename vector_type::VectorType* const __restrict__ p_wei_global, + const typename vector_type::VectorType* const __restrict__ 
p_in_vec_global, + const typename vector_type::VectorType* const __restrict__ p_wei_vec_global, Float* const __restrict__ p_out_global) { using scalar_t = Float; @@ -76,25 +76,25 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ? InBlockCopyDataPerRead : WeiBlockCopyDataPerRead; - __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; - __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + __shared__ vector_t p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ vector_t p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; constexpr unsigned WiPerThread = WoPerThread + X - 1; - constexpr auto in_nchw_thread_block_desc = + constexpr auto in_nchw_vec_thread_block_desc = make_ConstantTensorDescriptor(Sequence{}, in_nchw_vec_block_desc.GetStrides()); - constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( + constexpr auto wei_kcyx_vec_thread_block_desc = make_ConstantTensorDescriptor( Sequence{}, wei_kcyx_vec_block_desc.GetStrides()); constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( - in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); + in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc); // register - Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; + scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; // divide block work constexpr unsigned NBlockWork = @@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1 Date: Mon, 18 Mar 2019 18:16:02 -0500 Subject: [PATCH 07/13] adding fp16 direct that reads pre-vectorized data --- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 49 +++++++--- src/include/blockwise_2d_tensor_op.hip.hpp | 2 +- 
src/include/blockwise_4d_tensor_op.hip.hpp | 4 +- src/include/common.hip.hpp | 92 +------------------ src/include/config.h.in | 2 - src/include/functional.hip.hpp | 8 ++ ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 46 ++++++---- .../threadwise_direct_convolution.hip.hpp | 55 ++++------- 8 files changed, 92 insertions(+), 166 deletions(-) diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index 19633f0462..d16d05d978 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -13,8 +13,8 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, unsigned nrepeat) { constexpr unsigned NVector = 1; - using vector_type_t = vector_type; - using vector_t = typename vector_type_t::VectorType; + using vector_t = vector_type; + using vector_mem_t = typename vector_t::MemoryType; constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -41,40 +41,41 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: "); - Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); + Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { #if 1 in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); #else in_nchw_vec(n, c, h, w) = - vector_type_t::pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); + vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); #endif }; - make_ParallelTensorFunctor(f_vectorized_nchw, N, C, Hi, Wi)( + make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)( std::thread::hardware_concurrency()); // vectorize weight auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence{}); 
ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: "); - Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); + Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { #if 1 wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); #else wei_kcyx_vec(k, c, y, x) = - vector_type_t::pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); + vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); #endif }; - make_ParallelTensorFunctor(f_vectorized_kcyx, K, C, Y, X)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)( + std::thread::hardware_concurrency()); // - DeviceMem in_nchw_vec_device_buf(sizeof(vector_t) * in_nchw_vec.mDesc.GetElementSpace()); - DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_t) * wei_kcyx_vec.mDesc.GetElementSpace()); + DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace()); DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace()); in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); @@ -82,7 +83,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); #if 1 - // 3x3, 34x34, 128 thread + // 3x3, 34x34, 128 thread, fp32, vector = 1 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; constexpr unsigned CPerBlock = 4; @@ -96,24 +97,42 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; - constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; #elif 1 - // 3x3, 34x34, 128 thread, fp16 + // 3x3, 34x34, 128 thread, fp32, vector = 2 constexpr unsigned 
NPerBlock = 2; constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; + constexpr unsigned CPerBlock = 2; constexpr unsigned HoPerBlock = 2; constexpr unsigned WoPerBlock = 32; constexpr unsigned NPerThread = 2; constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 2; + + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 4; constexpr unsigned BlockSize = 128; diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 969f18c4e8..ce3a7a37b9 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -373,7 +373,7 @@ template struct Blockwise2dTensorCopy3 { - using vector_t = typename vector_type::VectorType; + using vector_t = typename vector_type::MemoryType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index e8829c0cbf..fa5f36be51 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -207,7 +207,7 @@ template struct Blockwise4dTensorCopy1 { - using vector_t = typename vector_type::VectorType; + using vector_t = typename vector_type::MemoryType; __device__ constexpr 
Blockwise4dTensorCopy1() { @@ -444,7 +444,7 @@ template struct Blockwise4dTensorCopy3 { - using vector_t = typename vector_type::VectorType; + using vector_t = typename vector_type::MemoryType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index aa7e2269f6..d5832dde9d 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -1,4 +1,5 @@ #pragma once +#include "data_type.hip.hpp" #include "constant_integral.hip.hpp" #include "Sequence.hip.hpp" #include "Array.hip.hpp" @@ -20,97 +21,6 @@ struct is_same static const bool value = true; }; -template -struct vector_type -{ -}; - -template <> -struct vector_type -{ - using VectorType = float; -}; - -template <> -struct vector_type -{ - using VectorType = float2; -}; - -template <> -struct vector_type -{ - using VectorType = float4; -}; - -#if 0 -template <> -struct vector_type -{ - using VectorType = half_float::half; -}; - -template <> -struct vector_type -{ - using VectorType = float; -}; - -template <> -struct vector_type -{ - using VectorType = float2; -}; - -template <> -struct vector_type -{ - using VectorType = float4; -}; -#endif - -#if 1 -template <> -struct vector_type -{ - using VectorType = half; - - __host__ __device__ static VectorType pack(half s) { return s; } -}; - -template <> -struct vector_type -{ - using VectorType = half2; - - union Data - { - VectorType vector; - half scalar[2]; - }; - - __host__ __device__ static VectorType pack(half s0, half s1) - { - Data data; - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using VectorType = float2; -}; - -template <> -struct vector_type -{ - using VectorType = float4; -}; -#endif - template __host__ __device__ constexpr T max(T a, T b) { diff --git a/src/include/config.h.in b/src/include/config.h.in index 9ee0c41f80..7b888c6951 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in 
@@ -4,10 +4,8 @@ #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" -#include "half.hpp" #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" #include "nvToolsExt.h" #include "helper_cuda.h" -#include "cuda_fp16.h" #endif diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp index 598d5c3c71..d3f645eaae 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hip.hpp @@ -47,3 +47,11 @@ struct static_const_reduce_n<1> return f(Number<0>{}); } }; + +#if 0 +template +__host__ __device__ constexpr auto unpacker(F f) +{ + return [=](auto xs_array){ f(xs...); }; +} +#endif \ No newline at end of file diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 5901c42e55..825977ab54 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -27,12 +27,14 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const typename vector_type::VectorType* const __restrict__ p_in_vec_global, - const typename vector_type::VectorType* const __restrict__ p_wei_vec_global, + const typename vector_type::MemoryType* const __restrict__ p_in_vec_global, + const typename vector_type::MemoryType* const __restrict__ p_wei_vec_global, Float* const __restrict__ p_out_global) { - using scalar_t = Float; - using vector_t = typename vector_type::VectorType; + using scalar_t = Float; + using vector_mem_t = typename vector_type::MemoryType; constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -69,6 +71,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( // shared mem constexpr unsigned in_block_size = in_nchw_vec_block_desc.GetElementSpace(Number{}); + constexpr unsigned wei_block_size = wei_kcyx_vec_block_desc.GetElementSpace(Number{}); @@ -76,8 +79,10 @@ 
__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ? InBlockCopyDataPerRead : WeiBlockCopyDataPerRead; - __shared__ vector_t p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; - __shared__ vector_t p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + __shared__ vector_mem_t + p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ vector_mem_t + p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; @@ -150,7 +155,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1 +template __device__ void threadwise_direct_convolution_1(InDesc, - Float* const __restrict__ p_in, + TInWei* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + TInWei* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + TOut* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -51,25 +51,10 @@ __device__ void threadwise_direct_convolution_1(InDesc, const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo); - p_out[out_index] += p_wei[wei_index] * p_in[in_index]; - -#if 0 - // if(threadIdx.x == 0) - { - printf("threadwise_direct_convolution: \t" - "threadIdx.x %u\t" - "out_index %u, p_out[out_index] %f, \t" - "wei_index %u, p_wei[wei_index] %f, \t" - "in_index %u, p_in[in_index] %f\n", - threadIdx.x, - out_index, - p_out[out_index], - wei_index, - p_wei[wei_index], - in_index, - p_in[in_index]); - } -#endif + fused_multiply_add(p_out[out_index], + p_wei[wei_index], + p_in[in_index], + p_out[out_index]); } } } @@ -81,13 +66,13 @@ __device__ void threadwise_direct_convolution_1(InDesc, // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register // Copy in and wei into register before doing convolution -template +template __device__ 
void threadwise_direct_convolution_2(InDesc, - Float* const __restrict__ p_in, + TInWei* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + TInWei* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + TOut* __restrict__ p_out) { constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; @@ -97,8 +82,8 @@ __device__ void threadwise_direct_convolution_2(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths()); // register - Float p_in_reg[in_reg_desc.GetElementSpace()]; - Float p_wei_reg[wei_reg_desc.GetElementSpace()]; + TInWei p_in_reg[in_reg_desc.GetElementSpace()]; + TInWei p_wei_reg[wei_reg_desc.GetElementSpace()]; // copy input tensor into register threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths()); @@ -114,13 +99,13 @@ __device__ void threadwise_direct_convolution_2(InDesc, // optimized for scenario where p_in and p_wei are in LDS, p_out is in register // break down a non-1x1 convolution into a sequence of 1x1 convolutions, // load 1x1 weight into register, and do 1x1 convolution in register. 
-template +template __device__ void threadwise_direct_convolution_3(InDesc, - Float* const __restrict__ p_in, + Data* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + Data* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + Data* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -139,8 +124,8 @@ __device__ void threadwise_direct_convolution_3(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor( Sequence{}); - Float p_in_reg[in_reg_desc.GetElementSpace()]; - Float p_wei_reg[wei_reg_desc.GetElementSpace()]; + Data p_in_reg[in_reg_desc.GetElementSpace()]; + Data p_wei_reg[wei_reg_desc.GetElementSpace()]; constexpr unsigned in_w_new_read = 1; From 18ffbd680273c5970ff1d105c1ee6fca99e0df88 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Mar 2019 18:16:16 -0500 Subject: [PATCH 08/13] adding fp16 direct that reads pre-vectorized data --- src/include/data_type.hip.hpp | 157 ++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 src/include/data_type.hip.hpp diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp new file mode 100644 index 0000000000..fb0bdabee3 --- /dev/null +++ b/src/include/data_type.hip.hpp @@ -0,0 +1,157 @@ +#pragma once +#include "config.h" + +#if DEVICE_BACKEND_CUDA +namespace CUDA { +#include "cuda_fp16.h" +} +#endif + +using half = CUDA::half; +using half2 = CUDA::half2; + +struct half4 +{ + half data[4]; +}; + +struct half8 +{ + half data[8]; +}; + +template +struct vector_type +{ +}; + +template <> +struct vector_type +{ + using MemoryType = float; +}; + +template <> +struct vector_type +{ + using MemoryType = float2; + + __host__ __device__ static MemoryType Pack(float s0, float s1) + { + union + { + MemoryType vector; + float scalar[2]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float4; +}; + 
+template <> +struct vector_type +{ + using MemoryType = float4; +}; + +template <> +struct vector_type +{ + using MemoryType = half; + + __host__ __device__ static MemoryType Pack(half s) { return s; } +}; + +template <> +struct vector_type +{ + using MemoryType = half2; + + __host__ __device__ static MemoryType Pack(half s0, half s1) + { + union + { + MemoryType vector; + half scalar[2]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float2; +}; + +template <> +struct vector_type +{ + using MemoryType = float4; +}; + +template <> +struct vector_type +{ + using MemoryType = half2; +}; + +template <> +struct vector_type +{ + using MemoryType = float2; +}; + +template <> +struct vector_type +{ + using MemoryType = float4; +}; + +template +__device__ void fused_multiply_add(TDst& d, TSrc0 s0, TSrc1 s1, TSrc2 s2) +{ + printf("should not call into base"); + assert(false); +} + +template <> +__device__ void fused_multiply_add(float& d, float s0, float s1, float s2) +{ + d = s0 * s1 + s2; +} + +template <> +__device__ void fused_multiply_add(float& d, float2 s0, float2 s1, float s2) +{ + d = s0.x * s1.x + s0.y * s1.y + s2; +} + +template <> +__device__ void fused_multiply_add(float& d, float4 s0, float4 s1, float s2) +{ + d = s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w + s2; +} + +template <> +__device__ void fused_multiply_add(half& d, half s0, half s1, half s2) +{ + d = s0 * s1 + s2; +} + +template <> +__device__ void fused_multiply_add(half& d, half2 s0, half2 s1, half s2) +{ + d = s0.x * s1.x + s0.y * s1.y + s2; +} \ No newline at end of file From 050a1a6890007d577f9c83cef6e576a76788004a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 19 Mar 2019 00:05:41 -0500 Subject: [PATCH 09/13] adding int8 direct that reads pre-vectorized data --- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 71 ++++++--- driver/driver.hip.cpp | 65 ++++++--- 
src/include/data_type.hip.hpp | 138 +++++++++++++++--- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 30 ++-- src/include/threadwise_4d_tensor_op.hip.hpp | 32 ++-- .../threadwise_direct_convolution.hip.hpp | 6 +- 6 files changed, 247 insertions(+), 95 deletions(-) diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index d16d05d978..5c687f4206 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -3,17 +3,18 @@ #include "device.hpp" #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" -template +template void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, - const Tensor& in_nchw, + const Tensor& in_nchw, WeiDesc, - const Tensor& wei_kcyx, + const Tensor& wei_kcyx, OutDesc, - Tensor& out_nkhw, + Tensor& out_nkhw, unsigned nrepeat) { - constexpr unsigned NVector = 1; - using vector_t = vector_type; + constexpr unsigned NVector = 4; + using accum_t = int32_t; + using vector_t = vector_type; using vector_mem_t = typename vector_t::MemoryType; constexpr auto I0 = Number<0>{}; @@ -44,11 +45,16 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { -#if 1 +#if 0 in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); -#else +#elif 0 in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); +#elif 1 + in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), + in_nchw(n, 4 * c + 1, h, w), + in_nchw(n, 4 * c + 2, h, w), + in_nchw(n, 4 * c + 3, h, w)); #endif }; @@ -62,11 +68,16 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { -#if 1 
+#if 0 wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); -#else +#elif 0 wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); +#elif 1 + wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x), + wei_kcyx(k, 4 * c + 1, y, x), + wei_kcyx(k, 4 * c + 2, y, x), + wei_kcyx(k, 4 * c + 3, y, x)); #endif }; @@ -76,13 +87,13 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, // DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace()); DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace()); - DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace()); in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 +#if 0 // 3x3, 34x34, 128 thread, fp32, vector = 1 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; @@ -100,7 +111,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // 3x3, 34x34, 128 thread, fp32, vector = 2 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; @@ -117,9 +128,27 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 2; + constexpr unsigned BlockSize = 128; +#elif 0 + // 3x3, 34x34, 128 thread, int8, vector = 4 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 1; + constexpr unsigned 
HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 2; + constexpr unsigned BlockSize = 128; #elif 1 - // 3x3, 34x34, 128 thread, fp16 + // 1x1, 32x32, 128 thread, int8, vector = 4 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; constexpr unsigned CPerBlock = 4; @@ -128,12 +157,12 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned NPerThread = 2; constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; - constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; #endif @@ -146,7 +175,9 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { float time = launch_kernel( - gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw, dim3(GridSize), dim3(BlockSize), - static_cast(in_nchw_vec_device_buf.GetDeviceBuffer()), - static_cast(wei_kcyx_vec_device_buf.GetDeviceBuffer()), - static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + static_cast(in_nchw_vec_device_buf.GetDeviceBuffer()), + static_cast(wei_kcyx_vec_device_buf.GetDeviceBuffer()), + static_cast(out_nkhw_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 47ff1b6882..17b333c69a 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -88,9 +88,12 @@ auto make_TensorDescriptor(TConstTensorDesc) return TensorDescriptor(lengths, strides); } -template -void host_direct_convolution( - const Tensor& in_nchw, const Tensor& wei_kcyx, Tensor& out, LowerPads, UpperPads) +template +void host_direct_convolution(const Tensor& in_nchw, 
+ const Tensor& wei_kcyx, + Tensor& out_nkhw, + LowerPads, + UpperPads) { unsigned h_pad_low = LowerPads{}.Get(Number<0>{}); unsigned w_pad_low = LowerPads{}.Get(Number<1>{}); @@ -116,21 +119,24 @@ void host_direct_convolution( } } } - out(n, k, ho, wo) = v; + out_nkhw(n, k, ho, wo) = v; }; auto f_par = make_ParallelTensorFunctor(f, - out.mDesc.GetLengths()[0], - out.mDesc.GetLengths()[1], - out.mDesc.GetLengths()[2], - out.mDesc.GetLengths()[3]); + out_nkhw.mDesc.GetLengths()[0], + out_nkhw.mDesc.GetLengths()[1], + out_nkhw.mDesc.GetLengths()[2], + out_nkhw.mDesc.GetLengths()[3]); f_par(std::thread::hardware_concurrency()); } -template -void host_winograd_3x3_convolution( - const Tensor& in_nchw, const Tensor& wei_kcyx, Tensor& out, LowerPads, UpperPads) +template +void host_winograd_3x3_convolution(const Tensor& in_nchw, + const Tensor& wei_kcyx, + Tensor& out_nkhw, + LowerPads, + UpperPads) { constexpr std::size_t HoPerTile = 2; constexpr std::size_t WoPerTile = 2; @@ -144,8 +150,8 @@ void host_winograd_3x3_convolution( std::size_t Y = wei_kcyx.mDesc.GetLengths()[2]; std::size_t X = wei_kcyx.mDesc.GetLengths()[3]; - std::size_t HO = out.mDesc.GetLengths()[2]; - std::size_t WO = out.mDesc.GetLengths()[3]; + std::size_t HO = out_nkhw.mDesc.GetLengths()[2]; + std::size_t WO = out_nkhw.mDesc.GetLengths()[3]; unsigned h_pad_low = LowerPads{}.Get(Number<0>{}); unsigned w_pad_low = LowerPads{}.Get(Number<1>{}); @@ -180,7 +186,7 @@ void host_winograd_3x3_convolution( } else { - in_hold(n, c, htile, wtile, j, i) = T(0); + in_hold(n, c, htile, wtile, j, i) = TIn(0); } } } @@ -347,8 +353,8 @@ void host_winograd_3x3_convolution( std::size_t ho = HoPerTile * htile + j; for(int i = 0; i < WoPerTile; ++i) { - std::size_t wo = WoPerTile * wtile + i; - out(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); + std::size_t wo = WoPerTile * wtile + i; + out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } }; @@ -403,7 +409,7 @@ int main(int argc, char* argv[]) constexpr 
unsigned HPad = 0; constexpr unsigned WPad = 0; -#elif 1 +#elif 0 // 3x3, 34x34 constexpr unsigned N = 64; constexpr unsigned C = 256; @@ -502,7 +508,7 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 1; constexpr unsigned WPad = 1; -#elif 1 +#elif 0 // 1x1 filter, 28x28 image constexpr unsigned N = 16; constexpr unsigned C = 256; @@ -562,6 +568,18 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 2; constexpr unsigned WPad = 2; +#elif 1 + // 1x1 filter, 32x32 image + constexpr unsigned N = 64; + constexpr unsigned C = 256; + constexpr unsigned HI = 32; + constexpr unsigned WI = 32; + constexpr unsigned K = 512; + constexpr unsigned Y = 1; + constexpr unsigned X = 1; + + constexpr unsigned HPad = 0; + constexpr unsigned WPad = 0; #endif auto lower_pads = Sequence{}; @@ -576,11 +594,12 @@ int main(int argc, char* argv[]) ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - using Float = float; - Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); - Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); - Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); - Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); + using in_data_t = char; + using out_data_t = int32_t; + Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); + Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); + Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); + Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); std::size_t num_thread = std::thread::hardware_concurrency(); diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index fb0bdabee3..bb60adf154 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -10,16 +10,6 @@ namespace CUDA { using half = CUDA::half; using half2 = CUDA::half2; -struct half4 -{ - half data[4]; -}; - -struct half8 -{ - half data[8]; -}; - template 
struct vector_type { @@ -119,39 +109,141 @@ struct vector_type using MemoryType = float4; }; -template -__device__ void fused_multiply_add(TDst& d, TSrc0 s0, TSrc1 s1, TSrc2 s2) +template <> +struct vector_type { + using MemoryType = char; + + __host__ __device__ static MemoryType Pack(char s) { return s; } +}; + +template <> +struct vector_type +{ + using MemoryType = char2; + + __host__ __device__ static MemoryType Pack(char s0, char s1) + { + union + { + MemoryType vector; + char scalar[2]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = char4; + + __host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3) + { + union + { + MemoryType vector; + char scalar[4]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = int64_t; +}; + +template <> +struct vector_type +{ + using MemoryType = char4; +}; + +template <> +struct vector_type +{ + using MemoryType = int64_t; +}; + +template <> +struct vector_type +{ + using MemoryType = int64_t; +}; + +template +__device__ void fused_multiply_accumulate(TDst& d, const TSrc0& s0, const TSrc1& s1) +{ + // static_assert(false, "should not call into base"); printf("should not call into base"); assert(false); } template <> -__device__ void fused_multiply_add(float& d, float s0, float s1, float s2) +__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { - d = s0 * s1 + s2; + d += s0 * s1; } template <> -__device__ void fused_multiply_add(float& d, float2 s0, float2 s1, float s2) +__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1) { - d = s0.x * s1.x + s0.y * s1.y + s2; + d += s0.x * s1.x; + d += s0.y * s1.y; } template <> -__device__ void fused_multiply_add(float& d, float4 s0, float4 s1, float s2) +__device__ 
void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1) { - d = s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w + s2; + d += s0.x * s1.x; + d += s0.y * s1.y; + d += s0.z * s1.z; + d += s0.w * s1.w; } template <> -__device__ void fused_multiply_add(half& d, half s0, half s1, half s2) +__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { - d = s0 * s1 + s2; + d += s0 * s1; } template <> -__device__ void fused_multiply_add(half& d, half2 s0, half2 s1, half s2) +__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1) { - d = s0.x * s1.x + s0.y * s1.y + s2; -} \ No newline at end of file + d += s0.x * s1.x; + d += s0.y * s1.y; +} + +#if 0 +template <> +__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1) +{ + d += s0.x * s1.x + s0.y * s1.y; +} +#endif + +template <> +__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) +{ + d += s0 * s1; +} + +template <> +__device__ void fused_multiply_accumulate(int32_t& d, const char4& s0, const char4& s1) +{ +#if DEVICE_BACKEND_CUDA + d = __dp4a(s0, s1, d); +#else + d += s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w; +#endif +} diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 825977ab54..7ae594788b 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -7,7 +7,9 @@ #include "threadwise_4d_tensor_op.hip.hpp" #include "threadwise_direct_convolution.hip.hpp" -template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const typename vector_type::MemoryType* const __restrict__ p_in_vec_global, - const typename vector_type::MemoryType* const __restrict__ p_wei_vec_global, - Float* const __restrict__ p_out_global) + 
TOut* const __restrict__ p_out_global) { - using scalar_t = Float; - using vector_mem_t = typename vector_type::MemoryType; + using in_scalar_t = TInWei; + using in_vector_mem_t = typename vector_type::MemoryType; + using out_scalar_t = TOut; + using accum_t = TAccum; constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -79,9 +83,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ? InBlockCopyDataPerRead : WeiBlockCopyDataPerRead; - __shared__ vector_mem_t + __shared__ in_vector_mem_t p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; - __shared__ vector_mem_t + __shared__ in_vector_mem_t p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors @@ -99,7 +103,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc); // register - scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; + out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; // divide block work constexpr unsigned NBlockWork = @@ -155,7 +159,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; #endif +#if 1 // debug // set threadwise output tensor to 0 threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread); +#endif for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, __syncthreads()) diff --git a/src/include/threadwise_4d_tensor_op.hip.hpp b/src/include/threadwise_4d_tensor_op.hip.hpp index 3d13ae2aa6..5b908d3ac6 100644 --- a/src/include/threadwise_4d_tensor_op.hip.hpp +++ b/src/include/threadwise_4d_tensor_op.hip.hpp @@ -37,7 +37,8 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re // TODO: in order to optimize mem access for different mem type, // need to write specialized version -template __device__ void 
threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src( SrcDesc, - const Float* __restrict__ p_src, + const SrcData* __restrict__ p_src, DstDesc, - Float* __restrict__ p_dst, + DstData* __restrict__ p_dst, SrcOpLengths, DstFromSrcReorder, F f) @@ -88,33 +89,38 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_d } } -template -__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p) +template +__device__ void threadwise_4d_tensor_set_zero(Desc, Data* __restrict__ p) { - auto f_set_zero = [](Float& v) { v = Float(0); }; + auto f_set_zero = [](Data& v) { v = Data(0); }; - threadwise_4d_tensor_pointwise_operation_unary( + threadwise_4d_tensor_pointwise_operation_unary( Desc{}, p, f_set_zero); } -template +template __device__ void threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, - const Float* __restrict__ p_src, + const SrcData* __restrict__ p_src, DstDesc, - Float* __restrict__ p_dst, + DstData* __restrict__ p_dst, SrcOpLengths, DstFromSrcReorder) { - auto f_copy = [](const Float& src, Float& dst) { dst = src; }; + auto f_copy = [](const SrcData& src, DstData& dst) { dst = static_cast(src); }; threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src( SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy); } -template +template __device__ void threadwise_4d_tensor_copy( - SrcDesc, const Float* __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths) + SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths) { auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; diff --git a/src/include/threadwise_direct_convolution.hip.hpp b/src/include/threadwise_direct_convolution.hip.hpp index 79677b0f9b..b9a509d6a0 100644 --- a/src/include/threadwise_direct_convolution.hip.hpp +++ b/src/include/threadwise_direct_convolution.hip.hpp @@ -51,10 +51,8 @@ __device__ void 
threadwise_direct_convolution_1(InDesc, const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo); - fused_multiply_add(p_out[out_index], - p_wei[wei_index], - p_in[in_index], - p_out[out_index]); + fused_multiply_accumulate( + p_out[out_index], p_wei[wei_index], p_in[in_index]); } } } From 02d72160dc22e364d1ceb65ca4f59892c097946a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 19 Mar 2019 01:30:28 -0500 Subject: [PATCH 10/13] adding int8 direct that reads pre-vectorized data --- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 28 +++++----- src/include/data_type.hip.hpp | 56 +++++++++---------- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index 5c687f4206..b25459c30d 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -133,14 +133,14 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, // 3x3, 34x34, 128 thread, int8, vector = 4 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; - constexpr unsigned HoPerBlock = 2; + constexpr unsigned CPerBlock = 8; + constexpr unsigned HoPerBlock = 4; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 1; - constexpr unsigned HoPerThread = 2; + constexpr unsigned NPerThread = 1; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 4; constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; @@ -149,16 +149,16 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned BlockSize = 128; #elif 1 // 1x1, 32x32, 128 thread, int8, vector = 4 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - 
constexpr unsigned CPerBlock = 4; - constexpr unsigned HoPerBlock = 2; + constexpr unsigned NPerBlock = 1; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 16; + constexpr unsigned HoPerBlock = 4; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 1; - constexpr unsigned HoPerThread = 2; + constexpr unsigned NPerThread = 1; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 4; constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index bb60adf154..c32e93b6ef 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -120,7 +120,7 @@ struct vector_type template <> struct vector_type { - using MemoryType = char2; + using MemoryType = int16_t; __host__ __device__ static MemoryType Pack(char s0, char s1) { @@ -139,7 +139,7 @@ struct vector_type template <> struct vector_type { - using MemoryType = char4; + using MemoryType = int32_t; __host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3) { @@ -163,6 +163,13 @@ struct vector_type using MemoryType = int64_t; }; +template <> +struct vector_type +{ + using MemoryType = int64_t; +}; + +#if 0 template <> struct vector_type { @@ -175,34 +182,30 @@ struct vector_type using MemoryType = int64_t; }; +template <> +struct vector_type +{ + using MemoryType = int; +}; + template <> struct vector_type { using MemoryType = int64_t; }; +#endif -template -__device__ void fused_multiply_accumulate(TDst& d, const TSrc0& s0, const TSrc1& s1) -{ - // static_assert(false, "should not call into base"); - printf("should not call into base"); - assert(false); -} - -template <> __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { d += s0 * s1; } -template <> __device__ void 
fused_multiply_accumulate(float& d, const float2& s0, const float2& s1) { d += s0.x * s1.x; d += s0.y * s1.y; } -template <> __device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1) { d += s0.x * s1.x; @@ -211,13 +214,8 @@ __device__ void fused_multiply_accumulate(float& d, const float4& s0, const floa d += s0.w * s1.w; } -template <> -__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) -{ - d += s0 * s1; -} +__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; } -template <> __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1) { d += s0.x * s1.x; @@ -225,25 +223,25 @@ __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& } #if 0 -template <> __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1) { d += s0.x * s1.x + s0.y * s1.y; } #endif -template <> -__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) -{ - d += s0 * s1; -} +__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; } -template <> -__device__ void fused_multiply_accumulate(int32_t& d, const char4& s0, const char4& s1) +// TODO:: this interface is misleading, int32 is actually int8x4 +// need to make a better interface +__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1) { #if DEVICE_BACKEND_CUDA +#if 1 // debug d = __dp4a(s0, s1, d); -#else - d += s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w; +#elif 1 + asm volatile("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(d) : "r"(s0), "r"(s1), "r"(d)); +#elif 0 // this is wrong! 
just for debugging + d += (*reinterpret_cast(&s0)) * (*reinterpret_cast(&s1)); +#endif #endif } From e72eece8fcf79d1d3a958089fca1f02bfb71b777 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 21 Mar 2019 09:59:40 -0500 Subject: [PATCH 11/13] added int8x4 --- driver/driver.hip.cpp | 2 +- src/include/data_type.hip.hpp | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 17b333c69a..b1df58265e 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -617,7 +617,7 @@ int main(int argc, char* argv[]) #if 0 in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); -#elif 1 +#elif 0 in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #elif 1 diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index c32e93b6ef..ca1f4dcbae 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -231,17 +231,11 @@ __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2 __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; } -// TODO:: this interface is misleading, int32 is actually int8x4 +// TODO:: this interface is misleading, s0, s1 are actually int8x4 // need to make a better interface __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1) { #if DEVICE_BACKEND_CUDA -#if 1 // debug d = __dp4a(s0, s1, d); -#elif 1 - asm volatile("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(d) : "r"(s0), "r"(s1), "r"(d)); -#elif 0 // this is wrong! 
just for debugging - d += (*reinterpret_cast(&s0)) * (*reinterpret_cast(&s1)); -#endif #endif } From 8c923db423ab4ca0a7ac10310cff3528b38bb520 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 22 Mar 2019 14:22:58 -0500 Subject: [PATCH 12/13] hip build --- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 30 ++++++------- ...icit_gemm_convolution_1_chwn_cyxk_khwn.hpp | 37 +++++++++++++++- driver/driver.hip.cpp | 32 +++++++++----- src/include/Array.hip.hpp | 2 +- src/include/blockwise_4d_tensor_op.hip.hpp | 9 ++-- .../blockwise_direct_convolution.hip.hpp | 27 ++++++------ src/include/blockwise_gemm.hip.hpp | 29 +++++++------ src/include/common.hip.hpp | 2 + src/include/config.h.in | 2 + src/include/data_type.hip.hpp | 33 ++------------ .../gridwise_direct_convolution_1.hip.hpp | 19 ++++---- ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp | 43 +++++++++++-------- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 34 ++++++++------- ..._gemm_convolution_1_chwn_cyxk_khwn.hip.hpp | 28 ++++++------ ...onvolution_1_chwn_cyxk_khwn_padded.hip.hpp | 9 ++-- ..._gemm_convolution_2_chwn_cyxk_khwn.hip.hpp | 4 +- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 4 +- src/include/tensor.hpp | 3 +- src/include/threadwise_nd_tensor_op.hip.hpp | 4 +- 19 files changed, 196 insertions(+), 155 deletions(-) diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index b25459c30d..d8bb3b768e 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -51,7 +51,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); #elif 1 - in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), + in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), in_nchw(n, 4 * c + 1, h, w), in_nchw(n, 4 * c + 2, h, 
w), in_nchw(n, 4 * c + 3, h, w)); @@ -113,37 +113,37 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned BlockSize = 128; #elif 0 // 3x3, 34x34, 128 thread, fp32, vector = 2 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 2; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 2; constexpr unsigned HoPerBlock = 2; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 1; + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; - constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; #elif 0 // 3x3, 34x34, 128 thread, int8, vector = 4 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 8; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 8; constexpr unsigned HoPerBlock = 4; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 1; - constexpr unsigned KPerThread = 8; - constexpr unsigned CPerThread = 2; + constexpr unsigned NPerThread = 1; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 2; constexpr unsigned HoPerThread = 4; constexpr unsigned WoPerThread = 2; - constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp index 2c27080670..99a6eb45c1 100644 --- 
a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp @@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_khwn_device_buf.ToDevice(out_khwn.mData.data()); -#if 1 +#if 0 // for 3x3, 34x34 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 64; @@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // for 1x1, 28x28 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 128; @@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, constexpr unsigned OutThreadCopyDataPerWrite = 2; + constexpr unsigned BlockSize = 128; +#elif 1 + // for 1x1, 14x14 + constexpr unsigned NPerBlock = 16; + constexpr unsigned KPerBlock = 128; + constexpr unsigned CPerBlock = 8; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 2; + + constexpr unsigned NPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; + constexpr unsigned HoPerThread = 1; + constexpr unsigned WoPerThread = 1; + + constexpr unsigned InBlockCopy_ThreadPerDimC = 8; + constexpr unsigned InBlockCopy_ThreadPerDimH = 2; + constexpr unsigned InBlockCopy_ThreadPerDimW = 2; + constexpr unsigned InBlockCopy_ThreadPerDimN = 4; + constexpr unsigned InBlockCopyDataPerRead = 4; + + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned GemmMPerThreadSubC = 4; + constexpr unsigned GemmNPerThreadSubC = 4; + constexpr unsigned GemmMLevel0Cluster = 4; + constexpr unsigned GemmNLevel0Cluster = 2; + constexpr unsigned GemmMLevel1Cluster = 2; + constexpr unsigned GemmNLevel1Cluster = 4; + constexpr unsigned GemmKPerThreadLoop = 1; + + constexpr unsigned OutThreadCopyDataPerWrite = 2; + constexpr unsigned BlockSize = 128; #endif diff --git 
a/driver/driver.hip.cpp b/driver/driver.hip.cpp index b1df58265e..d391ec5b5f 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -8,11 +8,11 @@ #include "ConstantTensorDescriptor.hip.hpp" #include "conv_common.hip.hpp" //#include "device_direct_convolution_1.hpp" -#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" -#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" -//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" +//#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" +//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" +#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" -//#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" +#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" struct GeneratorTensor_1 { @@ -353,7 +353,7 @@ void host_winograd_3x3_convolution(const Tensor& in_nchw, std::size_t ho = HoPerTile * htile + j; for(int i = 0; i < WoPerTile; ++i) { - std::size_t wo = WoPerTile * wtile + i; + std::size_t wo = WoPerTile * wtile + i; out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } @@ -568,7 +568,7 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 2; constexpr unsigned WPad = 2; -#elif 1 +#elif 0 // 1x1 filter, 32x32 image constexpr unsigned N = 64; constexpr unsigned C = 256; @@ -578,6 +578,18 @@ int main(int argc, char* argv[]) constexpr unsigned Y = 1; constexpr unsigned X = 1; + constexpr unsigned HPad = 0; + constexpr unsigned WPad = 0; +#elif 1 + // 1x1 filter, 14x14 image + constexpr unsigned N = 128; + constexpr unsigned C = 2048; + constexpr unsigned HI = 14; + constexpr unsigned WI = 14; + constexpr unsigned K = 512; + constexpr unsigned Y = 1; + constexpr unsigned X = 1; + constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; #endif @@ -594,8 +606,8 @@ int main(int argc, char* argv[]) 
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - using in_data_t = char; - using out_data_t = int32_t; + using in_data_t = float; + using out_data_t = float; Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); @@ -635,9 +647,9 @@ int main(int argc, char* argv[]) device_direct_convolution_1 #elif 0 device_direct_convolution_2_nchw_kcyx_nkhw -#elif 1 - device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 0 + device_direct_convolution_2_vectorized_nchw_kcyx_nkhw +#elif 1 device_implicit_gemm_convolution_1_chwn_cyxk_khwn #elif 0 device_implicit_gemm_convolution_2_chwn_cyxk_khwn diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 1caab6a4c9..89654cbc2b 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -10,7 +10,7 @@ struct Array unsigned mData[nSize]; template - __host__ __device__ Array(Xs... xs) : mData({static_cast(xs)...}) + __host__ __device__ Array(Xs... 
xs) : mData{static_cast(xs)...} { } diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index fa5f36be51..0660c34ebb 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -340,10 +340,11 @@ struct BlockwiseChwnTensorCopyPadded constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; const Float* p_src_tmp = - p_src + src_desc.Get1dIndex(c_block_data_begin, - (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, - (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, - n_block_data_begin); + p_src + + src_desc.Get1dIndex(c_block_data_begin, + (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, + (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, + n_block_data_begin); #if 0 if(get_thread_local_1d_id() == 0) diff --git a/src/include/blockwise_direct_convolution.hip.hpp b/src/include/blockwise_direct_convolution.hip.hpp index 247ff219f6..7666607c9c 100644 --- a/src/include/blockwise_direct_convolution.hip.hpp +++ b/src/include/blockwise_direct_convolution.hip.hpp @@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, Float p_out_thread[out_thread_desc.GetElementSpace()]; threadwise_4d_tensor_copy(out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc, p_out_thread, out_thread_desc.GetLengths()); @@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, // threadwise convolution threadwise_direct_convolution_2( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data_begin, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + 
c_thread_data_begin, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), @@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, threadwise_4d_tensor_copy(out_thread_desc, p_out_thread, out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc.GetLengths()); } } diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 33556dde25..221a7153a2 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 constexpr unsigned MRepeat = MPerThread / MPerThreadSubC; constexpr unsigned NRepeat = NPerThread / NPerThreadSubC; - // loop over k +// loop over k #pragma unroll for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) { - // read first batch of A, B - // copy A-sub to form A +// read first batch of A, B +// copy A-sub to form A #pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { @@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 a_thread_sub_mtx.GetLengths()); } - // copy B-sub to form B +// copy B-sub to form B #pragma unroll for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { @@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 b_thread_sub_mtx.GetLengths()); } - // loop over batch +// loop over batch #pragma unroll for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib) { @@ -551,14 +551,15 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 c_thread_mtx_begin.batch * BlockMatrixStrideC + 
c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col); - for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat) + for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { - for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat) + for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { threadwise_matrix_copy( c_thread_sub_mtx, - p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, - n_repeat * NPerLevel1Cluster), + p_c_thread + + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, + n_repeat * NPerLevel1Cluster), c_block_mtx, p_c_block + c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, @@ -656,8 +657,9 @@ struct BlockwiseGemmBlockABlockBThreadC constexpr unsigned NClusterWork = (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster); - static_assert(BlockSize == (MClusterWork * MThreadPerCluster) * - (NClusterWork * NThreadPerCluster), + static_assert(BlockSize == + (MClusterWork * MThreadPerCluster) * + (NClusterWork * NThreadPerCluster), "wrong! wrong BlockSize"); if(DistributeThreadAlongColumnFirst) @@ -1256,8 +1258,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), c_thread_sub_mtx, False, - p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, - n_repeat * NPerThreadSubC), + p_c_thread + + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, + n_repeat * NPerThreadSubC), f_accum); } } diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index d5832dde9d..ba0a521fb3 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -21,6 +21,7 @@ struct is_same static const bool value = true; }; +#if 0 template __host__ __device__ constexpr T max(T a, T b) { @@ -32,6 +33,7 @@ __host__ __device__ constexpr T min(T a, T b) { return a < b ? 
a : b; } +#endif __host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b) { diff --git a/src/include/config.h.in b/src/include/config.h.in index 7b888c6951..bb4f6cb51d 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in @@ -4,8 +4,10 @@ #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" +#include "hip/hip_fp16.h" #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" +#include "cuda_fp16.h" #include "nvToolsExt.h" #include "helper_cuda.h" #endif diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index ca1f4dcbae..95d5b0b33f 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -1,15 +1,6 @@ #pragma once #include "config.h" -#if DEVICE_BACKEND_CUDA -namespace CUDA { -#include "cuda_fp16.h" -} -#endif - -using half = CUDA::half; -using half2 = CUDA::half2; - template struct vector_type { @@ -52,6 +43,7 @@ struct vector_type using MemoryType = float4; }; +#if 0 template <> struct vector_type { @@ -91,24 +83,6 @@ struct vector_type using MemoryType = float4; }; -template <> -struct vector_type -{ - using MemoryType = half2; -}; - -template <> -struct vector_type -{ - using MemoryType = float2; -}; - -template <> -struct vector_type -{ - using MemoryType = float4; -}; - template <> struct vector_type { @@ -169,7 +143,6 @@ struct vector_type using MemoryType = int64_t; }; -#if 0 template <> struct vector_type { @@ -214,6 +187,7 @@ __device__ void fused_multiply_accumulate(float& d, const float4& s0, const floa d += s0.w * s1.w; } +#if 0 __device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; } __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1) @@ -222,12 +196,10 @@ __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& d += s0.y * s1.y; } -#if 0 __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1) { d += s0.x * s1.x + s0.y * s1.y; } -#endif 
__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; } @@ -239,3 +211,4 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i d = __dp4a(s0, s1, d); #endif } +#endif diff --git a/src/include/gridwise_direct_convolution_1.hip.hpp b/src/include/gridwise_direct_convolution_1.hip.hpp index f4fe1809fc..edcfd6d38e 100644 --- a/src/include/gridwise_direct_convolution_1.hip.hpp +++ b/src/include/gridwise_direct_convolution_1.hip.hpp @@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ c_block_work_begin += CPerBlock) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, - c_block_work_begin, - hi_block_work_begin, - wi_block_work_begin), + blockwise_in_copy.Run(p_in_global + + in_global_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), p_in_block); // copy weight tensor to LDS @@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ } // copy output tensor from LDS to device mem - blockwise_out_copy.Run(p_out_block, - p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, - k_block_work_begin, - ho_block_work_begin, - wo_block_work_begin)); + blockwise_out_copy.Run( + p_out_block, + p_out_global + + out_global_desc.Get1dIndex( + n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); } diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index 5761a22c16..1e6d3d24bd 100644 --- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -176,16 +176,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i c_block_data_begin += CPerBlock, __syncthreads()) { // 
copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( - k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run( + p_wei_global + + wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -195,10 +197,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #if 1 threadwise_direct_convolution_2( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -207,10 +210,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #elif 0 threadwise_direct_convolution_3( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -225,9 +229,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + 
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 7ae594788b..4d72368b29 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -200,9 +200,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( p_in_vec_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex( - k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_vec_block); + blockwise_wei_copy.Run( + p_wei_vec_global + + wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_vec_block); __syncthreads(); @@ -212,10 +213,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #if 1 threadwise_direct_convolution_2( in_nchw_vec_thread_block_desc, - p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -224,10 +226,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #elif 0 
threadwise_direct_convolution_3( in_nchw_vec_thread_block_desc, - p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -242,9 +245,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp index 1caef669e9..99342d3ca1 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp @@ -184,8 +184,9 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread); const Float* p_in_global_block_begin = - p_in_global + in_chwn_global_desc.Get1dIndex( - 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); + p_in_global + + in_chwn_global_desc.Get1dIndex( + 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_begin = p_wei_global + 
wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric } } - // output: register to global mem, +// output: register to global mem, #if 0 const auto c_thread_mtx_begin = blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); @@ -286,16 +287,17 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric } #endif - threadwise_8d_tensor_copy(out_8d_thread_desc, - p_out_thread, - out_8d_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_8d_thread_desc.GetLengths(), - Number{}); + threadwise_8d_tensor_copy( + out_8d_thread_desc, + p_out_thread, + out_8d_global_desc, + p_out_global + + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_8d_thread_desc.GetLengths(), + Number{}); } else if(NPerThread == NPerBlock) { diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp index a4904cdf58..790a006023 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp @@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( out_hkwn_thread_desc, p_out_thread, out_khwn_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), + 
p_out_global + + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_khwn_from_hkwn); } diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp index afa3d3ee90..f68b57b6a0 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp @@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric decltype(in_cb_block_desc), decltype(in_cb_block_desc.GetLengths())>{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3) { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 && SrcOpLengths::nDim == 6, @@ -80,7 +80,7 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc, SrcOpLengths, Number) { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 && SrcOpLengths::nDim == 8, From 18a81e356eef305b244db0e7c46772925a540a44 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 22 Mar 2019 16:33:04 -0500 Subject: [PATCH 13/13] adding assembly --- src/include/blockwise_gemm.hip.hpp | 113 ++++++++++++++++++++++++++--- 1 file changed, 102 insertions(+), 11 deletions(-) diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 221a7153a2..f80c49a029 100644 --- 
a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -435,11 +435,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 #pragma unroll for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) { -// read first batch of A, B -// copy A-sub to form A -#pragma unroll + // read first batch of A, B + // copy A-sub to form A + //#pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { +#if 0 threadwise_matrix_copy( a_block_mtx, p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) + @@ -447,12 +448,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 a_thread_mtx, p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), a_thread_sub_mtx.GetLengths()); +#else + for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j) + { + p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] = + p_a_block[a_block_mtx.Get1dIndex(k_begin + i, + m_repeat * MPerLevel1Cluster + j) + + mMyThreadOffsetA]; + } + } +#endif } -// copy B-sub to form B -#pragma unroll + // copy B-sub to form B + //#pragma unroll for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { +#if 0 threadwise_matrix_copy( b_block_mtx, p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) + @@ -460,13 +474,26 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 b_thread_mtx, p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), b_thread_sub_mtx.GetLengths()); +#else + for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j) + { + p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] = + p_b_block[b_block_mtx.Get1dIndex(k_begin + i, + n_repeat * NPerLevel1Cluster + j) + + mMyThreadOffsetB]; + } + } +#endif } -// loop over batch -#pragma unroll + // loop over batch + //#pragma unroll for(unsigned ib = 0; ib + 1 
< BatchPerThread; ++ib) { - // do current batch of gemm +// do current batch of gemm +#if 0 threadwise_gemm(a_thread_mtx, True, p_a_thread, @@ -477,13 +504,32 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 False, p_c_thread + ib * ThreadMatrixStrideC, f_accum); +#else + for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k) + { + for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j) + { + const unsigned aindex = + a_thread_mtx.Get1dIndex(k, i); // A is transposed + const unsigned bindex = b_thread_mtx.Get1dIndex(k, j); + const unsigned cindex = + c_thread_mtx.Get1dIndex(i, j) + ib * ThreadMatrixStrideC; + + f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); + } + } + } +#endif // read next batch of a, b if(BlockMatrixStrideA != 0) { -#pragma unroll + //#pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { +#if 0 threadwise_matrix_copy( a_block_mtx, p_a_block + @@ -492,14 +538,28 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 a_thread_mtx, p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), a_thread_sub_mtx.GetLengths()); +#else + for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j) + { + p_a_thread[a_thread_mtx.Get1dIndex(i, + m_repeat * MPerThreadSubC + j)] = + p_a_block[a_block_mtx.Get1dIndex( + k_begin + i, m_repeat * MPerLevel1Cluster + j) + + (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA]; + } + } +#endif } } if(BlockMatrixStrideB != 0) { -#pragma unroll + //#pragma unroll for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { +#if 0 threadwise_matrix_copy( b_block_mtx, p_b_block + @@ -508,11 +568,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 b_thread_mtx, p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), b_thread_sub_mtx.GetLengths()); +#else + for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i) + { + 
for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j) + { + p_b_thread[b_thread_mtx.Get1dIndex(i, + n_repeat * NPerThreadSubC + j)] = + p_b_block[b_block_mtx.Get1dIndex( + k_begin + i, n_repeat * NPerLevel1Cluster + j) + + (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB]; + } + } +#endif } } } - // do last batch of gemm +// do last batch of gemm +#if 0 threadwise_gemm(a_thread_mtx, True, p_a_thread, @@ -523,6 +597,23 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 False, p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC, f_accum); +#else + for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k) + { + for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j) + { + const unsigned aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed + const unsigned bindex = b_thread_mtx.Get1dIndex(k, j); + const unsigned cindex = + c_thread_mtx.Get1dIndex(i, j) + (BatchPerThread - 1) * ThreadMatrixStrideC; + + f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); + } + } + } +#endif } }