From a9031464271a961b4b23d9cbf0e5d944dc8a78bf Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 25 Apr 2019 15:14:39 -0500 Subject: [PATCH] implicit gemm v1r3 nchw_cyxk_nkhw --- ...lution_implicit_gemm_v1_nchw_cyxk_khwn.hpp | 4 +- ...lution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp | 224 ++++++++ driver/driver.hip.cpp | 6 +- src/include/Array.hip.hpp | 2 +- src/include/ConstantTensorDescriptor.hip.hpp | 9 +- src/include/Sequence.hip.hpp | 3 +- src/include/blockwise_2d_tensor_op.hip.hpp | 1 - src/include/blockwise_4d_tensor_op.hip.hpp | 336 ------------ src/include/blockwise_batched_gemm.hip.hpp | 3 - src/include/blockwise_nd_tensor_op.hip.hpp | 252 +++++++++ src/include/conv_common.hip.hpp | 1 - src/include/data_type.hip.hpp | 39 +- ..._implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp | 24 +- ...3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp | 24 +- ..._implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp | 24 +- ..._implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp | 514 ++++++++++++++++++ src/include/threadwise_4d_tensor_op.hip.hpp | 129 ----- src/include/threadwise_nd_tensor_op.hip.hpp | 130 ++++- 18 files changed, 1202 insertions(+), 523 deletions(-) create mode 100644 driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp create mode 100644 src/include/blockwise_nd_tensor_op.hip.hpp create mode 100644 src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp index 1cbf4c81b4..193967b8bf 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp @@ -128,7 +128,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>; using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW - constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet + constexpr index_t InBlockReorderDataPerWrite_N = 1; using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used constexpr index_t WeiBlockCopyDataPerRead_K = 4; @@ -163,7 +163,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>; using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW - constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet + constexpr index_t InBlockReorderDataPerWrite_N = 2; using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used constexpr index_t WeiBlockCopyDataPerRead_K = 4; diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp new file mode 100644 index 0000000000..b0d8c24a03 --- /dev/null +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -0,0 +1,224 @@ +#pragma once +#include +#include "device.hpp" +#include "gridwise_convolution_wrapper.hip.hpp" +#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp" + +template +void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + index_t nrepeat) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr 
auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_nchw_desc = InDesc{}; + constexpr auto wei_kcyx_desc = WeiDesc{}; + constexpr auto out_nkhw_desc = OutDesc{}; + + constexpr index_t Hi = in_nchw_desc.GetLength(I2); + constexpr index_t Wi = in_nchw_desc.GetLength(I3); + + constexpr index_t N = out_nkhw_desc.GetLength(I0); + constexpr index_t Ho = out_nkhw_desc.GetLength(I2); + constexpr index_t Wo = out_nkhw_desc.GetLength(I3); + + constexpr index_t K = wei_kcyx_desc.GetLength(I0); + constexpr index_t C = wei_kcyx_desc.GetLength(I1); + constexpr index_t Y = wei_kcyx_desc.GetLength(I2); + constexpr index_t X = wei_kcyx_desc.GetLength(I3); + + // reorder weight + auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); + + Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); + + auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) { + wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x); + }; + + make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)( + std::thread::hardware_concurrency()); + + std::size_t data_sz = sizeof(T); + DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace()); + DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace()); + + in_nchw_device_buf.ToDevice(in_nchw.mData.data()); + wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); + out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + +#if 0 + // for 3x3, 28x28, v1r2, Pascal + constexpr index_t BlockSize = 128; + + constexpr index_t NPerBlock = 16; + constexpr index_t KPerBlock = 128; + constexpr index_t CPerBlock = 8; + constexpr index_t HoPerBlock = 2; + constexpr index_t WoPerBlock = 2; + + constexpr index_t NPerThread = 4; + constexpr index_t KPerThread = 8; + constexpr index_t HoPerThread = 1; + constexpr index_t WoPerThread = 2; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 2; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 2; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 2>; + using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>; + using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; + constexpr index_t InBlockReorderDataPerRead_W = 2; + constexpr index_t InBlockReorderDataPerWrite_N = 4; + + using WeiBlockCopyClusterLengths = Sequence<4, 1, 32>; + constexpr index_t WeiBlockCopyDataPerRead_K = 4; + + constexpr index_t OutThreadCopyDataPerWrite_W = 2; +#elif 0 + // for 3x3, 28x28, v1r3, Pascal, bad + constexpr index_t BlockSize = 128; + + constexpr index_t NPerBlock = 16; + constexpr index_t KPerBlock = 128; + constexpr index_t CPerBlock = 8; + constexpr index_t HoPerBlock = 2; + constexpr index_t WoPerBlock = 2; + + constexpr index_t NPerThread = 4; + constexpr index_t KPerThread = 8; + constexpr index_t HoPerThread = 1; + constexpr index_t WoPerThread = 2; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 2; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 2; + constexpr 
index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>; + using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>; + using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; + constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW + constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet + + using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used + constexpr index_t WeiBlockCopyDataPerRead_K = 4; + + constexpr index_t OutThreadCopyDataPerWrite_W = 2; +#elif 1 + // for 3x3, 34x34, v1r3, Pascal + constexpr index_t BlockSize = 128; + + constexpr index_t NPerBlock = 2; + constexpr index_t KPerBlock = 128; + constexpr index_t CPerBlock = 8; + constexpr index_t HoPerBlock = 2; + constexpr index_t WoPerBlock = 16; + + constexpr index_t NPerThread = 2; + constexpr index_t KPerThread = 8; + constexpr index_t HoPerThread = 1; + constexpr index_t WoPerThread = 4; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 2; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 2; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>; + using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>; + using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; + constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW + constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet + + using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used + constexpr index_t WeiBlockCopyDataPerRead_K = 4; + + constexpr index_t OutThreadCopyDataPerWrite_W = 2; +#endif + + constexpr index_t GridSize = + ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) * + ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + for(index_t i = 0; i < nrepeat; ++i) + { + constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw< + GridSize, + BlockSize, + T, + decltype(in_nchw_desc), + decltype(wei_cyxk_desc), + decltype(out_nkhw_desc), + NPerBlock, + KPerBlock, + CPerBlock, + HoPerBlock, + WoPerBlock, + NPerThread, + KPerThread, + HoPerThread, + WoPerThread, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + InBlockReorderSrcSubLengths_NCHW, + InBlockReorderSrcClusterLengths_NCHW, + InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, + InBlockReorderDataPerRead_W, + InBlockReorderDataPerWrite_N, + WeiBlockCopyClusterLengths, + WeiBlockCopyDataPerRead_K, + OutThreadCopyDataPerWrite_W>{}; + + float time = launch_kernel(run_gridwise_convolution, + dim3(GridSize), + dim3(BlockSize), + 0, + static_cast(in_nchw_device_buf.GetDeviceBuffer()), + static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), + static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + + printf("Elapsed time : %f ms, %f TFlop/s\n", + time, + (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) / + 
(std::size_t(1000) * 1000 * 1000) / time); + usleep(std::min(time * 1000, float(10000))); + } + + out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); +} diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index a272a0e4a2..3a9a82e686 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -12,7 +12,7 @@ //#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp" -//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" +#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" struct GeneratorTensor_1 @@ -605,8 +605,10 @@ int main(int argc, char* argv[]) device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 0 device_convolution_implicit_gemm_v1_chwn_cyxk_khwn -#elif 1 +#elif 0 device_convolution_implicit_gemm_v1_nchw_cyxk_khwn +#elif 1 + device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw #elif 0 device_convolution_implicit_gemm_v2_chwn_cyxk_khwn #endif diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 2801a426f5..2370e4ed32 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -24,7 +24,7 @@ struct Array { Array new_array; - static_for<0, NSize, 1>{}([=](auto I) { + static_for<0, NSize, 1>{}([&](auto I) { constexpr index_t i = I.Get(); new_array[i] = mData[i]; }); diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 5c3f0d8132..248a4a8974 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -137,11 +137,16 @@ struct ConstantTensorDescriptor } template - __host__ __device__ static constexpr index_t Get1dIndex(Sequence multi_id) + __host__ __device__ static constexpr index_t Get1dIndex(Sequence /*multi_id*/) { static_assert(sizeof...(Is) == nDim, "wrong! 
Dimension not consistent"); - return Get1dIndex(Is...); + constexpr auto multi_id = Sequence{}; + + constexpr auto seq_tmp = + transform_sequences(mod_conv::multiplies{}, multi_id, GetStrides()); + + return accumulate_on_sequence(seq_tmp, mod_conv::plus{}, Number<0>{}); } __host__ __device__ static Array GetMultiIndex(index_t id) diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index 632626392e..ba3116ce0b 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -246,7 +246,8 @@ struct accumulate_on_sequence_f }; template -__host__ __device__ constexpr index_t accumulate_on_sequence(Seq, Reduce, Number) +__host__ __device__ constexpr index_t + accumulate_on_sequence(Seq, Reduce, Number /*initial_value*/) { constexpr index_t a = static_const_reduce_n{}(accumulate_on_sequence_f{}, Reduce{}); diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index da7f9b9037..8d457056a2 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -471,7 +471,6 @@ struct Blockwise2dTensorCopy3 DstDesc{}.GetStride(I0) % DataPerRead == 0, "src and dst stride should be multiple of DataPerRead to keep alignment"); - constexpr index_t L0 = CopyLengths{}.Get(I0); constexpr index_t L1 = CopyLengths{}.Get(I1); constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead; diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index e513eb1e81..2a52eeeac8 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -761,339 +761,3 @@ struct Blockwise4dTensorCopyReorder1 SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy); } }; - -template -struct Blockwise4dTensorCopyReorder3 -{ - static constexpr index_t nDim = SrcLengths::GetSize(); - - index_t mSrcMyThreadOffset; - index_t mDstMyThreadOffset; - - __device__ Blockwise4dTensorCopyReorder3() - { - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - constexpr auto src_lengths = SrcLengths{}; - - constexpr auto map_dst2src = MapDst2Src{}; - - constexpr auto src_sub_lengths = SrcSubLengths{}; - constexpr auto dst_sub_lengths = src_sub_lengths.ReorderGivenNew2Old(map_dst2src); - - constexpr auto map_thread_cluster_2_src_cluster = MapThreadCluster2SrcCluster{}; - - constexpr auto src_cluster_lengths = SrcClusterLengths{}; - constexpr auto thread_cluster_lengths = - src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster); - - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths); - - // sanity check: data type - static_assert(is_same::value, "wrong! only support float for now!\n"); - - // sanity check: nDim - static_assert(SrcDesc::GetDimension() == nDim && DstDesc::GetDimension() == nDim && - SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim && - SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim && - MapThreadCluster2SrcCluster::GetSize() == nDim, - "wrong! nDim is not consistent\n"); - - // sanity check: BlockSize - constexpr index_t num_active_thread = thread_cluster_desc.GetElementSize(); - - static_assert(BlockSize >= num_active_thread, - "wrong! 
BlockSize is not big enough for ThreadPerDims!"); - - // sanity check: work division - static_for<0, nDim, 1>{}([](auto IDim) { - constexpr auto I = decltype(IDim){}; - constexpr index_t src_len = src_lengths.Get(I); - constexpr index_t src_sub_len = src_sub_lengths.Get(I); - constexpr index_t src_cluster_len = src_cluster_lengths.Get(I); - static_assert(src_len % (src_sub_len * src_cluster_len) == 0, - "wrong! cannot evenly divide Src tensor lengths"); - }); - - // sanity check: src read - static_assert(SrcDataPerRead == 1 || SrcDataPerRead == 2 || SrcDataPerRead == 4, - "wrong! only support SrcDataPerRead == 1, 2 or 4!\n"); - - static_assert(SrcDataPerRead == 1 || src_desc.GetStride(Number{}) == 1, - "wrong! only support src.stride(nDim-1) == 1 if SrcDataPerRead > 1!\n"); - - static_assert(src_sub_lengths.Get(Number{}) % SrcDataPerRead == 0, - "wrong! src_sub_lengths[nDim-1] % SrcDataPerRead != 0\n"); - - static_assert(src_desc.GetStride(Number{}) % SrcDataPerRead == 0, - "wrong! should satisfy src_desc.stride(nDim-2) % SrcDataPerRead == 0, to " - "keep alignment"); - - // sanity check: dst write - static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4, - "wrong! only support DstDataPerWrite == 1, 2 or 4!\n"); - - static_assert(DstDataPerWrite == 1 || dst_desc.GetStride(Number{}) == 1, - "wrong! only support dst.stride(nDim-1) == 1 if DstDataPerWrite > 1!\n"); - - static_assert(dst_sub_lengths.Get(Number{}) % DstDataPerWrite == 0, - "wrong! dst_sub_lengths[nDim-1] % DstDataPerWrite != 0\n"); - - static_assert(dst_desc.GetStride(Number{}) % DstDataPerWrite == 0, - "wrong! should satisfy dst_desc.stride(nDim-2) % DstDataPerWrite == 0, to " - "keep alignment"); - - // start dividing work - if(BlockSize > num_active_thread) - { - if(get_thread_local_1d_id() >= num_active_thread) - { - return; - } - } - - const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id()); - - // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate - // regsiters, or only one copy??? - auto src_data_multi_id = - reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster); - - static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr auto I = decltype(IDim){}; - constexpr index_t i = I.Get(); - // compiler: will it really compute index here, or be associated with Get1dIndex and - // optimized away??? - src_data_multi_id[i] *= src_sub_lengths.Get(I); - }); - - // compiler: will it really compute index here, or be associated with Get1dIndex and - // optimized away??? 
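For reference: the offset computation in this constructor (deleted from the 4d-specific class here, but carried over essentially verbatim into the new BlockwiseNdTensorCopyReorder_v3 in blockwise_nd_tensor_op.hip.hpp below) reorders a thread's cluster index into source order, scales it by the per-thread sub-lengths, reorders it again into destination order, and folds each multi-index against the descriptor strides. A minimal host-side sketch of that arithmetic, assuming the gather/scatter semantics the helper names suggest; the std::array helpers are illustrative stand-ins for the repo's Array/Sequence utilities, and the numbers in main() are toy values:

    #include <array>
    #include <cstddef>
    #include <cstdio>

    // new2old gather: out[i] = in[map[i]]
    template <std::size_t N>
    std::array<int, N> reorder_given_new2old(const std::array<int, N>& in,
                                             const std::array<int, N>& map)
    {
        std::array<int, N> out{};
        for(std::size_t i = 0; i < N; ++i)
            out[i] = in[map[i]];
        return out;
    }

    // old2new scatter: out[map[i]] = in[i]
    template <std::size_t N>
    std::array<int, N> reorder_given_old2new(const std::array<int, N>& in,
                                             const std::array<int, N>& map)
    {
        std::array<int, N> out{};
        for(std::size_t i = 0; i < N; ++i)
            out[map[i]] = in[i];
        return out;
    }

    // Get1dIndex for a strided descriptor: dot(multi_id, strides)
    template <std::size_t N>
    int get_1d_index(const std::array<int, N>& multi_id, const std::array<int, N>& strides)
    {
        int offset = 0;
        for(std::size_t i = 0; i < N; ++i)
            offset += multi_id[i] * strides[i];
        return offset;
    }

    int main()
    {
        // a thread's cluster index in CHNW order, mapped to NCHW:
        // C goes to dim 1, H to dim 2, N to dim 0, W to dim 3
        std::array<int, 4> thread_id = {1, 0, 2, 3};
        std::array<int, 4> map_chnw2nchw = {1, 2, 0, 3};
        std::array<int, 4> sub_lengths = {4, 1, 1, 2}; // per-thread NCHW sub-tile (toy)
        std::array<int, 4> strides = {128, 16, 4, 1};  // packed NCHW block strides (toy)

        auto src_id = reorder_given_old2new(thread_id, map_chnw2nchw);
        for(std::size_t i = 0; i < 4; ++i)
            src_id[i] *= sub_lengths[i]; // scale cluster index to data index

        printf("src offset %d\n", get_1d_index(src_id, strides));
    }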
- const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src); - - mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id); - mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id); - -#if 0 - if(get_block_1d_id() == 0) - { - printf("tid %5u, " - "thread_multi_id %5u %5u %5u %5u, " - "src_data_multi_id %5u %5u %5u %5u, " - "dst_data_multi_id %5u %5u %5u %5u, " - "mSrcMyThreadOffset %u, mDstMyThreadOffset %u\n", - get_thread_local_1d_id(), - thread_multi_id[0], - thread_multi_id[1], - thread_multi_id[2], - thread_multi_id[3], - src_data_multi_id[0], - src_data_multi_id[1], - src_data_multi_id[2], - src_data_multi_id[3], - dst_data_multi_id[0], - dst_data_multi_id[1], - dst_data_multi_id[2], - dst_data_multi_id[3], - mSrcMyThreadOffset, - mDstMyThreadOffset); - } -#endif - } - - __device__ static constexpr index_t GetRegisterClipboardSize() - { - constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; - - constexpr auto src_data_per_cluster_per_dims = transform_sequences( - mod_conv::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{}); - - constexpr auto cluster_per_dims = - transform_sequences(mod_conv::integer_divide_ceiler{}, - SrcLengths{}, - src_data_per_cluster_per_dims); - - constexpr auto thread_tensor_lengths = transform_sequences( - mod_conv::multiplies{}, thread_sub_tensor_lengths, cluster_per_dims); - - constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); - - return thread_tensor_desc.GetElementSpace(); - } - - __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, - Float* __restrict__ p_clipboard) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; - - constexpr auto src_data_per_cluster_per_dims = transform_sequences( - mod_conv::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{}); - - constexpr auto cluster_per_dims = - transform_sequences(mod_conv::integer_divide_ceiler{}, - SrcLengths{}, - src_data_per_cluster_per_dims); - - constexpr auto thread_tensor_lengths = transform_sequences( - mod_conv::multiplies{}, thread_sub_tensor_lengths, cluster_per_dims); - - constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); - - constexpr auto thread_sub_tensor_desc = - make_ConstantTensorDescriptor(SrcClusterLengths{}, thread_tensor_desc.GetStrides()); - -#if 1 - for(index_t icluster_d0 = 0; icluster_d0 < cluster_per_dims.Get(I0); ++icluster_d0) - { - for(index_t icluster_d1 = 0; icluster_d1 < cluster_per_dims.Get(I1); ++icluster_d1) - { - for(index_t icluster_d2 = 0; icluster_d2 < cluster_per_dims.Get(I2); ++icluster_d2) - { - for(index_t icluster_d3 = 0; icluster_d3 < cluster_per_dims.Get(I3); - ++icluster_d3) - { - const index_t src_offset = SrcDesc{}.Get1dIndex( - icluster_d0 * src_data_per_cluster_per_dims.Get(I0), - icluster_d1 * src_data_per_cluster_per_dims.Get(I1), - icluster_d2 * src_data_per_cluster_per_dims.Get(I2), - icluster_d3 * src_data_per_cluster_per_dims.Get(I3)); - - const index_t clipboard_offset = thread_tensor_desc.Get1dIndex( - icluster_d0 * thread_sub_tensor_lengths.Get(I0), - icluster_d1 * thread_sub_tensor_lengths.Get(I1), - icluster_d2 * thread_sub_tensor_lengths.Get(I2), - icluster_d3 * thread_sub_tensor_lengths.Get(I3)); - - threadwise_nd_tensor_copy(SrcDesc{}, - p_src + src_offset + mSrcMyThreadOffset, - thread_tensor_desc, - p_clipboard + 
clipboard_offset, - thread_sub_tensor_lengths, - Number{}); - } - } - } - } -#else - static_ford{}([=](auto cluster_ids) { - - }); -#endif - -#if 0 - if(get_block_1d_id() == 0) - { - printf("tid %5u, " - "data: %f %f %f %f %f %f %f %f\n", - get_thread_local_1d_id(), - p_clipboard[0], - p_clipboard[1], - p_clipboard[2], - p_clipboard[3], - p_clipboard[4], - p_clipboard[5], - p_clipboard[6], - p_clipboard[7]); - } -#endif - } - - __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, - Float* __restrict__ p_dst) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; - - constexpr auto src_data_per_cluster_per_dims = transform_sequences( - mod_conv::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{}); - - constexpr auto cluster_per_dims = - transform_sequences(mod_conv::integer_divide_ceiler{}, - SrcLengths{}, - src_data_per_cluster_per_dims); - - constexpr auto thread_tensor_lengths = transform_sequences( - mod_conv::multiplies{}, thread_sub_tensor_lengths, cluster_per_dims); - - constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); - - constexpr auto thread_sub_tensor_desc = - make_ConstantTensorDescriptor(SrcClusterLengths{}, thread_tensor_desc.GetStrides()); - - for(index_t icluster_d0 = 0; icluster_d0 < cluster_per_dims.Get(I0); ++icluster_d0) - { - for(index_t icluster_d1 = 0; icluster_d1 < cluster_per_dims.Get(I1); ++icluster_d1) - { - for(index_t icluster_d2 = 0; icluster_d2 < cluster_per_dims.Get(I2); ++icluster_d2) - { - for(index_t icluster_d3 = 0; icluster_d3 < cluster_per_dims.Get(I3); - ++icluster_d3) - { - const index_t clipboard_offset = thread_tensor_desc.Get1dIndex( - icluster_d0 * thread_sub_tensor_lengths.Get(I0), - icluster_d1 * thread_sub_tensor_lengths.Get(I1), - icluster_d2 * thread_sub_tensor_lengths.Get(I2), - icluster_d3 * thread_sub_tensor_lengths.Get(I3)); - - const auto dst_multi_id = reorder_array_given_new2old( - Array{ - icluster_d0 * src_data_per_cluster_per_dims.Get(I0), - icluster_d1 * src_data_per_cluster_per_dims.Get(I1), - icluster_d2 * src_data_per_cluster_per_dims.Get(I2), - icluster_d3 * src_data_per_cluster_per_dims.Get(I3)}, - MapDst2Src{}); - - const index_t dst_offset = DstDesc{}.Get1dIndex(dst_multi_id); - -#if 0 - if(get_block_1d_id() == 0) - { - printf("tid %5u, " - "clipboard_offsetm %5u, dst_offset %5u\n", - get_thread_local_1d_id(), - clipboard_offset, - dst_offset); - } -#endif - - threadwise_4d_tensor_copy_reorder_given_dst2src_v2( - thread_tensor_desc, - p_clipboard + clipboard_offset, - DstDesc{}, - p_dst + dst_offset + mDstMyThreadOffset, - thread_sub_tensor_lengths, - MapDst2Src{}); - } - } - } - } - } - - __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const - { - Float p_clipboard[GetRegisterClipboardSize()]; - - RunLoadRegisterClipboard(p_src, p_clipboard); - RunStoreRegisterClipboard(p_clipboard, p_dst); - } -}; diff --git a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp index cdde25efa0..3926e5905d 100644 --- a/src/include/blockwise_batched_gemm.hip.hpp +++ b/src/include/blockwise_batched_gemm.hip.hpp @@ -53,7 +53,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 constexpr index_t M = a_block_mtx.NCol(); // A is transposed constexpr index_t N = b_block_mtx.NCol(); - constexpr index_t K = a_block_mtx.NRow(); 
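For reference: GetBeginOfThreadMatrixC (in the hunk below) maps a linear thread id onto its C sub-matrix origin through the two-level M/N cluster hierarchy (GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster). A hypothetical sketch of one way such a two-level decomposition can work; the numbering order and formulas here are assumptions for illustration, not the repo's actual mapping:

    #include <cstdio>

    struct MatrixIndex { int row, col; };

    // Decompose a linear thread id into a C-tile origin, assuming level-0
    // clusters are numbered fastest and the N direction varies fastest.
    MatrixIndex thread_c_tile_origin(int tid,
                                     int MPerThreadSubC, int NPerThreadSubC,
                                     int MLevel0Cluster, int NLevel0Cluster,
                                     int NLevel1Cluster)
    {
        const int threads_per_level0 = MLevel0Cluster * NLevel0Cluster;

        const int level1_id = tid / threads_per_level0;
        const int level0_id = tid % threads_per_level0;

        const int m1 = level1_id / NLevel1Cluster;
        const int n1 = level1_id % NLevel1Cluster;
        const int m0 = level0_id / NLevel0Cluster;
        const int n0 = level0_id % NLevel0Cluster;

        // each level-1 step spans a full level-0 cluster of per-thread sub-tiles
        return {(m1 * MLevel0Cluster + m0) * MPerThreadSubC,
                (n1 * NLevel0Cluster + n0) * NPerThreadSubC};
    }

    int main()
    {
        for(int tid = 0; tid < 8; ++tid)
        {
            MatrixIndex c = thread_c_tile_origin(tid, 4, 4, 4, 2, 2);
            printf("tid %d -> row %d col %d\n", tid, c.row, c.col);
        }
    }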
constexpr index_t MPerThread = c_thread_mtx.NRow(); constexpr index_t NPerThread = c_thread_mtx.NCol(); @@ -114,8 +113,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 __device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const { - constexpr index_t BatchThreadWork = BatchSize / BatchPerThread; - constexpr index_t ThreadPerLevel1Cluster = MLevel0Cluster * NLevel0Cluster * MLevel1Cluster * NLevel1Cluster; diff --git a/src/include/blockwise_nd_tensor_op.hip.hpp b/src/include/blockwise_nd_tensor_op.hip.hpp new file mode 100644 index 0000000000..c2d7722454 --- /dev/null +++ b/src/include/blockwise_nd_tensor_op.hip.hpp @@ -0,0 +1,252 @@ +#pragma once +#include "threadwise_nd_tensor_op.hip.hpp" + +template +struct BlockwiseNdTensorCopyReorder_v3 +{ + static constexpr index_t nDim = SrcLengths::GetSize(); + + index_t mSrcMyThreadOffset; + index_t mDstMyThreadOffset; + + __device__ BlockwiseNdTensorCopyReorder_v3() + { + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + constexpr auto src_lengths = SrcLengths{}; + + constexpr auto map_dst2src = MapDst2Src{}; + + constexpr auto src_sub_lengths = SrcSubLengths{}; + constexpr auto dst_sub_lengths = src_sub_lengths.ReorderGivenNew2Old(map_dst2src); + + constexpr auto map_thread_cluster_2_src_cluster = MapThreadCluster2SrcCluster{}; + + constexpr auto src_cluster_lengths = SrcClusterLengths{}; + constexpr auto thread_cluster_lengths = + src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster); + + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths); + + // sanity check: data type + static_assert(is_same::value, "wrong! only support float for now!\n"); + + // sanity check: nDim + static_assert(SrcDesc::GetDimension() == nDim && DstDesc::GetDimension() == nDim && + SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim && + SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim && + MapThreadCluster2SrcCluster::GetSize() == nDim, + "wrong! nDim is not consistent\n"); + + // sanity check: BlockSize + constexpr index_t num_active_thread = thread_cluster_desc.GetElementSize(); + + static_assert(BlockSize >= num_active_thread, + "wrong! BlockSize is not big enough for ThreadPerDims!"); + + // sanity check: work division + static_for<0, nDim, 1>{}([](auto IDim) { + constexpr auto I = decltype(IDim){}; + constexpr index_t src_len = src_lengths.Get(I); + constexpr index_t src_sub_len = src_sub_lengths.Get(I); + constexpr index_t src_cluster_len = src_cluster_lengths.Get(I); + static_assert(src_len % (src_sub_len * src_cluster_len) == 0, + "wrong! cannot evenly divide Src tensor lengths"); + }); + + // sanity check: src read + static_assert(SrcDataPerRead == 1 || SrcDataPerRead == 2 || SrcDataPerRead == 4, + "wrong! only support SrcDataPerRead == 1, 2 or 4!\n"); + + static_assert(SrcDataPerRead == 1 || src_desc.GetStride(Number{}) == 1, + "wrong! only support src.stride(nDim-1) == 1 if SrcDataPerRead > 1!\n"); + + static_assert(src_sub_lengths.Get(Number{}) % SrcDataPerRead == 0, + "wrong! src_sub_lengths[nDim-1] % SrcDataPerRead != 0\n"); + + static_assert(src_desc.GetStride(Number{}) % SrcDataPerRead == 0, + "wrong! should satisfy src_desc.stride(nDim-2) % SrcDataPerRead == 0, to " + "keep alignment"); + + // sanity check: dst write + static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4, + "wrong! 
only support DstDataPerWrite == 1, 2 or 4!\n");
+
+        static_assert(DstDataPerWrite == 1 || dst_desc.GetStride(Number<nDim - 1>{}) == 1,
+                      "wrong! only support dst.stride(nDim-1) == 1 if DstDataPerWrite > 1!\n");
+
+        static_assert(dst_sub_lengths.Get(Number<nDim - 1>{}) % DstDataPerWrite == 0,
+                      "wrong! dst_sub_lengths[nDim-1] % DstDataPerWrite != 0\n");
+
+        static_assert(dst_desc.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
+                      "wrong! should satisfy dst_desc.stride(nDim-2) % DstDataPerWrite == 0, to "
+                      "keep alignment");
+
+        // start dividing work
+        if(BlockSize > num_active_thread)
+        {
+            if(get_thread_local_1d_id() >= num_active_thread)
+            {
+                return;
+            }
+        }
+
+        const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());
+
+        // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
+        // registers, or only one copy???
+        auto src_data_multi_id =
+            reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);
+
+        static_for<0, nDim, 1>{}([&](auto IDim) {
+            constexpr auto I = decltype(IDim){};
+            constexpr index_t i = I.Get();
+            // compiler: will it really compute index here, or be associated with Get1dIndex and
+            // optimized away???
+            src_data_multi_id[i] *= src_sub_lengths.Get(I);
+        });
+
+        // compiler: will it really compute index here, or be associated with Get1dIndex and
+        // optimized away???
+        const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);
+
+        mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id);
+        mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id);
+    }
+
+    __device__ static constexpr index_t GetRegisterClipboardSize()
+    {
+        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
+
+        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
+            mod_conv::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{});
+
+        constexpr auto repeat_lengths =
+            transform_sequences(mod_conv::integer_divide_ceiler{},
+                                SrcLengths{},
+                                src_data_per_cluster_per_dims);
+
+        constexpr auto thread_tensor_lengths = transform_sequences(
+            mod_conv::multiplies{}, thread_sub_tensor_lengths, repeat_lengths);
+
+        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
+
+        return thread_tensor_desc.GetElementSpace();
+    }
+
+    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
+                                             Float* __restrict__ p_clipboard) const
+    {
+        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
+
+        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
+            mod_conv::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{});
+
+        constexpr auto repeat_lengths =
+            transform_sequences(mod_conv::integer_divide_ceiler{},
+                                SrcLengths{},
+                                src_data_per_cluster_per_dims);
+
+        constexpr auto thread_tensor_lengths = transform_sequences(
+            mod_conv::multiplies{}, thread_sub_tensor_lengths, repeat_lengths);
+
+        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
+
+        static_ford{}([&](auto repeat_multi_id_) {
+            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
+
+            constexpr auto src_data_multi_id = transform_sequences(
+                mod_conv::multiplies{}, repeat_multi_id, src_data_per_cluster_per_dims);
+
+            constexpr auto clipboard_data_multi_id = transform_sequences(
+                mod_conv::multiplies{}, repeat_multi_id, thread_sub_tensor_lengths);
+
+            constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id);
+            constexpr index_t clipboard_offset =
thread_tensor_desc.Get1dIndex(clipboard_data_multi_id); + + threadwise_nd_tensor_copy(SrcDesc{}, + p_src + src_offset + mSrcMyThreadOffset, + thread_tensor_desc, + p_clipboard + clipboard_offset, + thread_sub_tensor_lengths, + Number{}); + }); + } + + __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, + Float* __restrict__ p_dst) const + { + constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; + + constexpr auto src_data_per_cluster_per_dims = transform_sequences( + mod_conv::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{}); + + constexpr auto repeat_lengths = + transform_sequences(mod_conv::integer_divide_ceiler{}, + SrcLengths{}, + src_data_per_cluster_per_dims); + + constexpr auto thread_tensor_lengths = transform_sequences( + mod_conv::multiplies{}, thread_sub_tensor_lengths, repeat_lengths); + + constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); + + static_ford{}([&](auto repeat_multi_id_) { + constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; + + constexpr auto clipboard_data_multi_id = transform_sequences( + mod_conv::multiplies{}, repeat_multi_id, thread_sub_tensor_lengths); + + constexpr auto src_data_multi_id = transform_sequences( + mod_conv::multiplies{}, repeat_multi_id, src_data_per_cluster_per_dims); + + // reorder src_data_multi_id to get dst_data_multi_id + constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{}); + + constexpr index_t clipboard_offset = + thread_tensor_desc.Get1dIndex(clipboard_data_multi_id); + + constexpr index_t dst_offset = DstDesc{}.Get1dIndex(dst_data_multi_id); + + // write in the order of dst +#if 1 + threadwise_nd_tensor_copy_reorder_given_dst2src_v2(thread_tensor_desc, + p_clipboard + clipboard_offset, + DstDesc{}, + p_dst + dst_offset + + mDstMyThreadOffset, + thread_sub_tensor_lengths, + MapDst2Src{}); +#else + threadwise_nd_tensor_copy_reorder_given_dst2src_v3(thread_tensor_desc, + p_clipboard + clipboard_offset, + DstDesc{}, + p_dst + dst_offset + + mDstMyThreadOffset, + thread_sub_tensor_lengths, + MapDst2Src{}, + Number{}); +#endif + }); + } + + __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const + { + Float p_clipboard[GetRegisterClipboardSize()]; + + RunLoadRegisterClipboard(p_src, p_clipboard); + RunStoreRegisterClipboard(p_clipboard, p_dst); + } +}; diff --git a/src/include/conv_common.hip.hpp b/src/include/conv_common.hip.hpp index b58a3e86e5..b1d248d847 100644 --- a/src/include/conv_common.hip.hpp +++ b/src/include/conv_common.hip.hpp @@ -73,7 +73,6 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4 template __host__ __device__ constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc) { - constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; constexpr auto out_desc = OutDesc{}; diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index 31791542ae..b341609a39 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -1,5 +1,6 @@ #pragma once #include "config.h" +#include "constant_integral.hip.hpp" template struct vector_type @@ -10,6 +11,13 @@ template <> struct vector_type { typedef float MemoryType; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } }; template <> @@ -20,21 +28,29 @@ struct vector_type // instruction typedef float MemoryType 
__attribute__((ext_vector_type(2))); #elif DEVICE_BACKEND_CUDA - // For some reason, CUDA need this definition to, otherwise + // For some reason, CUDA need this definition, otherwise // compiler won't generate optimal load and store instruction, and // kernel would produce wrong result, indicating the compiler fail to generate correct // instruction, using MemoryType = float2; #endif + union Data + { + MemoryType vector; + float scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + __host__ __device__ static MemoryType Pack(float s0, float s1) { - union - { - MemoryType vector; - float scalar[2]; - } data; - + Data data; data.scalar[0] = s0; data.scalar[1] = s1; return data.vector; @@ -49,12 +65,19 @@ struct vector_type // instruction typedef float MemoryType __attribute__((ext_vector_type(4))); #elif DEVICE_BACKEND_CUDA - // For some reason, CUDA need this definition to, otherwise + // For some reason, CUDA need this definition, otherwise // compiler won't generate optimal load and store instruction, and // kernel would produce wrong result, indicating the compiler fail to generate correct // instruction, using MemoryType = float4; #endif + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } }; #if 0 diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp index a1aea70cec..bc71b6daba 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp @@ -4,7 +4,7 @@ #include "ConstantMatrixDescriptor.hip.hpp" #include "blockwise_2d_tensor_op.hip.hpp" #include "blockwise_3d_tensor_op.hip.hpp" -#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_nd_tensor_op.hip.hpp" #include "threadwise_nd_tensor_op.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp" #include "blockwise_batched_gemm.hip.hpp" @@ -125,17 +125,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{}; const auto blockwise_in_copy_reorder = - Blockwise4dTensorCopyReorder3, - InBlockReorderSrcSubLengths_NCHW, - InBlockReorderSrcClusterLengths_NCHW, - decltype(map_chwn2nchw), - InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, - InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; + BlockwiseNdTensorCopyReorder_v3, + InBlockReorderSrcSubLengths_NCHW, + InBlockReorderSrcClusterLengths_NCHW, + decltype(map_chwn2nchw), + InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, + InBlockReorderDataPerRead_W, + InBlockReorderDataPerWrite_N>{}; // blockwise wei copy // format is [CPerBlock, X * KPerBlock] diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp index d713b8cf2e..b83a61147a 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp @@ -3,7 +3,7 @@ #include "ConstantTensorDescriptor.hip.hpp" #include "ConstantMatrixDescriptor.hip.hpp" #include "blockwise_2d_tensor_op.hip.hpp" -#include 
"blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_nd_tensor_op.hip.hpp" #include "threadwise_nd_tensor_op.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp" #include "blockwise_batched_gemm.hip.hpp" @@ -133,17 +133,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{}; const auto blockwise_in_copy_reorder = - Blockwise4dTensorCopyReorder3, - InBlockReorderSrcSubLengths_NCHW, - InBlockReorderSrcClusterLengths_NCHW, - decltype(map_chwn2nchw), - InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, - InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; + BlockwiseNdTensorCopyReorder_v3, + InBlockReorderSrcSubLengths_NCHW, + InBlockReorderSrcClusterLengths_NCHW, + decltype(map_chwn2nchw), + InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, + InBlockReorderDataPerRead_W, + InBlockReorderDataPerWrite_N>{}; // blockwise wei copy // format is [CPerBlock, KPerBlock] diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp index b6008df44e..8337bebf69 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp @@ -3,7 +3,7 @@ #include "ConstantTensorDescriptor.hip.hpp" #include "ConstantMatrixDescriptor.hip.hpp" #include "blockwise_2d_tensor_op.hip.hpp" -#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_nd_tensor_op.hip.hpp" #include "threadwise_nd_tensor_op.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp" #include "blockwise_batched_gemm.hip.hpp" @@ -130,17 +130,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{}; const auto blockwise_in_copy_reorder = - Blockwise4dTensorCopyReorder3, - InBlockReorderSrcSubLengths_NCHW, - InBlockReorderSrcClusterLengths_NCHW, - decltype(map_chwn2nchw), - InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, - InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; + BlockwiseNdTensorCopyReorder_v3, + InBlockReorderSrcSubLengths_NCHW, + InBlockReorderSrcClusterLengths_NCHW, + decltype(map_chwn2nchw), + InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, + InBlockReorderDataPerRead_W, + InBlockReorderDataPerWrite_N>{}; // blockwise wei copy // format is [CPerBlock, KPerBlock] diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp new file mode 100644 index 0000000000..f5c6df8644 --- /dev/null +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp @@ -0,0 +1,514 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMatrixDescriptor.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" +#include "blockwise_nd_tensor_op.hip.hpp" +#include "threadwise_nd_tensor_op.hip.hpp" +#include "threadwise_4d_tensor_op.hip.hpp" +#include "blockwise_batched_gemm.hip.hpp" + +template +struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw +{ + __device__ void Run(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const + { + // be careful of this assertion + static_assert(NPerBlock % NPerThread == 0 && (GemmNPerThreadSubC <= NPerBlock && + NPerBlock % GemmNPerThreadSubC == 0) || + (GemmNPerThreadSubC >= NPerBlock 
&& NPerThread == NPerBlock &&
+                           GemmNPerThreadSubC % NPerThread == 0),
+                      "wrong!");
+
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
+        constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{};
+        constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
+        constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};
+
+        constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1);
+
+        constexpr index_t N = out_n_k_h_w_global_desc.GetLength(I0);
+        constexpr index_t K = out_n_k_h_w_global_desc.GetLength(I1);
+        constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2);
+        constexpr index_t Wo = out_n_k_h_w_global_desc.GetLength(I3);
+
+        constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1);
+        constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2);
+
+        // divide block work: [K, Ho, Wo, N]
+        static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 &&
+                          Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
+                      "wrong! cannot evenly divide work for workgroup");
+
+        // constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
+        constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
+        constexpr index_t WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
+        constexpr index_t NBlockWork = (N + NPerBlock - 1) / NPerBlock;
+
+        const index_t k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
+        index_t itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
+        const index_t h_block_work_id = itmp / (WBlockWork * NBlockWork);
+        itmp -= h_block_work_id * (WBlockWork * NBlockWork);
+        const index_t w_block_work_id = itmp / NBlockWork;
+        const index_t n_block_work_id = itmp - w_block_work_id * NBlockWork;
+
+        const index_t k_block_data_begin = k_block_work_id * KPerBlock;
+        const index_t ho_block_data_begin = h_block_work_id * HoPerBlock;
+        const index_t wo_block_data_begin = w_block_work_id * WoPerBlock;
+        const index_t n_block_data_begin = n_block_work_id * NPerBlock;
+
+        const index_t hi_block_data_begin = ho_block_data_begin;
+        const index_t wi_block_data_begin = wo_block_data_begin;
+
+        // global tensor view
+        constexpr auto wei_c_k_global_desc =
+            make_ConstantTensorDescriptor(Sequence{}, Sequence{});
+
+        // LDS tensor view
+        // be careful of alignment
+        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+                                                    WeiBlockCopyDataPerRead_K,
+                                                    GemmDataPerReadA,
+                                                    GemmDataPerReadB);
+
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
+            Sequence{},
+            Number{});
+
+        // this check is ad-hoc
+        // TODO: need to properly implement tensor descriptor with alignment
+        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
+                      "GemmDataPerReadB alignment requirement is not met");
+
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
+            Sequence{},
+            Number{});
+
+        // tensor view of threadwise output in register
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
+            Sequence{});
+
+        // blockwise copy
+        // input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N]
+        constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};
+
+        const auto blockwise_in_copy_reorder =
+            BlockwiseNdTensorCopyReorder_v3,
+                                            InBlockReorderSrcSubLengths_NCHW,
+                                            InBlockReorderSrcClusterLengths_NCHW,
+                                            decltype(map_chwn2nchw),
+                                            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
+                                            InBlockReorderDataPerRead_W,
+                                            InBlockReorderDataPerWrite_N>{};
+
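For reference: the reorder object above copies a [CPerBlock, HoPerBlock, WoPerBlock, NPerBlock] window of the NCHW input into LDS in CHWN order. A minimal single-threaded reference for the intended result, ignoring the thread-cluster decomposition, register-clipboard staging, and vectorization (hypothetical standalone code, not the kernel's copy path):

    #include <cstddef>
    #include <vector>

    // Copy one [CB, HB, WB, NB] block out of a global NCHW tensor, reordering
    // NCHW -> CHWN on the fly (map_chwn2nchw = Sequence<1, 2, 3, 0> above).
    void block_reorder_nchw_to_chwn(const std::vector<float>& in_nchw,
                                    int C, int Hi, int Wi,          // global lengths
                                    int n0, int c0, int h0, int w0, // block origin
                                    int NB, int CB, int HB, int WB, // block lengths
                                    std::vector<float>& block_chwn)
    {
        block_chwn.resize((std::size_t)CB * HB * WB * NB);
        for(int c = 0; c < CB; ++c)
            for(int h = 0; h < HB; ++h)
                for(int w = 0; w < WB; ++w)
                    for(int n = 0; n < NB; ++n)
                    {
                        // dst is CHWN-packed, so N is the fastest dimension: a
                        // W-contiguous vector read from NCHW cannot be written
                        // as one contiguous vector here, which is why the driver
                        // notes "v1r3 cannot do vector load input for NCHW"
                        const std::size_t src =
                            (((std::size_t)(n0 + n) * C + (c0 + c)) * Hi + (h0 + h)) * Wi +
                            (w0 + w);
                        const std::size_t dst =
                            (((std::size_t)c * HB + h) * WB + w) * NB + n;
                        block_chwn[dst] = in_nchw[src];
                    }
    }

+        // blockwise wei copy
+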
// format is [CPerBlock, KPerBlock] + const auto blockwise_wei_copy = + Blockwise2dTensorCopy3{}; + + // a series of blockwise batched GEMM + // C_matrix += transpose(A_matrix) * B_matrix + // A_matrix and B_matrix saved in LDS, C_matrix saved in register + // A_matrix[C,K] is a sub-matrix of wei_block[C,K] + // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] + // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] + constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + constexpr auto b_c_wn_block_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, + Number{}, + Number{}); + + constexpr auto c_k_wn_thread_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, + Number{}, + Number{}); + + const auto blockwise_batch_gemm = + BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2< + BlockSize, + decltype(a_c_k_block_mtx_desc), + decltype(b_c_wn_block_mtx_desc), + decltype(c_k_wn_thread_mtx_desc), + 0, + in_c_h_w_n_block_desc.GetStride(I1), + out_k_h_w_n_thread_desc.GetStride(I1), + HoPerBlock, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + HoPerThread, + GemmDataPerReadA, + GemmDataPerReadB>{}; + + // LDS: be careful of alignment + constexpr index_t in_block_space = + in_c_h_w_n_block_desc.GetElementSpace(Number{}); + constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number{}); + + __shared__ Float p_in_block[in_block_space]; + __shared__ Float p_wei_block[wei_block_space]; + + // register + // C++ lambda doesn't capture array, use pointer instead + Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()]; + Float* const p_out_thread = p_out_thread_data; + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc"); + print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc"); + + print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc"); + print_ConstantTensorDescriptor(wei_c_k_block_desc, "wei_c_k_block_desc"); + + printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space); + } +#endif + + // set threadwise output tensor to 0 + threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); + +#if 1 + const Float* p_in_global_block_offset = + p_in_global + in_n_c_h_w_global_desc.Get1dIndex( + n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); + + const Float* p_wei_global_block_offset = + p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + + for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, + p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), + p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) + { + for(index_t y = 0; y < Y; ++y) + { + for(index_t x = 0; x < X; ++x) + { +#if 1 + blockwise_in_copy_reorder.Run(p_in_global_block_offset + + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x), + p_in_block); + + blockwise_wei_copy.Run(p_wei_global_block_offset + + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0), + p_wei_block); +#else + Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; + Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + + blockwise_in_copy_reorder.RunLoadRegisterClipboard( + p_in_global_block_offset + 
in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x), + p_in_clipboard); + + blockwise_wei_copy.RunLoadRegisterClipboard( + p_wei_global_block_offset + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0), + p_wei_clipboard); + + blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block); + + blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block); + +#endif + + __syncthreads(); + + blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread); + + __syncthreads(); + } + } + } +#else + for(index_t y = 0; y < Y; ++y) + { + for(index_t x = 0; x < X; ++x) + { + const Float* p_in_global_block_offset = + p_in_global + + in_n_c_h_w_global_desc.Get1dIndex( + n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x); + + const Float* p_wei_global_block_offset = + p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin); + + for(index_t c_block_data_begin = 0; c_block_data_begin < C; + c_block_data_begin += CPerBlock, + p_in_global_block_offset += + CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), + p_wei_global_block_offset += + CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) + { +#if 0 + blockwise_in_copy_reorder.Run(p_in_global_block_offset, + p_in_block); + + blockwise_wei_copy.Run(p_wei_global_block_offset, + p_wei_block); +#else + Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; + Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + + blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, + p_in_clipboard); + blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, + p_wei_clipboard); + + blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block); + blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block); +#endif + + __syncthreads(); + + blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread); + + __syncthreads(); + } + } + } +#endif + + // output: register to global mem, + const auto c_thread_mtx_begin = + blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const index_t k_thread_data_begin = c_thread_mtx_begin.row; + const index_t ho_thread_data_begin = c_thread_mtx_begin.batch; + const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; + const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; + + static_if{}( + [&](auto f_dummy) { // f_dummy do nothing but perfect forwarding. 
Using this trick to + // make this lambda a generic lambda, so it won't be compiled until + // instantiated + static_assert((f_dummy(GemmNPerThreadSubC) <= NPerBlock && + NPerBlock % GemmNPerThreadSubC == 0), + "wrong!"); + + // output is a 10d tensor + constexpr index_t N2 = GemmNPerThreadSubC; + constexpr index_t N1 = NPerBlock / N2; + + constexpr index_t W2 = (GemmNLevel0Cluster * GemmNLevel1Cluster) / + f_dummy(NPerBlock / GemmNPerThreadSubC); + constexpr index_t W1 = WoPerBlock / W2; + + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; + +#if 0 + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); +#else + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); +#endif + + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, + "out_k_h_w_n_thread_desc"); + print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc"); + + print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, + "out_k_h_w_n_global_desc"); + print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); + } +#endif + +#if 0 + threadwise_nd_tensor_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); +#else + constexpr auto map_out_global2thread = Sequence<7, 8, 9, 0, 1, 2, 6, 3, 4, 5>{}; + + threadwise_nd_tensor_copy_reorder_given_dst2src_v2( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + out_n_k_h_w_global_desc.Get1dIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_10d_thread_desc.GetLengths(), + map_out_global2thread); + // Number{}); +#endif + }) + .else_([&](auto f_dummy) { + static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && + GemmNPerThreadSubC % NPerThread == 0, + "wrong!"); + + // output is a 10d tensor + constexpr index_t N1 = NPerBlock; + + constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; + constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; + constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); + + constexpr index_t K2 = GemmMPerThreadSubC; + constexpr index_t K1 = KPerBlock / KPerThread; + +#if 0 + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); +#else + constexpr auto out_10d_global_desc = + make_ConstantTensorDescriptor(Sequence{}); +#endif + + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, + "out_k_h_w_n_thread_desc"); + print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc"); + + print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, + "out_k_h_w_n_global_desc"); + print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); + + for(index_t i = 0; i < 64; ++i) + { + printf("out %f, ", p_out_thread[i]); + } + } +#endif + +#if 0 + 
threadwise_nd_tensor_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.Get1dIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); +#else + constexpr auto map_out_global2thread = Sequence<8, 9, 0, 1, 2, 3, 4, 5, 6, 7>{}; + + threadwise_nd_tensor_copy_reorder_given_dst2src_v2( + out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + out_n_k_h_w_global_desc.Get1dIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_10d_thread_desc.GetLengths(), + map_out_global2thread); + // Number{}); +#endif + }); + } +}; diff --git a/src/include/threadwise_4d_tensor_op.hip.hpp b/src/include/threadwise_4d_tensor_op.hip.hpp index cdd27199c2..7255650c1a 100644 --- a/src/include/threadwise_4d_tensor_op.hip.hpp +++ b/src/include/threadwise_4d_tensor_op.hip.hpp @@ -139,135 +139,6 @@ __device__ void threadwise_4d_tensor_copy_reorder_given_dst2src(SrcDesc, SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy); } -#if 0 // replaced threadwise_nd_tensor_copy -template -__device__ void threadwise_4d_tensor_copy( - SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths) -{ - auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; - - threadwise_4d_tensor_copy_reorder_given_dst2src( - SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder); -} - -// need to assume src and dst is aligned -template -__device__ void threadwise_4d_tensor_copy_v2(SrcDesc, - const Float* __restrict__ p_src, - DstDesc, - Float* __restrict__ p_dst, - SrcOpLengths, - Number) -{ - static_assert(SrcDesc{}.GetDimension() == 4 && DstDesc{}.GetDimension() == 4 && - SrcOpLengths::GetSize() == 4, - "wrong! should be 4 dimension"); - - using vector_t = typename vector_type::MemoryType; - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{}); - - static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1, - "wrong! only support stride3 == 1!\n"); - - static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, - "wrong! only support DataPerRead == 1, 2 or 4!\n"); - - static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 && - DstDesc{}.GetStride(I2) % DataPerRead == 0, - "wrong! src and dst stride should be multiple of DataPerRead to keep alignment"); - - constexpr index_t L3 = SrcOpLengths{}.Get(I3); - - static_assert(L3 % DataPerRead == 0, "wrong! 
-
-    constexpr index_t nloop_d3 = L3 / DataPerRead;
-
-    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
-    {
-        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
-        {
-            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
-            {
-                for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
-                {
-                    const index_t src_index =
-                        src_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);
-
-                    const index_t dst_index =
-                        dst_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);
-
-                    *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
-                        *(reinterpret_cast<const vector_t*>(&p_src[src_index]));
-                }
-            }
-        }
-    }
-}
-#endif
-
-template <class SrcDesc,
-          class SrcData,
-          class DstDesc,
-          class DstData,
-          class SrcOpLengths,
-          class MapDst2Src>
-__device__ void
-threadwise_4d_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
-                                                   const SrcData* __restrict__ p_src,
-                                                   DstDesc,
-                                                   DstData* __restrict__ p_dst,
-                                                   SrcOpLengths,
-                                                   MapDst2Src)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr index_t IR0 = MapDst2Src{}.Get(I0);
-    constexpr index_t IR1 = MapDst2Src{}.Get(I1);
-    constexpr index_t IR2 = MapDst2Src{}.Get(I2);
-    constexpr index_t IR3 = MapDst2Src{}.Get(I3);
-
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-
-    // ref_desc has dst_desc's ordering
-    constexpr auto ref_desc =
-        make_ConstantTensorDescriptor(SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{}));
-
-    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
-    {
-        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
-        {
-            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
-            {
-                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
-                {
-                    const auto dst_multi_id = Array<index_t, 4>{did0, did1, did2, did3};
-
-                    const auto src_multi_id =
-                        reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
-
-                    const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
-
-                    const index_t src_index = src_desc.Get1dIndex(src_multi_id);
-
-                    p_dst[dst_index] = p_src[src_index];
-                }
-            }
-        }
-    }
-}
-
 template <class Float, class Desc, class IDim, class NShift>
 __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
 {
diff --git a/src/include/threadwise_nd_tensor_op.hip.hpp b/src/include/threadwise_nd_tensor_op.hip.hpp
index b75cbd2745..b0b9a87a71 100644
--- a/src/include/threadwise_nd_tensor_op.hip.hpp
+++ b/src/include/threadwise_nd_tensor_op.hip.hpp
@@ -50,7 +50,7 @@ __device__ void threadwise_nd_tensor_copy(SrcDesc,
     constexpr index_t nRead = L_Back / DataPerRead;
 
     static_ford{}([=](auto Ids) {
-        static_for<0, nRead, 1>{}([=](auto IRead) {
+        static_for<0, nRead, 1>{}([&](auto IRead) {
             constexpr auto multi_id = decltype(Ids){}.PushBack(Number{});
 
             const index_t src_index = src_desc.Get1dIndex(multi_id);
@@ -62,3 +62,131 @@ __device__ void threadwise_nd_tensor_copy(SrcDesc,
         });
     });
 }
+
+// write in order of src
+template <class SrcDesc,
+          class SrcData,
+          class DstDesc,
+          class DstData,
+          class SrcOpLengths,
+          class MapDst2Src>
+__device__ void
+threadwise_nd_tensor_copy_reorder_given_dst2src_v1(SrcDesc,
+                                                   const SrcData* __restrict__ p_src,
+                                                   DstDesc,
+                                                   DstData* __restrict__ p_dst,
+                                                   SrcOpLengths,
+                                                   MapDst2Src)
+{
+    constexpr auto src_desc = SrcDesc{};
+    constexpr auto dst_desc = DstDesc{};
+
+    ford<SrcOpLengths>{}([&](auto src_multi_id) {
+        const auto dst_multi_id = reorder_array_given_new2old(src_multi_id, MapDst2Src{});
+
+        const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
+
+        const index_t src_index = src_desc.Get1dIndex(src_multi_id);
+
+        p_dst[dst_index] = p_src[src_index];
+    });
+}
+
+// write in order of dst
+template <class SrcDesc,
+          class SrcData,
+          class DstDesc,
+          class DstData,
+          class SrcOpLengths,
+          class MapDst2Src>
+__device__ void
+threadwise_nd_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
+                                                   const SrcData* __restrict__ p_src,
+                                                   DstDesc,
+                                                   DstData* __restrict__ p_dst,
+                                                   SrcOpLengths,
+                                                   MapDst2Src)
+{
+    constexpr auto src_desc = SrcDesc{};
+    constexpr auto dst_desc = DstDesc{};
+
+    constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});
+
+    ford<decltype(dst_op_lengths)>{}([&](auto dst_multi_id) {
+        const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
+
+        const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
+
+        const index_t src_index = src_desc.Get1dIndex(src_multi_id);
+
+        p_dst[dst_index] = p_src[src_index];
+    });
+}
+
+// write in order of dst
+template <class Float,
+          class SrcDesc,
+          class DstDesc,
+          class SrcOpLengths,
+          class MapDst2Src,
+          index_t DstDataPerWrite>
+__device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v3(SrcDesc,
+                                                                   const Float* __restrict__ p_src,
+                                                                   DstDesc,
+                                                                   Float* __restrict__ p_dst,
+                                                                   SrcOpLengths,
+                                                                   MapDst2Src,
+                                                                   Number<DstDataPerWrite>)
+{
+    using vector_t = typename vector_type<Float, DstDataPerWrite>::MemoryType;
+
+    constexpr index_t nDim = SrcOpLengths::GetSize();
+
+    static_assert(DstDataPerWrite == 1 || DstDesc{}.GetStride(Number<nDim - 1>{}) == 1,
+                  "wrong! only support dst.stride[nDim-1] == 1, if DstDataPerWrite != 1");
+
+    static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
+                  "wrong! only support DstDataPerWrite == 1, 2 or 4");
+
+    static_assert(
+        DstDesc{}.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
+        "wrong! dst.stride[nDim-2] should be multiple of DstDataPerWrite to keep alignment");
+
+    constexpr auto src_desc = SrcDesc{};
+    constexpr auto dst_desc = DstDesc{};
+
+    constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});
+
+    constexpr index_t L_Dst_Back = dst_op_lengths.Back();
+
+    static_assert(L_Dst_Back % DstDataPerWrite == 0,
+                  "wrong! dst.lengths[nDim-1] should be evenly divided by DstDataPerWrite");
+
+    constexpr index_t nWrite = L_Dst_Back / DstDataPerWrite;
+
+    ford<decltype(dst_op_lengths.PopBack())>{}([&](auto ids) {
+        static_for<0, nWrite, 1>{}([&](auto IWrite) {
+            vector_t dst_vec_data;
+
+            // pack data
+            static_for<0, DstDataPerWrite, 1>{}([&](auto IDstData) {
+                const auto dst_multi_id =
+                    ids.PushBack(IWrite.Get() * DstDataPerWrite + IDstData.Get());
+
+                const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
+
+                const index_t src_index = src_desc.Get1dIndex(src_multi_id);
+
+                vector_type<Float, DstDataPerWrite>::SetScalar(
+                    dst_vec_data, p_src[src_index], IDstData);
+            });
+
+            // write data
+            const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite);
+
+            const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
+
+            *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) = dst_vec_data;
+        });
+    });
+}
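A note on the MapDst2Src convention used by the v1/v2/v3 copies above: as I read the code, MapDst2Src[d] names the src dimension that feeds dst dimension d, so v1 walks the copy space in src order ("write in order of src", scattered writes) while v2 walks it in dst order, keeping the destination writes sequential. The following host-side sketch, in plain standard C++ with illustrative names that are not from this patch, shows the two index reorderings that reorder_array_given_new2old and reorder_array_given_old2new perform for a rank-3 case:

#include <array>
#include <cstdio>

// map_dst2src[d] = the src dimension that feeds dst dimension d
// (assumed reading of the MapDst2Src convention; rank fixed at 3 for brevity).
using Map3 = std::array<int, 3>;

// "new2old" direction, as in v1: given a src multi-index, where does each
// element land in dst?  dst_id[d] = src_id[map_dst2src[d]].
std::array<int, 3> dst_from_src(const std::array<int, 3>& src_id, const Map3& map_dst2src)
{
    std::array<int, 3> dst_id{};
    for(int d = 0; d < 3; ++d)
        dst_id[d] = src_id[map_dst2src[d]];
    return dst_id;
}

// "old2new" direction, as in v2: given a dst multi-index, which src element
// is needed?  src_id[map_dst2src[d]] = dst_id[d].
std::array<int, 3> src_from_dst(const std::array<int, 3>& dst_id, const Map3& map_dst2src)
{
    std::array<int, 3> src_id{};
    for(int d = 0; d < 3; ++d)
        src_id[map_dst2src[d]] = dst_id[d];
    return src_id;
}

int main()
{
    const Map3 map{2, 0, 1}; // dst dim 0 reads src dim 2, and so on
    const std::array<int, 3> src_id{4, 5, 6};

    const auto dst_id = dst_from_src(src_id, map);
    std::printf("dst_id = {%d, %d, %d}\n", dst_id[0], dst_id[1], dst_id[2]); // {6, 4, 5}

    const auto back = src_from_dst(dst_id, map);
    std::printf("src_id = {%d, %d, %d}\n", back[0], back[1], back[2]); // {4, 5, 6}
}

The two directions are inverses of each other, which is why the gridwise kernel can describe the global-to-thread relationship once (map_out_global2thread) and pick the iteration order purely for write locality.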
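The v3 variant layers vectorized stores on top of the v2 ordering: it gathers DstDataPerWrite scalars through the reordered source indices, packs them via vector_type<Float, DstDataPerWrite>::SetScalar, and issues one wide store per group; the static_asserts guarantee that the last dst dimension is contiguous, that the write width is 1, 2, or 4, and that the next-to-last stride keeps every wide store aligned. Here is a plain-C++ sketch of that pack-then-store step (illustrative only; the kernel casts the aligned pointer to vector_t, while this sketch uses std::memcpy to stay within standard aliasing rules):

#include <cstddef>
#include <cstring>

struct float4_t // stand-in for vector_type<float, 4>::MemoryType
{
    float v[4];
};

// Read 4 floats spaced 'src_stride' elements apart and write them out as one
// contiguous 16-byte chunk, mirroring v3's "pack data" / "write data" steps.
void packed_write4(const float* src, std::size_t src_stride, float* dst)
{
    float4_t vec;
    for(int i = 0; i < 4; ++i) // "pack data" loop, cf. the inner static_for in v3
        vec.v[i] = src[i * src_stride];

    std::memcpy(dst, &vec, sizeof(vec)); // "write data": one wide store
}

Packing in registers first is what lets the copy keep v2's reordered gather on the read side while still presenting the memory system with full-width, aligned writes on the store side.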
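Finally, on the "make this lambda a generic lambda, so it won't be compiled until instantiated" comment in the gridwise kernel: static_if hands the branch lambda a dummy identity functor, so any expression written as f_dummy(expr) becomes type-dependent, and the body of the untaken branch, including its static_asserts, is never instantiated. Below is a minimal, self-contained reconstruction of the pattern; the shape of static_if here is an assumption for illustration, and the repository's own utility may differ in detail:

#include <cstdio>

struct identity
{
    template <class T>
    constexpr T operator()(T x) const
    {
        return x; // f_dummy(x) == x, but the call is type-dependent
    }
};

template <bool Cond>
struct static_if;

template <>
struct static_if<true>
{
    template <class F>
    const static_if& then_(F f) const
    {
        f(identity{}); // instantiates the generic lambda's body
        return *this;
    }
    template <class F>
    const static_if& else_(F) const
    {
        return *this; // lambda body never instantiated
    }
};

template <>
struct static_if<false>
{
    template <class F>
    const static_if& then_(F) const
    {
        return *this;
    }
    template <class F>
    const static_if& else_(F f) const
    {
        f(identity{});
        return *this;
    }
};

template <int NPerBlock, int GemmNPerThreadSubC>
void choose_output_copy_path() // hypothetical dispatcher, for illustration
{
    static_if<(GemmNPerThreadSubC <= NPerBlock)>{}
        .then_([&](auto f_dummy) {
            // Only checked when this branch is taken, because f_dummy makes
            // the condition dependent on the lambda's deduced parameter type.
            static_assert(f_dummy(GemmNPerThreadSubC) <= NPerBlock, "wrong!");
            std::printf("N is split across threads\n");
        })
        .else_([&](auto f_dummy) {
            static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock, "wrong!");
            std::printf("whole NPerBlock per thread\n");
        });
}

int main()
{
    choose_output_copy_path<16, 4>(); // takes then_; else_ body never compiled
    choose_output_copy_path<2, 4>();  // takes else_
}

Without f_dummy, both static_asserts would be evaluated as soon as the enclosing function template is instantiated, and the branch not taken would still fail the build; the identity wrapper is what defers evaluation to the branch that static_if actually invokes.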