From 2c9b8c2432ffe2eceba32d07ce8b0e467dd4538e Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 12 Mar 2019 17:20:11 -0500 Subject: [PATCH] update hip build --- driver/driver.hip.cpp | 14 ++++---- src/include/blockwise_4d_tensor_op.hip.hpp | 9 ++--- .../blockwise_direct_convolution.hip.hpp | 27 +++++++------- src/include/blockwise_gemm.hip.hpp | 25 +++++++------ src/include/common.hip.hpp | 4 ++- src/include/config.h.in | 1 - .../gridwise_direct_convolution_1.hip.hpp | 19 +++++----- .../gridwise_direct_convolution_2.hip.hpp | 36 ++++++++++--------- ..._gemm_convolution_1_chwn_cyxk_khwn.hip.hpp | 28 ++++++++------- ...onvolution_1_chwn_cyxk_khwn_padded.hip.hpp | 9 ++--- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 4 +-- src/include/tensor.hpp | 3 +- 12 files changed, 97 insertions(+), 82 deletions(-) diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 2cd1ac4b24..325da5d1dc 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -47,7 +47,7 @@ struct GeneratorTensor_3 std::initializer_list ids = {static_cast(is)...}; std::vector lens(sizeof...(Is), 100); std::vector strides(sizeof...(Is), 1); - std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1); + std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is)-1), strides.rbegin() + 1); return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1; #endif } @@ -353,7 +353,7 @@ void host_winograd_3x3_convolution( std::size_t ho = HoPerTile * htile + j; for(int i = 0; i < WoPerTile; ++i) { - std::size_t wo = WoPerTile * wtile + i; + std::size_t wo = WoPerTile * wtile + i; out(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } @@ -406,13 +406,13 @@ int main(int argc, char* argv[]) constexpr unsigned WPad = 0; #elif 1 // 3x3, 34x34 - constexpr unsigned N = 64; - constexpr unsigned C = 256; + constexpr unsigned N = 64; + constexpr unsigned C = 256; constexpr unsigned HI = 34; constexpr unsigned WI = 34; - constexpr unsigned K = 64; - constexpr unsigned Y = 3; - constexpr unsigned X = 3; + constexpr unsigned K = 64; + constexpr unsigned Y = 3; + constexpr unsigned X = 3; constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index cc50d9eecd..9087364b71 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -245,10 +245,11 @@ struct BlockwiseChwnTensorCopyPadded constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; const Float* p_src_tmp = - p_src + src_desc.Get1dIndex(c_block_data_begin, - (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, - (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, - n_block_data_begin); + p_src + + src_desc.Get1dIndex(c_block_data_begin, + (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, + (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, + n_block_data_begin); #if 0 if(get_thread_local_1d_id() == 0) diff --git a/src/include/blockwise_direct_convolution.hip.hpp b/src/include/blockwise_direct_convolution.hip.hpp index 247ff219f6..7666607c9c 100644 --- a/src/include/blockwise_direct_convolution.hip.hpp +++ b/src/include/blockwise_direct_convolution.hip.hpp @@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, Float p_out_thread[out_thread_desc.GetElementSpace()]; threadwise_4d_tensor_copy(out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc, p_out_thread, out_thread_desc.GetLengths()); @@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, // threadwise convolution threadwise_direct_convolution_2( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data_begin, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data_begin, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), @@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, threadwise_4d_tensor_copy(out_thread_desc, p_out_thread, out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc.GetLengths()); } } diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 33556dde25..b44bb797b3 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 constexpr unsigned MRepeat = MPerThread / MPerThreadSubC; constexpr unsigned NRepeat = NPerThread / NPerThreadSubC; - // loop over k +// loop over k #pragma unroll for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) { - // read first batch of A, B - // copy A-sub to form A +// read first batch of A, B +// copy A-sub to form A #pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { @@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 a_thread_sub_mtx.GetLengths()); } - // copy B-sub to form B +// copy B-sub to form B #pragma unroll for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { @@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 b_thread_sub_mtx.GetLengths()); } - // loop over batch +// loop over batch #pragma unroll for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib) { @@ -557,8 +557,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 { threadwise_matrix_copy( c_thread_sub_mtx, - p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, - n_repeat * NPerLevel1Cluster), + p_c_thread + + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, + n_repeat * NPerLevel1Cluster), c_block_mtx, p_c_block + c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, @@ -656,8 +657,9 @@ struct BlockwiseGemmBlockABlockBThreadC constexpr unsigned NClusterWork = (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster); - static_assert(BlockSize == (MClusterWork * MThreadPerCluster) * - (NClusterWork * NThreadPerCluster), + static_assert(BlockSize == + (MClusterWork * MThreadPerCluster) * + (NClusterWork * NThreadPerCluster), "wrong! wrong BlockSize"); if(DistributeThreadAlongColumnFirst) @@ -1256,8 +1258,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), c_thread_sub_mtx, False, - p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, - n_repeat * NPerThreadSubC), + p_c_thread + + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, + n_repeat * NPerThreadSubC), f_accum); } } diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index 2df008fcad..13cab61b46 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -65,7 +65,7 @@ struct vector_type }; #endif -#if 1 +#if 0 template <> struct vector_type { @@ -139,6 +139,7 @@ struct Sequence } }; +#if DEVICE_BACKEND_CUDA template __host__ __device__ constexpr T max(T a, T b) { @@ -150,6 +151,7 @@ __host__ __device__ constexpr T min(T a, T b) { return a < b ? a : b; } +#endif __host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b) { diff --git a/src/include/config.h.in b/src/include/config.h.in index 9ee0c41f80..1c6e325ff9 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in @@ -4,7 +4,6 @@ #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" -#include "half.hpp" #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" #include "nvToolsExt.h" diff --git a/src/include/gridwise_direct_convolution_1.hip.hpp b/src/include/gridwise_direct_convolution_1.hip.hpp index f4fe1809fc..edcfd6d38e 100644 --- a/src/include/gridwise_direct_convolution_1.hip.hpp +++ b/src/include/gridwise_direct_convolution_1.hip.hpp @@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ c_block_work_begin += CPerBlock) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, - c_block_work_begin, - hi_block_work_begin, - wi_block_work_begin), + blockwise_in_copy.Run(p_in_global + + in_global_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), p_in_block); // copy weight tensor to LDS @@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ } // copy output tensor from LDS to device mem - blockwise_out_copy.Run(p_out_block, - p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, - k_block_work_begin, - ho_block_work_begin, - wo_block_work_begin)); + blockwise_out_copy.Run( + p_out_block, + p_out_global + + out_global_desc.Get1dIndex( + n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); } diff --git a/src/include/gridwise_direct_convolution_2.hip.hpp b/src/include/gridwise_direct_convolution_2.hip.hpp index 13f9e6cf1d..bb65f1d156 100644 --- a/src/include/gridwise_direct_convolution_2.hip.hpp +++ b/src/include/gridwise_direct_convolution_2.hip.hpp @@ -139,10 +139,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + + in_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS @@ -158,10 +159,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ #if 1 threadwise_direct_convolution_2( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_thread_desc, @@ -169,10 +171,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ #elif 0 threadwise_direct_convolution_3( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_thread_desc, @@ -186,9 +189,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ out_thread_desc, p_out_thread, out_global_desc, - p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp index 1caef669e9..99342d3ca1 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp @@ -184,8 +184,9 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread); const Float* p_in_global_block_begin = - p_in_global + in_chwn_global_desc.Get1dIndex( - 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); + p_in_global + + in_chwn_global_desc.Get1dIndex( + 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_begin = p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric } } - // output: register to global mem, +// output: register to global mem, #if 0 const auto c_thread_mtx_begin = blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); @@ -286,16 +287,17 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric } #endif - threadwise_8d_tensor_copy(out_8d_thread_desc, - p_out_thread, - out_8d_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_8d_thread_desc.GetLengths(), - Number{}); + threadwise_8d_tensor_copy( + out_8d_thread_desc, + p_out_thread, + out_8d_global_desc, + p_out_global + + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_8d_thread_desc.GetLengths(), + Number{}); } else if(NPerThread == NPerBlock) { diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp index a4904cdf58..790a006023 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp @@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( out_hkwn_thread_desc, p_out_thread, out_khwn_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), + p_out_global + + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_khwn_from_hkwn); } diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 7c802266d8..37774d360f 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -121,7 +121,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b decltype(in_cb_block_desc), decltype(in_cb_block_desc.GetLengths())>{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3