From 68ae0731f165fd665efebf13a39f1f538c118bbf Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Fri, 22 Mar 2019 19:40:01 -0500
Subject: [PATCH] experiment with hip compiler

---
 build/cmake-hip.sh                            |   3 +
 driver/driver.hip.cpp                         |   4 +-
 src/include/blockwise_gemm.hip.hpp            | 216 ++++++++++++------
 ..._gemm_convolution_1_chwn_cyxk_khwn.hip.hpp |  13 +-
 4 files changed, 157 insertions(+), 79 deletions(-)

diff --git a/build/cmake-hip.sh b/build/cmake-hip.sh
index e418ea2fc1..3880c67a17 100755
--- a/build/cmake-hip.sh
+++ b/build/cmake-hip.sh
@@ -11,6 +11,9 @@ cmake -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D CMAKE_BUILD_TYPE=Release \
 -D DEVICE_BACKEND="HIP" \
+-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only" \
+-D CMAKE_CXX_FLAGS="-gline-tables-only" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+-D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep" \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 ${MY_PROJECT_SOURCE}
diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp
index e178de95cd..d38161c078 100644
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -593,9 +593,9 @@ int main(int argc, char* argv[])
     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
 #elif 1
-    // 1x1 filter, 14x14 image, C = 256
+    // 1x1 filter, 14x14 image, C = 512
     constexpr unsigned N = 128;
-    constexpr unsigned C = 256;
+    constexpr unsigned C = 512;
     constexpr unsigned HI = 14;
     constexpr unsigned WI = 14;
     constexpr unsigned K = 512;
diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp
index 9a18ca5fd7..36e80641f2 100644
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -435,12 +435,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
 #pragma unroll
         for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
         {
-            // read first batch of A, B
-            // copy A-sub to form A
-            //#pragma unroll
+// read first batch of A, B
+// copy A-sub to form A
+#pragma unroll
             for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
             {
-#if 0
                 threadwise_matrix_copy(
                     a_block_mtx,
                     p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
@@ -448,25 +447,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                     a_thread_mtx,
                     p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
                     a_thread_sub_mtx.GetLengths());
-#else
-                for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
-                {
-                    for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
-                    {
-                        p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
-                            p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
-                                                             m_repeat * MPerLevel1Cluster + j) +
-                                      mMyThreadOffsetA];
-                    }
-                }
-#endif
             }
 
-            // copy B-sub to form B
-            //#pragma unroll
+// copy B-sub to form B
+#pragma unroll
             for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
-#if 0
                 threadwise_matrix_copy(
                     b_block_mtx,
                     p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
@@ -474,26 +460,13 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                     b_thread_mtx,
                     p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                     b_thread_sub_mtx.GetLengths());
-#else
-                for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
-                {
-                    for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
-                    {
-                        p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] =
-                            p_b_block[b_block_mtx.Get1dIndex(k_begin + i,
-                                                             n_repeat * MPerLevel1Cluster + j) +
-                                      mMyThreadOffsetB];
-                    }
-                }
-#endif
             }
 
-            // loop over batch
-            //#pragma unroll
+// loop over batch
+#pragma unroll
             for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
             {
-// do current batch of gemm
-#if 0
+                // do current batch of gemm
                 threadwise_gemm(a_thread_mtx,
                                 True,
                                 p_a_thread,
@@ -504,7 +477,140 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                                 False,
                                 p_c_thread + ib * ThreadMatrixStrideC,
                                 f_accum);
-#else
+
+                // read next batch of a, b
+                if(BlockMatrixStrideA != 0)
+                {
+#pragma unroll
+                    for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
+                    {
+                        threadwise_matrix_copy(
+                            a_block_mtx,
+                            p_a_block +
+                                a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                                (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA,
+                            a_thread_mtx,
+                            p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                            a_thread_sub_mtx.GetLengths());
+                    }
+                }
+
+                if(BlockMatrixStrideB != 0)
+                {
+#pragma unroll
+                    for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
+                    {
+                        threadwise_matrix_copy(
+                            b_block_mtx,
+                            p_b_block +
+                                b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                                (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB,
+                            b_thread_mtx,
+                            p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                            b_thread_sub_mtx.GetLengths());
+                    }
+                }
+            }
+
+            // do last batch of gemm
+            threadwise_gemm(a_thread_mtx,
+                            True,
+                            p_a_thread,
+                            b_thread_mtx,
+                            False,
+                            p_b_thread,
+                            c_thread_mtx,
+                            False,
+                            p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
+                            f_accum);
+        }
+    }
+
+    // this version puts copy and compute in the same place, experimenting with compiler behaviour
+    template <class FloatA, class FloatB, class FloatC, class Accumulator>
+    __device__ void Run_v2(const FloatA* __restrict__ p_a_block,
+                           const FloatB* __restrict__ p_b_block,
+                           FloatC* __restrict__ p_c_thread,
+                           Accumulator f_accum) const
+    {
+        constexpr auto True = integral_constant<bool, true>{};
+        constexpr auto False = integral_constant<bool, false>{};
+
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
+
+        constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
+
+        constexpr unsigned MPerThread = c_thread_mtx.NRow();
+        constexpr unsigned NPerThread = c_thread_mtx.NCol();
+
+        // thread A, B for GEMM
+        // A is transposed, b is not
+        constexpr auto a_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
+
+        // thread A-sub, B-sub for copy
+        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+
+        FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
+        FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
+
+        constexpr unsigned MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
+        constexpr unsigned NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
+
+        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
+        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
+
+        // loop over k
+        //#pragma unroll
+        for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
+        {
+            // read first batch of A, B
+            // copy A-sub to form A
+            //#pragma unroll
+            for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
+            {
+                for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
+                {
+                    for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
+                    {
+                        p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
+                            p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
+                                                             m_repeat * MPerLevel1Cluster + j) +
+                                      mMyThreadOffsetA];
+                    }
} + } + + // copy B-sub to form B + //#pragma unroll + for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) + { + for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j) + { + p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] = + p_b_block[b_block_mtx.Get1dIndex(k_begin + i, + n_repeat * MPerLevel1Cluster + j) + + mMyThreadOffsetB]; + } + } + } + + // loop over batch + //#pragma unroll + for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib) + { + // do current batch of gemm for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k) { for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i) @@ -521,7 +627,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 } } } -#endif // read next batch of a, b if(BlockMatrixStrideA != 0) @@ -529,16 +634,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 //#pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { -#if 0 - threadwise_matrix_copy( - a_block_mtx, - p_a_block + - a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) + - (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA, - a_thread_mtx, - p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), - a_thread_sub_mtx.GetLengths()); -#else for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i) { for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j) @@ -550,7 +645,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA]; } } -#endif } } @@ -559,16 +653,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 //#pragma unroll for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { -#if 0 - threadwise_matrix_copy( - b_block_mtx, - p_b_block + - b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) + - (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB, - b_thread_mtx, - p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), - b_thread_sub_mtx.GetLengths()); -#else for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i) { for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j) @@ -580,24 +664,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB]; } } -#endif } } } -// do last batch of gemm -#if 0 - threadwise_gemm(a_thread_mtx, - True, - p_a_thread, - b_thread_mtx, - False, - p_b_thread, - c_thread_mtx, - False, - p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC, - f_accum); -#else + // do last batch of gemm for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k) { for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i) @@ -613,7 +684,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 } } } -#endif } } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp index 99342d3ca1..292a2f16eb 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp @@ -209,10 +209,15 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric { for(unsigned x = 0; x < X; ++x) { - blockwise_batch_gemm.Run(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0), - p_out_thread, - [](auto& acc, const auto&& v) { acc += v; }); +#if 1 + blockwise_batch_gemm.Run +#elif 0 + blockwise_batch_gemm.Run_v2 +#endif + (p_wei_block + 
+                         wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                     p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0),
+                     p_out_thread,
+                     [](auto& acc, const auto&& v) { acc += v; });
             }
         }
     }
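
Note (not part of the patch): with this change Run() prefetches the next batch of A and B through threadwise_matrix_copy while the current batch is computed by threadwise_gemm, and the new Run_v2 spells out the same copy and accumulate steps as plain loops so the two forms can be compared under the hip compiler. Below is a standalone, host-side C++ sketch of that per-thread copy-then-accumulate pattern; the tile sizes, array layout, and names are illustrative stand-ins, not the kernel's real matrix descriptors or launch configuration.

// Host-side sketch of the copy-then-accumulate pattern that Run/Run_v2 toggle between.
// Tile sizes and layouts below are assumptions for illustration only.
#include <array>
#include <cstdio>

constexpr unsigned KPerBlock = 8, MPerThread = 4, NPerThread = 4;

int main()
{
    // stand-ins for the LDS block tiles of A (transposed: K x M) and B (K x N)
    std::array<float, KPerBlock * MPerThread> a_block{};
    std::array<float, KPerBlock * NPerThread> b_block{};
    for(unsigned i = 0; i < a_block.size(); ++i) a_block[i] = 0.01f * i;
    for(unsigned i = 0; i < b_block.size(); ++i) b_block[i] = 0.02f * i;

    std::array<float, MPerThread * NPerThread> c_thread{}; // per-thread accumulator tile

    // accumulation functor, mirroring the lambda passed to blockwise_batch_gemm.Run
    auto f_accum = [](float& acc, float v) { acc += v; };

    for(unsigned k = 0; k < KPerBlock; ++k)
    {
        // "copy" step: stage one row of A and B into thread-local storage
        // (registers in the real kernel)
        float a_thread[MPerThread];
        float b_thread[NPerThread];
        for(unsigned m = 0; m < MPerThread; ++m)
            a_thread[m] = a_block[k * MPerThread + m];
        for(unsigned n = 0; n < NPerThread; ++n)
            b_thread[n] = b_block[k * NPerThread + n];

        // "compute" step: rank-1 update of the C tile with the staged row pair
        for(unsigned i = 0; i < MPerThread; ++i)
            for(unsigned j = 0; j < NPerThread; ++j)
                f_accum(c_thread[i * NPerThread + j], a_thread[i] * b_thread[j]);
    }

    std::printf("c[0][0] = %f\n", c_thread[0]);
    return 0;
}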