From fdb7d416618e4329d3a0dbe33d43cc8d078b5a41 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 17:58:44 -0500 Subject: [PATCH 1/7] cleaning up dead code [ROCm/composable_kernel commit: bdbc0eaad175cd4054105cfb3fc812a8526e3b49] --- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 2 +- driver/driver.hip.cpp | 2 +- {build => script}/cmake-cuda.sh | 0 {build => script}/cmake-hip.sh | 0 src/include/blockwise_batched_gemm.hip.hpp | 455 ------------------ src/include/blockwise_gemm.hip.hpp | 217 +-------- 6 files changed, 6 insertions(+), 670 deletions(-) rename {build => script}/cmake-cuda.sh (100%) rename {build => script}/cmake-hip.sh (100%) diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index 497aa3e9c1..a3489bc8cc 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -221,7 +221,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t BlockSize = 128; #elif 0 - // 1x1, 14x14, Vega 20, hack CPerBlock = 1 + // 1x1, 14x14, Vega 20, hack CPerBlock = 1 for debugging constexpr index_t BPerBlock = 64; constexpr index_t KPerBlock = 128; constexpr index_t CPerBlock = 1; diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 0ea091e607..a83e4082c7 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -580,7 +580,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 1x1 filter, 14x14 image, C = 2048 constexpr index_t N = 128; constexpr index_t C = 2048; diff --git a/build/cmake-cuda.sh b/script/cmake-cuda.sh similarity index 100% rename from build/cmake-cuda.sh rename to script/cmake-cuda.sh diff --git a/build/cmake-hip.sh b/script/cmake-hip.sh similarity index 100% rename from build/cmake-hip.sh rename to script/cmake-hip.sh diff --git 
a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp index bf2777f140..deba68a261 100644 --- a/src/include/blockwise_batched_gemm.hip.hpp +++ b/src/include/blockwise_batched_gemm.hip.hpp @@ -1,231 +1,6 @@ #pragma once #include "threadwise_gemm.hip.hpp" -template -struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC -{ - index_t mMyThreadOffsetA = 0; - index_t mMyThreadOffsetB = 0; - - struct MatrixIndex - { - index_t batch; - index_t row; - index_t col; - }; - - __device__ Blockwise1dStridedBatchedGemmBlockABlockBThreadC() - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - - const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - mMyThreadOffsetA = c_thread_mtx_index.batch * BlockMatrixStrideA + - ((!TransA) ? a_block_mtx.Get1dIndex(c_thread_mtx_index.row, 0) - : a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row)); - - mMyThreadOffsetB = c_thread_mtx_index.batch * BlockMatrixStrideB + - ((!TransB) ? b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col) - : b_block_mtx.Get1dIndex(c_thread_mtx_index.col, 0)); - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantMatrixDescriptor(BlockMatrixA{}, "a_block_mtx: "); - print_ConstantMatrixDescriptor(BlockMatrixB{}, "b_block_mtx: "); - print_ConstantMatrixDescriptor(ThreadMatrixC{}, "c_thread_mtx: "); - - printf("%u %u, %u %u %u, %u %u\n", - get_block_1d_id(), - get_thread_local_1d_id(), - c_thread_mtx_index.batch, - c_thread_mtx_index.row, - c_thread_mtx_index.col, - mMyThreadOffsetA, - mMyThreadOffsetB); - } -#endif - } - - __device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const - { - - if(TransA && (!TransB) && (!TransC)) - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - - static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(), - "wrong! 
k dimension not consistent!"); - - constexpr index_t MPerBlock = a_block_mtx.NCol(); - constexpr index_t NPerBlock = b_block_mtx.NCol(); - - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - // divide thread work - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - static_assert(BatchSize % BatchPerThread == 0, "BatchSize % BatchPerThread != 0"); - static_assert(MPerBlock % MPerThread == 0, "MPerBlock % MPerThread != 0"); - static_assert(NPerBlock % NPerThread == 0, "NPerBlock % NPerThread != 0"); - - constexpr index_t BatchThreadWork = (BatchSize + BatchPerThread - 1) / BatchPerThread; - constexpr index_t MThreadWork = (MPerBlock + MPerThread - 1) / MPerThread; - constexpr index_t NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; - - static_assert(BlockSize == BatchThreadWork * MThreadWork * NThreadWork, - "wrong! wrong BlockSize"); - - if(DistributeThreadAlongColumnFirst) - { - // num of operations can be reduced - const index_t b_work_id = thread_id / (MThreadWork * NThreadWork); - index_t itmp = thread_id - b_work_id * (MThreadWork * NThreadWork); - const index_t m_work_id = itmp / NThreadWork; - const index_t n_work_id = itmp - m_work_id * NThreadWork; - - return MatrixIndex{ - b_work_id * BatchPerThread, m_work_id * MPerThread, n_work_id * NPerThread}; - } - else - { - // not implemented - assert(false); - } - } - else - { - // not implemented - assert(false); - } - } - - // this should be optimized away if input is known - __device__ static MatrixIndex - GetDistanceFromBeginOfThreadMatrixC(index_t batch_in_c, index_t m_in_c, index_t n_in_c) - { - return MatrixIndex{batch_in_c, m_in_c, n_in_c}; - } - - template - __device__ void Run(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread, - Accumulator f_accum) const - { - if(TransA && (!TransB) && (!TransC)) - { - constexpr auto True = integral_constant{}; - constexpr auto False = 
integral_constant{}; - - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - constexpr index_t KPerBlock = a_block_mtx.NRow(); // A is transposed - - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - // a is transposed, b is not - constexpr auto a_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; - FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - - // loop over k - for(index_t k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) - { - // read first batch of a, b - threadwise_matrix_copy(a_block_mtx, - p_a_block + mMyThreadOffsetA + - k_begin * a_block_mtx.RowStride(), - a_thread_mtx, - p_a_thread, - a_thread_mtx.GetLengths()); - - threadwise_matrix_copy(b_block_mtx, - p_b_block + mMyThreadOffsetB + - k_begin * b_block_mtx.RowStride(), - b_thread_mtx, - p_b_thread, - b_thread_mtx.GetLengths()); - - // loop over batch - for(index_t ib = 0; ib + 1 < BatchPerThread; ++ib) - { - // do current batch of gemm - threadwise_gemm(a_thread_mtx, - True, - p_a_thread, - b_thread_mtx, - False, - p_b_thread, - c_thread_mtx, - False, - p_c_thread + ib * ThreadMatrixStrideC, - f_accum); - - // read next batch of a, b - if(BlockMatrixStrideA != 0) - { - threadwise_matrix_copy(a_block_mtx, - p_a_block + mMyThreadOffsetA + - (ib + 1) * BlockMatrixStrideA + - +k_begin * a_block_mtx.RowStride(), - a_thread_mtx, - p_a_thread, - a_thread_mtx.GetLengths()); - } - - if(BlockMatrixStrideB != 0) - { - threadwise_matrix_copy(b_block_mtx, - p_b_block + mMyThreadOffsetB + - (ib + 1) * BlockMatrixStrideB + - k_begin * b_block_mtx.RowStride(), - b_thread_mtx, - p_b_thread, - b_thread_mtx.GetLengths()); - } - } - - // do last batch of gemm - threadwise_gemm(a_thread_mtx, - True, - 
p_a_thread, - b_thread_mtx, - False, - p_b_thread, - c_thread_mtx, - False, - p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC, - f_accum); - } - } - } -}; - template - __device__ void Run_v3(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread, - Accumulator f_accum) const - { - constexpr auto True = integral_constant{}; - constexpr auto False = integral_constant{}; - - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - constexpr index_t KPerBlock = a_block_mtx.NRow(); // A is transposed - - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - // thread A, B for GEMM - // A is transposed, b is not - constexpr auto a_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - // thread A-sub, B-sub for copy - constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; - FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - - constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster; - constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; - - constexpr index_t MRepeat = MPerThread / MPerThreadSubC; - constexpr index_t NRepeat = NPerThread / NPerThreadSubC; - - // loop over k - //#pragma unroll - for(index_t k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) - { - // read first batch of A, B - // copy A-sub to form A - //#pragma unroll - for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) - { - for(index_t i = 0; i < a_thread_sub_mtx.NRow(); ++i) - { -#if 1 - for(index_t j = 0; j < 
a_thread_sub_mtx.NCol(); ++j) - { - p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] = - p_a_block[a_block_mtx.Get1dIndex(k_begin + i, - m_repeat * MPerLevel1Cluster + j) + - mMyThreadOffsetA]; - } -#else - static_assert(a_thread_sub_mtx.NCol() == 4, "asm only read 4xfp32"); - -#endif - } - } - - // copy B-sub to form B - //#pragma unroll - for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat) - { - for(index_t i = 0; i < b_thread_sub_mtx.NRow(); ++i) - { - for(index_t j = 0; j < b_thread_sub_mtx.NCol(); ++j) - { - p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] = - p_b_block[b_block_mtx.Get1dIndex(k_begin + i, - n_repeat * MPerLevel1Cluster + j) + - mMyThreadOffsetB]; - } - } - } - - // loop over batch - //#pragma unroll - for(index_t ib = 0; ib + 1 < BatchPerThread; ++ib) - { - // do current batch of gemm - for(index_t k = 0; k < a_thread_mtx.NRow(); ++k) - { -#if 0 - for(index_t i = 0; i < c_thread_mtx.NRow(); ++i) - { - for(index_t j = 0; j < c_thread_mtx.NCol(); ++j) - { - const index_t aindex = - a_thread_mtx.Get1dIndex(k, i); // A is transposed - const index_t bindex = b_thread_mtx.Get1dIndex(k, j); - const index_t cindex = - c_thread_mtx.Get1dIndex(i, j) + ib * ThreadMatrixStrideC; - - f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); - } - } -#elif 1 - static_assert(c_thread_mtx.NRow() == 16 && c_thread_mtx.NCol() == 4, - "asm is only for 16x4"); - - const index_t bindex = b_thread_mtx.Get1dIndex(k, 0); - for(index_t i = 0; i < c_thread_mtx.NRow(); ++i) - { - const index_t aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed - const index_t cindex = c_thread_mtx.Get1dIndex(i, 0); - - asm volatile("\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : "=v"(p_c_thread[cindex + 0]), - "=v"(p_c_thread[cindex + 1]), - "=v"(p_c_thread[cindex + 2]), - "=v"(p_c_thread[cindex + 3]) - : "v"(p_a_thread[aindex]), - 
"v"(p_b_thread[bindex + 0]), - "v"(p_b_thread[bindex + 1]), - "v"(p_b_thread[bindex + 2]), - "v"(p_b_thread[bindex + 3]), - "0"(p_c_thread[cindex + 0]), - "1"(p_c_thread[cindex + 1]), - "2"(p_c_thread[cindex + 2]), - "3"(p_c_thread[cindex + 3])); - } -#endif - } - - // read next batch of a, b - if(BlockMatrixStrideA != 0) - { - //#pragma unroll - for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) - { - for(index_t i = 0; i < a_thread_sub_mtx.NRow(); ++i) - { - for(index_t j = 0; j < a_thread_sub_mtx.NCol(); ++j) - { - p_a_thread[a_thread_mtx.Get1dIndex(i, - m_repeat * MPerThreadSubC + j)] = - p_a_block[a_block_mtx.Get1dIndex( - k_begin + i, m_repeat * MPerLevel1Cluster + j) + - (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA]; - } - } - } - } - - if(BlockMatrixStrideB != 0) - { - //#pragma unroll - for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat) - { - for(index_t i = 0; i < b_thread_sub_mtx.NRow(); ++i) - { - for(index_t j = 0; j < b_thread_sub_mtx.NCol(); ++j) - { - p_b_thread[b_thread_mtx.Get1dIndex(i, - n_repeat * NPerThreadSubC + j)] = - p_b_block[b_block_mtx.Get1dIndex( - k_begin + i, n_repeat * MPerLevel1Cluster + j) + - (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB]; - } - } - } - } - } - - // do last batch of gemm - for(index_t k = 0; k < a_thread_mtx.NRow(); ++k) - { -#if 0 - for(index_t i = 0; i < c_thread_mtx.NRow(); ++i) - { - for(index_t j = 0; j < c_thread_mtx.NCol(); ++j) - { - const index_t aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed - const index_t bindex = b_thread_mtx.Get1dIndex(k, j); - const index_t cindex = c_thread_mtx.Get1dIndex(i, j) + - (BatchPerThread - 1) * ThreadMatrixStrideC; - - f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); - } - } -#elif 1 - static_assert(c_thread_mtx.NRow() == 16 && c_thread_mtx.NCol() == 4, - "asm is only for 16x4"); - - const index_t bindex = b_thread_mtx.Get1dIndex(k, 0); - for(index_t i = 0; i < c_thread_mtx.NRow(); ++i) - { - const index_t aindex = 
a_thread_mtx.Get1dIndex(k, i); // A is transposed - const index_t cindex = - c_thread_mtx.Get1dIndex(i, 0) + (BatchPerThread - 1) * ThreadMatrixStrideC; - - asm volatile("\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : "=v"(p_c_thread[cindex + 0]), - "=v"(p_c_thread[cindex + 1]), - "=v"(p_c_thread[cindex + 2]), - "=v"(p_c_thread[cindex + 3]) - : "v"(p_a_thread[aindex]), - "v"(p_b_thread[bindex + 0]), - "v"(p_b_thread[bindex + 1]), - "v"(p_b_thread[bindex + 2]), - "v"(p_b_thread[bindex + 3]), - "0"(p_c_thread[cindex + 0]), - "1"(p_c_thread[cindex + 1]), - "2"(p_c_thread[cindex + 2]), - "3"(p_c_thread[cindex + 3])); - } -#endif - } - } - } - template __device__ void CopyThreadMatrixCToBlockMatrixC(const FloatC* __restrict__ p_c_thread, FloatC* __restrict__ p_c_block) const diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 3e9c57d15f..fee5b704f3 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -3,215 +3,6 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; -template -struct BlockwiseGemmBlockABlockBThreadC -{ - index_t mMyThreadOffsetA = 0; - index_t mMyThreadOffsetB = 0; - - struct MatrixIndex - { - index_t row; - index_t col; - }; - - __device__ BlockwiseGemmBlockABlockBThreadC() - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - - const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - mMyThreadOffsetA = (!TransA) ? a_block_mtx.Get1dIndex(c_thread_mtx_index.row, 0) - : a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row); - - mMyThreadOffsetB = (!TransB) ? 
b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col) - : b_block_mtx.Get1dIndex(c_thread_mtx_index.col, 0); - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantMatrixDescriptor(BlockMatrixA{}, "a_block_mtx: "); - print_ConstantMatrixDescriptor(BlockMatrixB{}, "b_block_mtx: "); - print_ConstantMatrixDescriptor(ThreadMatrixC{}, "c_thread_mtx: "); - - printf("%u %u, %u %u %u, %u %u\n", - get_block_1d_id(), - get_thread_local_1d_id(), - c_thread_mtx_index.batch, - c_thread_mtx_index.row, - c_thread_mtx_index.col, - mMyThreadOffsetA, - mMyThreadOffsetB); - } -#endif - } - - __device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const - { - - if(TransA && (!TransB) && (!TransC)) - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - - static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(), - "wrong! k dimension not consistent!"); - - constexpr index_t MPerBlock = a_block_mtx.NCol(); - constexpr index_t NPerBlock = b_block_mtx.NCol(); - - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - // divide thread work - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - static_assert(MPerBlock % (MPerThread * MThreadPerCluster) == 0, - "MPerBlock % (MPerThread * MThreadPerCluster) != 0"); - - static_assert(NPerBlock % (NPerThread * NThreadPerCluster) == 0, - "NPerBlock % (NPerThread * NThreadPerCluster) != 0"); - - constexpr index_t MClusterWork = - (MPerBlock + MPerThread * MThreadPerCluster - 1) / (MPerThread * MThreadPerCluster); - - constexpr index_t NClusterWork = - (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster); - - static_assert(BlockSize == - (MClusterWork * MThreadPerCluster) * - (NClusterWork * NThreadPerCluster), - "wrong! 
wrong BlockSize"); - - if(DistributeThreadAlongColumnFirst) - { - const index_t cluster_work_block_id = - thread_id / (MThreadPerCluster * NThreadPerCluster); - - const index_t thread_work_cluster_id = - thread_id - cluster_work_block_id * (MThreadPerCluster * NThreadPerCluster); - - const index_t m_cluster_work_block_id = cluster_work_block_id / NClusterWork; - const index_t n_cluster_work_block_id = - cluster_work_block_id - m_cluster_work_block_id * NClusterWork; - - const index_t m_thread_work_cluster_id = thread_work_cluster_id / NThreadPerCluster; - const index_t n_thread_work_cluster_id = - thread_work_cluster_id - m_thread_work_cluster_id * NThreadPerCluster; - -#if 0 - if(get_block_1d_id() == 0) - { - printf("%u %u, \t" - "MClusterWork %u MThreadPerCluster %u NClusterWork %u NThreadPerCluster %u \t" - "m_cluster_work_block_id %u n_cluster_work_block_id %u \t" - "m_thread_work_cluster_id %u n_thread_work_cluster_id %u \t" - "\n", - get_block_1d_id(), get_thread_local_1d_id(), - MClusterWork, MThreadPerCluster, NClusterWork, NThreadPerCluster, - m_cluster_work_block_id, n_cluster_work_block_id, - m_thread_work_cluster_id, n_thread_work_cluster_id); - } -#endif - - return MatrixIndex{m_cluster_work_block_id * (MThreadPerCluster * MPerThread) + - m_thread_work_cluster_id * MPerThread, - n_cluster_work_block_id * (NThreadPerCluster * NPerThread) + - n_thread_work_cluster_id * NPerThread}; - } - else - { - // not implemented - assert(false); - } - } - else - { - // not implemented - assert(false); - } - } - - // this should be optimized away if input is known - __device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(index_t m_in_c, - index_t n_in_c) - { - return MatrixIndex{m_in_c, n_in_c}; - } - - template - __device__ void Run(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread, - Accumulator f_accum) const - { - if(TransA && (!TransB) && (!TransC)) - { - constexpr auto True = 
integral_constant{}; - constexpr auto False = integral_constant{}; - - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - constexpr index_t KPerBlock = a_block_mtx.NRow(); // A is transposed - - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - // a is transposed, b is not - constexpr auto a_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; - FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - - // loop over k - for(index_t k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) - { - threadwise_matrix_copy(a_block_mtx, - p_a_block + mMyThreadOffsetA + - k_begin * a_block_mtx.RowStride(), - a_thread_mtx, - p_a_thread, - a_thread_mtx.GetLengths()); - - threadwise_matrix_copy(b_block_mtx, - p_b_block + mMyThreadOffsetB + - k_begin * b_block_mtx.RowStride(), - b_thread_mtx, - p_b_thread, - b_thread_mtx.GetLengths()); - - threadwise_gemm(a_thread_mtx, - True, - p_a_thread, - b_thread_mtx, - False, - p_b_thread, - c_thread_mtx, - False, - p_c_thread, - f_accum); - } - } - } -}; - // if following number are power of 2, index calculation shall be greatly reduced: // MPerThreadSubC, NPerThreadSubC, MLevel0Cluster, NLevel0Cluster, MLevel1Cluster, NLevel1Cluster template - __device__ void Run_v2(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread, - Accumulator f_accum) const + __device__ void Run_PipelineReadAndCompute(const FloatA* __restrict__ p_a_block, + const FloatB* __restrict__ p_b_block, + FloatC* __restrict__ p_c_thread, + Accumulator f_accum) const { constexpr auto True = integral_constant{}; constexpr auto False = integral_constant{}; From fb723b967f2adc8270b5920732c87227c7993ac7 Mon Sep 17 
00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 19:37:02 -0500 Subject: [PATCH 2/7] puting gridwise convolution into its own class [ROCm/composable_kernel commit: 0b41ca2d9e20ad41dc522942aaffb2ebc8ef9b92] --- ...licit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 12 +++++++----- ...on_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp} | 18 +++++++----------- .../gridwise_convolution_wrapper.hip.hpp | 10 ++++++++++ 3 files changed, 24 insertions(+), 16 deletions(-) rename src/include/{gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp => gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp} (95%) create mode 100644 src/include/gridwise_convolution_wrapper.hip.hpp diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index a3489bc8cc..3aae266e4c 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -1,8 +1,9 @@ #pragma once #include #include "device.hpp" -#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp" -#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" +#include "gridwise_convolution_wrapper.hip.hpp" +#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp" +//#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" template void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, @@ -272,7 +273,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, { constexpr auto gridwise_conv = #if 1 - gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn + GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #else gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer #endif @@ -301,11 +302,12 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1, InBlockCopyDataPerRead, - WeiBlockCopyDataPerRead>(); + 
WeiBlockCopyDataPerRead>{}; - float time = launch_kernel(gridwise_conv.Run, + float time = launch_kernel(run_gridwise_convolution, dim3(GridSize), dim3(BlockSize), + gridwise_conv, static_cast(in_chwn_device_buf.GetDeviceBuffer()), static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), static_cast(out_khwn_device_buf.GetDeviceBuffer())); diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp similarity index 95% rename from src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp rename to src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index da689bc6b9..5f0d353465 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -34,10 +34,11 @@ template -class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn +struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn { - public: - __host__ __device__ static index_t GetSharedMemorySize() + __host__ __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn() {} + + __host__ __device__ constexpr index_t GetSharedMemoryUsage() const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -46,7 +47,6 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn constexpr auto in_chwn_global_desc = InGlobalDesc{}; constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; - constexpr auto out_khwn_global_desc = OutGlobalDesc{}; constexpr index_t Hi = in_chwn_global_desc.GetLength(I1); constexpr index_t Wi = in_chwn_global_desc.GetLength(I2); @@ -64,10 +64,6 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - // tensor view of threadwise output in register - constexpr auto out_kb_thread_desc = - make_ConstantTensorDescriptor(Sequence{}); - constexpr 
index_t max_align = mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); @@ -81,9 +77,9 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn return (in_block_element_space + wei_block_element_space) * sizeof(Float); } - __global__ static void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) + __device__ void Run(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_convolution_wrapper.hip.hpp b/src/include/gridwise_convolution_wrapper.hip.hpp new file mode 100644 index 0000000000..e0abfda3b6 --- /dev/null +++ b/src/include/gridwise_convolution_wrapper.hip.hpp @@ -0,0 +1,10 @@ +#pragma once + +template +__global__ void run_gridwise_convolution(GridwiseConvolution, + const T* const __restrict__ p_in_global, + const T* const __restrict__ p_wei_global, + T* const __restrict__ p_out_global) +{ + GridwiseConvolution{}.Run(p_in_global, p_wei_global, p_out_global); +} From e423954e6e6c14271f7250a7c908c8b0e9c12442 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 20:18:01 -0500 Subject: [PATCH 3/7] puting gridwise convolution into its own class [ROCm/composable_kernel commit: 6290e0b080ce54eca53c871f252ee312f8435a62] --- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 4 +- src/include/blockwise_gemm.hip.hpp | 7 +- src/include/common.hip.hpp | 10 +- ...on_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp | 4 +- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 381 ------------------ src/include/threadwise_gemm.hip.hpp | 8 +- 6 files changed, 21 insertions(+), 393 deletions(-) delete mode 100644 src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp 
b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index 3aae266e4c..9cebb0200a 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "gridwise_convolution_wrapper.hip.hpp" #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp" -//#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" +#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" template void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, @@ -275,7 +275,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, #if 1 GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #else - gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer + GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #endif {}; constexpr auto False = integral_constant{}; @@ -711,6 +710,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 "67"(p_c_thread[63])); #endif } +#else + printf("asm only support on HIP backend\n"); + assert(false); +#endif } template diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index ca083ee640..fc0d7d8bc3 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -26,7 +26,7 @@ __host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b) return (a + b - 1) / b; } -namespace mod_conv { +namespace mod_conv { // namespace mod_conv template __host__ __device__ constexpr T max(T x, T y) { @@ -62,4 +62,10 @@ __host__ __device__ constexpr T min(T x, Ts... xs) return x < y ? 
x : y; } -} +}// namespace mod_conv + +#if DEVICE_BACKEND_HIP +// cast a pointer of LDS to its address +extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; +#endif + diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index 5f0d353465..a95363fa60 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -280,9 +280,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn for(index_t x = 0; x < X; ++x) { auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; -#if 0 +#if 1 blockwise_gemm.Run -#elif 1 +#elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 0 blockwise_gemm.Run_asm diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp deleted file mode 100644 index 488b0a0da7..0000000000 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ /dev/null @@ -1,381 +0,0 @@ -#pragma once -#include "common.hip.hpp" -#include "ConstantTensorDescriptor.hip.hpp" -#include "ConstantMatrixDescriptor.hip.hpp" -#include "blockwise_4d_tensor_op.hip.hpp" -#include "blockwise_2d_tensor_op.hip.hpp" -#include "threadwise_2d_tensor_op.hip.hpp" -#include "blockwise_gemm.hip.hpp" - -// define B = flatten(N, Hi, Wi) -template -class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer -{ - public: - __global__ static void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_chwn_global_desc = 
InGlobalDesc{}; - constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; - constexpr auto out_khwn_global_desc = OutGlobalDesc{}; - - constexpr index_t C = in_chwn_global_desc.GetLength(I0); - constexpr index_t Hi = in_chwn_global_desc.GetLength(I1); - constexpr index_t Wi = in_chwn_global_desc.GetLength(I2); - constexpr index_t N = in_chwn_global_desc.GetLength(I3); - - constexpr index_t K = out_khwn_global_desc.GetLength(I0); - constexpr index_t Ho = out_khwn_global_desc.GetLength(I1); - constexpr index_t Wo = out_khwn_global_desc.GetLength(I2); - - constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1); - constexpr index_t X = wei_cyxk_global_desc.GetLength(I2); - - constexpr index_t B = N * Hi * Wi; - constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1); - - // divide block work by 2d: [K, B] - constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; - constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock; - - const index_t k_block_work_id = get_block_1d_id() / BBlockWork; - const index_t b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork; - - const index_t k_block_data_begin = k_block_work_id * KPerBlock; - const index_t b_block_data_begin = b_block_work_id * BPerBlock; - - // flattend (2d) tensor view of gridwise input - constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{}); - constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); - - // tensor view of blockwise input and weight - // be careful of alignment - constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); - - constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); - - constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); - - // tensor view of threadwise output in register - constexpr auto out_kb_thread_desc = - make_ConstantTensorDescriptor(Sequence{}); - -#if 0 - 
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); - print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); - print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); - - print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); - print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc"); - - print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); - print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); - print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); - print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); - - printf("KPerBlock %u\n", KPerBlock); - } -#endif - -// blockwise in copy -// formmat is [CPerBlock,BPerBlock + BGhostRead] -#if 0 - const auto blockwise_in_copy = - Blockwise2dTensorCopy1{}; -#elif 0 - const auto blockwise_in_copy = - Blockwise2dTensorCopy2{}; -#elif 1 - const auto blockwise_in_copy = - Blockwise2dTensorCopy3{}; -#endif - -// blockwise wei copy -// format is [CPerBlock*Y*X,KPerBlock] -#if 0 - const auto blockwise_wei_copy = - Blockwise2dTensorCopy1{}; -#elif 0 - const auto blockwise_wei_copy = - Blockwise2dTensorCopy2{}; -#elif 1 - const auto blockwise_wei_copy = - Blockwise2dTensorCopy3{}; -#endif - - // a series of blockwise GEMM - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx and b_mtx saved in LDS, c_mtx saved in register - // a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K] - // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] - // c_mtx[K,B] is out_block[K,B] - constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto c_kxb_thread_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, Number{}); - -#if 0 - const auto 
blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; -#else - const auto blockwise_gemm = - BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2{}; -#endif - - // LDS: be careful of alignment - constexpr index_t in_block_element_size = - in_cb_block_desc.GetElementSpace(Number{}); - - constexpr index_t wei_block_element_size = - wei_cyxk_block_desc.GetElementSpace(Number{}); - - constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead - ? InBlockCopyDataPerRead - : WeiBlockCopyDataPerRead; - - // LDS double buffer - __shared__ Float - p_in_block_0[max_align * ((in_block_element_size + max_align - 1) / max_align)]; - __shared__ Float - p_wei_block_0[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; - - __shared__ Float - p_in_block_1[max_align * ((in_block_element_size + max_align - 1) / max_align)]; - __shared__ Float - p_wei_block_1[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; - - const Float* p_in_global_block_offset = - p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); - - const Float* p_wei_global_block_offset = - p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); - - // preload data into LDS - blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_0); - blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_0); - - p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); - p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); - - // register - Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; - - // set threadwise output tensor to 0 - threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); - - bool even_loop = true; - - for(index_t c_block_data_begin = 0; c_block_data_begin + CPerBlock < C; - c_block_data_begin += CPerBlock, - p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), - p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0), - 
even_loop = !even_loop) - { - Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; - Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; - - Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0; - Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0; - - __syncthreads(); - -// load next data -#if 0 - blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next); - blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next); -#elif 1 - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); -#endif - - // compute on current data - // a series of GEMM - for(index_t y = 0; y < Y; ++y) - { - for(index_t x = 0; x < X; ++x) - { - auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; -#if 1 - blockwise_gemm.Run -#else - blockwise_gemm.Run_RegisterDoubleBuffer -#endif - (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_now + y * Wi + x, - p_out_thread, - f_accum); - } - } - -#if 1 - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); -#endif - } - - // last computation - { - Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; - Float* p_wei_block_now = even_loop ? 
p_wei_block_0 : p_wei_block_1; - - __syncthreads(); - - for(index_t y = 0; y < Y; ++y) - { - for(index_t x = 0; x < X; ++x) - { - auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; -#if 1 - blockwise_gemm.Run -#else - blockwise_gemm.Run_RegisterDoubleBuffer -#endif - (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_now + y * Wi + x, - p_out_thread, - f_accum); - } - } - } - - // output: register to global mem, - const auto c_thread_mtx_begin = - blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; - const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; - -#if 0 - if(get_block_1d_id() == 0) - { - printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n", - get_block_1d_id(), - get_thread_local_1d_id(), - matrix_c_index.row, - matrix_c_index.col, - k_data_begin, - b_data_begin, - p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]); - } -#endif - - for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) - { - for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) - { - const auto c_thread_mtx_distance = - blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b); - - index_t k_data = k_thread_data_begin + c_thread_mtx_distance.row; - index_t b_data = b_thread_data_begin + c_thread_mtx_distance.col; - - index_t h_data = b_data / (Wi * N); - index_t itmp = b_data - h_data * (Wi * N); - index_t w_data = itmp / N; - index_t n_data = itmp - w_data * N; - - if(n_data < N && h_data < Ho && w_data < Wo) - { - p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] = - p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; - } - } - } - } -}; diff --git a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp index d1c7e830d0..c81fd82cd3 100644 --- a/src/include/threadwise_gemm.hip.hpp +++ b/src/include/threadwise_gemm.hip.hpp @@ -10,7 
+10,7 @@ __device__ void threadwise_matrix_copy(SrcMatrix, constexpr auto src_mtx = SrcMatrix{}; constexpr auto dst_mtx = DstMatrix{}; -#if 0 +#if 1 for(index_t i = 0; i < NRow; ++i) { for(index_t j = 0; j < NCol; ++j) @@ -78,9 +78,7 @@ __device__ void threadwise_gemm(MatrixA, const index_t bindex = b_mtx.Get1dIndex(k, j); const index_t cindex = c_mtx.Get1dIndex(i, j); -#if 0 - f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); -#elif 1 +#if DEVICE_BACKEND_HIP // this only does c += a * b asm volatile("\n \ v_mac_f32 %0, %1, %2 \n \ " @@ -88,6 +86,8 @@ __device__ void threadwise_gemm(MatrixA, : "v"(p_a_thread[aindex]), "v"(p_b_thread[bindex]), "0"(p_c_thread[cindex])); +#else // this does general accumulation defined by f_accum + f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); #endif } } From 0b2a76553db5b256c24dae21cd4c25dc172b4eac Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 20:19:24 -0500 Subject: [PATCH 4/7] add a missing file [ROCm/composable_kernel commit: c23474104508d5b08b2548365751993701a2ec73] --- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 382 ++++++++++++++++++ 1 file changed, 382 insertions(+) create mode 100644 src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp new file mode 100644 index 0000000000..a731964bc6 --- /dev/null +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -0,0 +1,382 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMatrixDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" +#include "threadwise_2d_tensor_op.hip.hpp" +#include "blockwise_gemm.hip.hpp" + +// define B = flatten(N, Hi, 
Wi) +template +struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer +{ + __host__ __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer() {} + + __device__ void Run(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_chwn_global_desc = InGlobalDesc{}; + constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; + constexpr auto out_khwn_global_desc = OutGlobalDesc{}; + + constexpr index_t C = in_chwn_global_desc.GetLength(I0); + constexpr index_t Hi = in_chwn_global_desc.GetLength(I1); + constexpr index_t Wi = in_chwn_global_desc.GetLength(I2); + constexpr index_t N = in_chwn_global_desc.GetLength(I3); + + constexpr index_t K = out_khwn_global_desc.GetLength(I0); + constexpr index_t Ho = out_khwn_global_desc.GetLength(I1); + constexpr index_t Wo = out_khwn_global_desc.GetLength(I2); + + constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1); + constexpr index_t X = wei_cyxk_global_desc.GetLength(I2); + + constexpr index_t B = N * Hi * Wi; + constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1); + + // divide block work by 2d: [K, B] + constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; + constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock; + + const index_t k_block_work_id = get_block_1d_id() / BBlockWork; + const index_t b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork; + + const index_t k_block_data_begin = k_block_work_id * KPerBlock; + const index_t b_block_data_begin = b_block_work_id * BPerBlock; + + // flattend (2d) tensor view of gridwise input + constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + // 
tensor view of blockwise input and weight + // be careful of alignment + constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + // tensor view of threadwise output in register + constexpr auto out_kb_thread_desc = + make_ConstantTensorDescriptor(Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); + print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); + print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); + + print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); + print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc"); + + print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); + print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); + print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); + print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); + + printf("KPerBlock %u\n", KPerBlock); + } +#endif + +// blockwise in copy +// formmat is [CPerBlock,BPerBlock + BGhostRead] +#if 0 + const auto blockwise_in_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_in_copy = + Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_in_copy = + Blockwise2dTensorCopy3{}; +#endif + +// blockwise wei copy +// format is [CPerBlock*Y*X,KPerBlock] +#if 0 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy3{}; +#endif + + // a series of blockwise GEMM + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx and 
b_mtx saved in LDS, c_mtx saved in register + // a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K] + // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] + // c_mtx[K,B] is out_block[K,B] + constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + constexpr auto c_kxb_thread_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, Number{}); + +#if 0 + const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; +#else + const auto blockwise_gemm = + BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2{}; +#endif + + // LDS: be careful of alignment + constexpr index_t in_block_element_size = + in_cb_block_desc.GetElementSpace(Number{}); + + constexpr index_t wei_block_element_size = + wei_cyxk_block_desc.GetElementSpace(Number{}); + + constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + // LDS double buffer + __shared__ Float + p_in_block_0[max_align * ((in_block_element_size + max_align - 1) / max_align)]; + __shared__ Float + p_wei_block_0[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; + + __shared__ Float + p_in_block_1[max_align * ((in_block_element_size + max_align - 1) / max_align)]; + __shared__ Float + p_wei_block_1[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; + + const Float* p_in_global_block_offset = + p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + + const Float* p_wei_global_block_offset = + p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + + // preload data into LDS + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_0); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_0); + + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); + p_wei_global_block_offset += CPerBlock * 
wei_cyxk_global_desc.GetStride(I0); + + // register + Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; + + // set threadwise output tensor to 0 + threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); + + bool even_loop = true; + + for(index_t c_block_data_begin = 0; c_block_data_begin + CPerBlock < C; + c_block_data_begin += CPerBlock, + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), + p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0), + even_loop = !even_loop) + { + Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; + Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; + + Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0; + Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0; + + __syncthreads(); + +// load next data +#if 0 + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next); +#elif 1 + Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; + Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + + blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, + p_in_register_clipboard); + + blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, + p_wei_register_clipboard); +#endif + + // compute on current data + // a series of GEMM + for(index_t y = 0; y < Y; ++y) + { + for(index_t x = 0; x < X; ++x) + { + auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; +#if 1 + blockwise_gemm.Run +#else + blockwise_gemm.Run_RegisterDoubleBuffer +#endif + (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_now + y * Wi + x, + p_out_thread, + f_accum); + } + } + +#if 1 + blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); + blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, + p_wei_block_next); +#endif + } 
+ + // last computation + { + Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; + Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; + + __syncthreads(); + + for(index_t y = 0; y < Y; ++y) + { + for(index_t x = 0; x < X; ++x) + { + auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; +#if 1 + blockwise_gemm.Run +#else + blockwise_gemm.Run_RegisterDoubleBuffer +#endif + (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_now + y * Wi + x, + p_out_thread, + f_accum); + } + } + } + + // output: register to global mem, + const auto c_thread_mtx_begin = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; + const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; + +#if 0 + if(get_block_1d_id() == 0) + { + printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n", + get_block_1d_id(), + get_thread_local_1d_id(), + matrix_c_index.row, + matrix_c_index.col, + k_data_begin, + b_data_begin, + p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]); + } +#endif + + for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) + { + for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) + { + const auto c_thread_mtx_distance = + blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b); + + index_t k_data = k_thread_data_begin + c_thread_mtx_distance.row; + index_t b_data = b_thread_data_begin + c_thread_mtx_distance.col; + + index_t h_data = b_data / (Wi * N); + index_t itmp = b_data - h_data * (Wi * N); + index_t w_data = itmp / N; + index_t n_data = itmp - w_data * N; + + if(n_data < N && h_data < Ho && w_data < Wo) + { + p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] = + p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; + } + } + } + } +}; From 9d61f2597a855a4f94a3752bd309ca6929a5004c Mon Sep 17 00:00:00 2001 From: Chao 
Liu Date: Tue, 2 Apr 2019 20:26:58 -0500 Subject: [PATCH 5/7] add cuda extract_asm script [ROCm/composable_kernel commit: e6c86f81b54a1aef7c80108549496281a6d56a8d] --- script/cmake-cuda.sh | 9 +++++---- script/extract_asm-cuda.sh | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) create mode 100755 script/extract_asm-cuda.sh diff --git a/script/cmake-cuda.sh b/script/cmake-cuda.sh index 0110075c0d..0e7d00c469 100755 --- a/script/cmake-cuda.sh +++ b/script/cmake-cuda.sh @@ -15,11 +15,12 @@ cmake -D DEVICE_BACKEND=CUDA \ -D BOOST_ROOT="/package/install/boost_1.67.0" \ -D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \ --D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \ +-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \ ${MY_PROJECT_SOURCE} #-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \ -#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \ -#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \ -#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61 -Xptxas -v -maxrregcount=128" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp 
-lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70 -Xptxas -v -maxrregcount=128" \ diff --git a/script/extract_asm-cuda.sh b/script/extract_asm-cuda.sh new file mode 100755 index 0000000000..84a0af76dd --- /dev/null +++ b/script/extract_asm-cuda.sh @@ -0,0 +1 @@ +cuobjdump -xelf all ./driver/driver && nvdisasm --print-code -g driver.sm_61.cubin > driver.sm_61.asm && nvdisasm --print-code -g driver.sm_70.cubin > driver.sm_70.asm From e277457dceec95938d394eb3b0a571210c238ef5 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 20:30:00 -0500 Subject: [PATCH 6/7] tidy up [ROCm/composable_kernel commit: e2313c9eca16d1ef503ea4e3ae1ebc437dca2e9f] --- src/include/common.hip.hpp | 3 +-- ...e_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp | 8 ++++---- ...icit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 9 ++++++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index fc0d7d8bc3..6b8c450021 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -62,10 +62,9 @@ __host__ __device__ constexpr T min(T x, Ts... xs) return x < y ? 
x : y; } -}// namespace mod_conv +} // namespace mod_conv #if DEVICE_BACKEND_HIP // cast a pointer of LDS to its address extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; #endif - diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index a95363fa60..e04b1752ea 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -287,10 +287,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #elif 0 blockwise_gemm.Run_asm #endif - (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block + y * Wi + x, - p_out_thread, - f_accum); + (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block + y * Wi + x, + p_out_thread, + f_accum); } } } diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index a731964bc6..b68b73a6ad 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -36,11 +36,14 @@ template struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { - __host__ __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer() {} + __host__ + __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer() + { + } __device__ void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) const + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; From 
770c141f35ee9f10f827058503c58d6aed3067f0 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 3 Apr 2019 10:36:18 -0500 Subject: [PATCH 7/7] add script to extract asm on hip [ROCm/composable_kernel commit: 6166233e05b821e55bcca2bba2d9adad16347f82] --- script/compile-hip.sh | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 script/compile-hip.sh diff --git a/script/compile-hip.sh b/script/compile-hip.sh new file mode 100755 index 0000000000..4c91e8a293 --- /dev/null +++ b/script/compile-hip.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export KMDUMPISA=1 +export KMDUMPLLVM=1 + +make -j driver +/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.isa