diff --git a/driver/conv.cu b/driver/conv.cu index 999c03f03b..04ab123266 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -396,6 +396,9 @@ int main() constexpr unsigned K = 64; constexpr unsigned S = 3; constexpr unsigned R = 3; + + constexpr unsigned HPad = 0; + constexpr unsigned WPad = 0; #elif 0 // 3x3, 56x56 constexpr unsigned N = 64; @@ -586,7 +589,7 @@ int main() #endif (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(in_nchw_desc, in_nchw, wei_kcsr_desc, @@ -598,7 +601,7 @@ int main() nrepeat); #endif -#if 0 +#if 1 if(S == 3 && R == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh index 7d3435823d..157219601e 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh @@ -87,7 +87,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 8; -#elif 0 +#elif 1 // for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 64; @@ -162,7 +162,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // for 1x1, 28x28 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 128; diff --git a/src/include/gemm.cuh b/src/include/blockwise_gemm.cuh similarity index 88% rename from src/include/gemm.cuh rename to src/include/blockwise_gemm.cuh index 99ecc4b962..ded4cdd017 100644 --- a/src/include/gemm.cuh +++ b/src/include/blockwise_gemm.cuh @@ -1,76 +1,5 @@ #pragma once - -template -__device__ void -threadwise_matrix_copy(SrcMatrix, Float* const p_src, DstMatrix, Float* p_dst, Sequence) -{ - const auto src_mtx = SrcMatrix{}; // constexpr doesn't compile - const auto dst_mtx = DstMatrix{}; // constexpr doesn't compile - - for(unsigned i = 0; i < NRow; ++i) - { - for(unsigned j = 0; j < NCol; ++j) - { - const unsigned src_index = src_mtx.Get1dIndex(i, j); - const unsigned dst_index = dst_mtx.Get1dIndex(i, j); - - p_dst[dst_index] = p_src[src_index]; - } - } -} - -template -__device__ void threadwise_gemm(MatrixA, - Constant, - FloatA* const p_a_thread, - MatrixB, - Constant, - FloatB* const p_b_thread, - MatrixC, - Constant, - FloatC* p_c_thread, - Accumulator f_accum) -{ - if(TransA && (!TransB) && (!TransC)) - { - const auto a_mtx = MatrixA{}; // constexpr doesn't compile - const auto b_mtx = MatrixB{}; // constexpr doesn't compile - const auto c_mtx = MatrixC{}; // constexpr doesn't compile - - constexpr unsigned M = c_mtx.NRow(); - constexpr unsigned N = c_mtx.NCol(); - constexpr unsigned K = a_mtx.NRow(); // A is transposed - - for(unsigned i = 0; i < M; ++i) - { - for(unsigned j = 0; j < N; ++j) - { - for(unsigned k = 0; k < K; ++k) - { - const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed - const unsigned bindex = b_mtx.Get1dIndex(k, j); - const unsigned cindex = c_mtx.Get1dIndex(i, j); - - f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); - } - } - } - } - else - { - // not implemented - assert(false); - } -} +#include "threadwise_gemm.cuh" template {}; -#endif // a series of blockwise GEMM // c_mtx += transpose(a_mtx) * b_mtx @@ -180,21 +178,25 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc, // set threadwise output tensor to 0 threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); - for(unsigned c_block_data_begin = 0; c_block_data_begin < C; - c_block_data_begin += CPerBlock, __syncthreads()) + Float* p_in_global_block_offset = + p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + + Float* p_wei_global_block_offset = + p_wei_global + wei_srck_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), + p_wei_global_block_offset += CPerBlock * wei_srck_global_desc.GetStride(I2), + __syncthreads()) { #if 1 // input: global mem to LDS, - blockwise_in_copy.run( - p_in_global + in_cb_global_desc.Get1dIndex(c_block_data_begin, b_block_data_begin), - p_in_block); + blockwise_in_copy.run(p_in_global_block_offset, p_in_block); #endif #if 1 // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global + wei_srck_global_desc.Get1dIndex( - 0, 0, c_block_data_begin, k_block_data_begin), - p_wei_block); + blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block); #endif __syncthreads(); diff --git a/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh index f9acb31d8d..df7be26cb0 100644 --- a/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh @@ -5,7 +5,7 @@ #include "blockwise_4d_tensor_op.cuh" #include "blockwise_2d_tensor_op.cuh" #include "threadwise_2d_tensor_op.cuh" -#include "gemm.cuh" +#include "blockwise_gemm.cuh" // define B = N*Hi*Wi template +__device__ void +threadwise_matrix_copy(SrcMatrix, Float* const p_src, DstMatrix, Float* p_dst, Sequence) +{ + const auto src_mtx = SrcMatrix{}; // constexpr doesn't compile + const auto dst_mtx = DstMatrix{}; // constexpr doesn't compile + + for(unsigned i = 0; i < NRow; ++i) + { + for(unsigned j = 0; j < NCol; ++j) + { + const unsigned src_index = src_mtx.Get1dIndex(i, j); + const unsigned dst_index = dst_mtx.Get1dIndex(i, j); + + p_dst[dst_index] = p_src[src_index]; + } + } +} + +template +__device__ void threadwise_gemm(MatrixA, + Constant, + FloatA* const p_a_thread, + MatrixB, + Constant, + FloatB* const p_b_thread, + MatrixC, + Constant, + FloatC* p_c_thread, + Accumulator f_accum) +{ + if(TransA && (!TransB) && (!TransC)) + { + const auto a_mtx = MatrixA{}; // constexpr doesn't compile + const auto b_mtx = MatrixB{}; // constexpr doesn't compile + const auto c_mtx = MatrixC{}; // constexpr doesn't compile + + constexpr unsigned M = c_mtx.NRow(); + constexpr unsigned N = c_mtx.NCol(); + constexpr unsigned K = a_mtx.NRow(); // A is transposed + + for(unsigned i = 0; i < M; ++i) + { + for(unsigned j = 0; j < N; ++j) + { + for(unsigned k = 0; k < K; ++k) + { + const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed + const unsigned bindex = b_mtx.Get1dIndex(k, j); + const unsigned cindex = c_mtx.Get1dIndex(i, j); + + f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); + } + } + } + } + else + { + // not implemented + assert(false); + } +}