From 1de6fd07535833877019634a95eafd329406be4c Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 24 Jan 2019 21:20:29 -0600 Subject: [PATCH] fixed a bug, and refactored --- driver/conv.cu | 10 +-- ..._implicit_gemm_convolution_1_nchw_kcsr.cuh | 75 +++++++++++-------- ...icit_gemm_convolution_1_nchw_srck_nkhw.cuh | 15 +--- ..._implicit_gemm_convolution_1_nchw_kcsr.cuh | 2 +- ...icit_gemm_convolution_1_nchw_srck_nkhw.cuh | 6 +- 5 files changed, 54 insertions(+), 54 deletions(-) diff --git a/driver/conv.cu b/driver/conv.cu index 24538059d9..82d711e447 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -8,7 +8,7 @@ #include "conv_common.cuh" #include "device_direct_convolution_1.cuh" #include "device_direct_convolution_2.cuh" -//#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh" +#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh" #include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" #include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" @@ -361,7 +361,7 @@ int main() constexpr unsigned K = 1; constexpr unsigned S = 3; constexpr unsigned R = 3; -#elif 0 +#elif 1 // 3x3, 34x34 constexpr unsigned N = 64; constexpr unsigned C = 256; @@ -370,7 +370,7 @@ int main() constexpr unsigned K = 64; constexpr unsigned S = 3; constexpr unsigned R = 3; -#elif 1 +#elif 0 // 3x3, 54x54 constexpr unsigned N = 64; constexpr unsigned C = 64; @@ -388,7 +388,7 @@ int main() constexpr unsigned K = 64; constexpr unsigned S = 3; constexpr unsigned R = 3; -#elif 1 +#elif 0 // 3x3, 58x58 constexpr unsigned N = 64; constexpr unsigned C = 64; @@ -449,7 +449,7 @@ int main() device_direct_convolution_2 #elif 0 device_implicit_gemm_convolution_1_nchw_kcsr -#elif 0 +#elif 1 device_implicit_gemm_convolution_1_nchw_srck_nkhw #elif 1 device_implicit_gemm_convolution_1_chwn_csrk_khwn diff --git a/driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh b/driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh index a1e311199b..af4460335b 100644 --- a/driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh +++ b/driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh @@ -1,9 +1,15 @@ #pragma once #include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh" +#include template -void device_implicit_gemm_convolution_1_nchw_kcsr( - InDesc, const Tensor& in, WeiDesc, const Tensor& wei, OutDesc, Tensor& out) +void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc, + const Tensor& in, + WeiDesc, + const Tensor& wei, + OutDesc, + Tensor& out, + unsigned nrepeat) { std::size_t data_sz = sizeof(T); DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); @@ -75,40 +81,45 @@ void device_implicit_gemm_convolution_1_nchw_kcsr( printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); - cudaEvent_t start, stop; - float elapsedTime; + for(unsigned i = 0; i < nrepeat; ++i) + { + cudaEvent_t start, stop; + float elapsedTime; - cudaEventCreate(&start); - cudaEventRecord(start, 0); + cudaEventCreate(&start); + cudaEventRecord(start, 0); - gridwise_implicit_gemm_convolution_1_nchw_kcsr - <<>>(InDesc{}, - static_cast(in_device_buf.GetDeviceBuffer()), - WeiDesc{}, - static_cast(wei_device_buf.GetDeviceBuffer()), - OutDesc{}, - static_cast(out_device_buf.GetDeviceBuffer())); + gridwise_implicit_gemm_convolution_1_nchw_kcsr + <<>>(InDesc{}, + static_cast(in_device_buf.GetDeviceBuffer()), + WeiDesc{}, + static_cast(wei_device_buf.GetDeviceBuffer()), + OutDesc{}, + static_cast(out_device_buf.GetDeviceBuffer())); - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + cudaEventCreate(&stop); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Elapsed time : %f ms\n", elapsedTime); + + usleep(10000); + } checkCudaErrors(cudaGetLastError()); out_device_buf.FromDevice(out.mData.data()); diff --git a/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh b/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh index 050176416f..f948f5205e 100644 --- a/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh +++ b/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh @@ -65,20 +65,8 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, constexpr unsigned WoPerThread = 2; constexpr unsigned BlockSize = 16; -#elif 0 - constexpr unsigned NPerBlock = 1; - constexpr unsigned KPerBlock = 64; - constexpr unsigned CPerBlock = 2; - constexpr unsigned HoPerBlock = 4; - constexpr unsigned WoPerBlock = 32; - - constexpr unsigned KPerThread = 8; - constexpr unsigned CPerThread = 1; - constexpr unsigned HoPerThread = 2; - constexpr unsigned WoPerThread = 4; - - constexpr unsigned BlockSize = 128; #elif 1 + // for 3x3, 34x34 constexpr unsigned NPerBlock = 1; constexpr unsigned KPerBlock = 64; constexpr unsigned CPerBlock = 2; @@ -92,6 +80,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, constexpr unsigned BlockSize = 128; #elif 0 + // for 3x3, 34x34 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 64; constexpr unsigned CPerBlock = 2; diff --git a/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh b/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh index 94351922ad..27be3e1b8b 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh @@ -104,7 +104,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc, const unsigned n_block_data_begin = n_block_work_id * NPerBlock; const unsigned k_block_data_begin = k_block_work_id * KPerBlock; const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; - const unsigned wo_block_data_begin = w_block_work_id * HoPerBlock; + const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock; const unsigned hi_block_data_begin = ho_block_data_begin; const unsigned wi_block_data_begin = wo_block_data_begin; diff --git a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh index 3e4609d98e..e0a416ebf2 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh @@ -70,7 +70,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, const unsigned n_block_data_begin = n_block_work_id * NPerBlock; const unsigned k_block_data_begin = k_block_work_id * KPerBlock; const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; - const unsigned wo_block_data_begin = w_block_work_id * HoPerBlock; + const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock; const unsigned hi_block_data_begin = ho_block_data_begin; const unsigned wi_block_data_begin = wo_block_data_begin; @@ -162,7 +162,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1); c_block_data_begin += CPerBlock, __syncthreads()) { -#if 0 +#if 1 // input: global mem to LDS, // convert [N,C,Hi,Wi] to [C,Hi,Wi,N] blockwise_4d_tensor_copy_reorder_by_get_dst_from_src( @@ -177,7 +177,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, reorder_chwn_from_nchw); #endif -#if 0 +#if 1 // weight: global mem to LDS, // format is [S,R,C,K], no conversion needed blockwise_wei_copy.run(p_wei_global + wei_srck_global_desc.Get1dIndex(