diff --git a/driver/conv.cu b/driver/conv.cu index 3938718d90..695dd6d37c 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -404,7 +404,7 @@ int main() #elif 1 device_implicit_gemm_convolution_nchw_kcsr( in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device); -#elif 0 +#elif 1 device_implicit_gemm_convolution_nchw_srck( in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device); #elif 0 diff --git a/driver/device_implicit_gemm_convolution_nchw_kcsr.cuh b/driver/device_implicit_gemm_convolution_nchw_kcsr.cuh index 080ffeefb1..0b1f5b209d 100644 --- a/driver/device_implicit_gemm_convolution_nchw_kcsr.cuh +++ b/driver/device_implicit_gemm_convolution_nchw_kcsr.cuh @@ -38,20 +38,20 @@ void device_implicit_gemm_convolution_nchw_kcsr( constexpr unsigned WoPerThread = 2; constexpr unsigned BlockSize = 16; -#elif 0 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; - constexpr unsigned HoPerBlock = 2; +#elif 1 + constexpr unsigned NPerBlock = 1; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 2; + constexpr unsigned HoPerBlock = 4; constexpr unsigned WoPerBlock = 32; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 constexpr unsigned NPerBlock = 1; constexpr unsigned KPerBlock = 64; constexpr unsigned CPerBlock = 4; diff --git a/driver/device_implicit_gemm_convolution_nchw_srck.cuh b/driver/device_implicit_gemm_convolution_nchw_srck.cuh index adb41a8c23..5f2e41e528 100644 --- a/driver/device_implicit_gemm_convolution_nchw_srck.cuh +++ b/driver/device_implicit_gemm_convolution_nchw_srck.cuh @@ -63,32 +63,32 @@ void device_implicit_gemm_convolution_nchw_srck(InDesc, constexpr unsigned WoPerThread = 2; constexpr unsigned BlockSize = 16; -#elif 1 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; - constexpr unsigned HoPerBlock = 2; +#elif 0 + constexpr unsigned NPerBlock = 1; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 2; + constexpr unsigned HoPerBlock = 4; constexpr unsigned WoPerBlock = 32; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 1; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 4; + + constexpr unsigned BlockSize = 128; +#elif 1 + constexpr unsigned NPerBlock = 1; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 2; + constexpr unsigned HoPerBlock = 4; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; constexpr unsigned BlockSize = 128; -#elif 0 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 64; - constexpr unsigned CPerBlock = 4; - constexpr unsigned HoPerBlock = 2; - constexpr unsigned WoPerBlock = 32; - - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; - constexpr unsigned HoPerThread = 2; - constexpr unsigned WoPerThread = 2; - - constexpr unsigned BlockSize = 256; #endif constexpr unsigned GridSize = diff --git a/src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh b/src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh index 03a9c4ae43..34a04ff0b4 100644 --- a/src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh @@ -185,7 +185,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc, in_nchw_block_desc.GetLengths()); #endif -#if 0 +#if 1 // weight: global mem to LDS, // convert [K,C,S,R] to [S,R,C,K] blockwise_4d_tensor_copy_reorder_by_get_dst_from_src( @@ -238,7 +238,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc, const unsigned k_thread_data_begin = matrix_c_index.row_begin; const unsigned wo_thread_data_begin = matrix_c_index.col_begin / NPerThread; -#if 0 +#if 1 // output: register to global mem, // convert out_thread[Ho,K,Wo,N] to out_global[N,K,Ho,Wo] constexpr auto reorder_nkhw_from_hkwn = Sequence<3, 1, 0, 2>{};