diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp index 9c816bf21d..2150360a32 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -126,37 +126,46 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer // blockwise copy // input: format is [C, Hi, Wi, N] auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v1, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - InBlockCopyDataPerAccess_N, - InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, - {0, 0, 0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v1 +#else + BlockwiseGenericTensorSliceCopy_v2 +#endif + , + Sequence<0, 1, 2, 3>, + Sequence<0, 1, 2, 3>, + 3, + 3, + InBlockCopyDataPerAccess_N, + InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, X * KPerBlock] const auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1, - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v1 +#else + BlockwiseGenericTensorSliceCopy_v2 +#endif + , + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp index 3985bbf3a7..a3d6522b3b 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp @@ -128,7 +128,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded // blockwise copy // input: format is [C, Hi, Wi, N] auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v1{}, Number<1>{}) .Fold(I0, Number<1>{}, Number{}); -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, - "a: out_k_h_w_n_thread_desc"); - print_ConstantTensorDescriptor(out_10d_thread_desc, "a: out_10d_thread_desc"); - - print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, - "a: out_k_h_w_n_global_desc"); - print_ConstantTensorDescriptor(out_10d_global_desc, "a: out_10d_global_desc"); - } -#endif - Float* p_out_thread_on_global = p_out_global + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( k_block_data_begin + k_thread_data_begin, @@ -369,19 +342,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded .Fold(I2, Number{}, Number<1>{}, Number{}) .Fold(I0, Number<1>{}, Number{}); -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, - "b: out_k_h_w_n_thread_desc"); - print_ConstantTensorDescriptor(out_10d_thread_desc, "b: out_10d_thread_desc"); - - print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, - "b: out_k_h_w_n_global_desc"); - print_ConstantTensorDescriptor(out_10d_global_desc, "b: out_10d_global_desc"); - } -#endif - Float* p_out_thread_on_global = p_out_global + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( k_block_data_begin + k_thread_data_begin, diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 3d67486a0f..a5ea753dd0 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -156,42 +156,28 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not satisfied"); -#if 0 // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v1( - {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v1 #else - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2( - {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + BlockwiseGenericTensorSliceCopy_v2 #endif + ({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy @@ -204,42 +190,28 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Sequence{}, Number{}); -#if 0 // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1( - {0, k_block_data_on_global}, {0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v1 #else - auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2( - {0, k_block_data_on_global}, {0, 0}); + BlockwiseGenericTensorSliceCopy_v2 #endif + ({0, k_block_data_on_global}, {0, 0}); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx @@ -430,7 +402,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, b_thread_data_on_global, 0); -#if 1 +#if 0 ThreadwiseGenericTensorSliceCopy_v1r2< decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc), diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 67d36fb79d..b1068d2a5e 100644 --- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -112,38 +112,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerAccess_K = 4; constexpr index_t OutThreadCopyDataPerAccess_N = 2; -#elif 0 - // for 3x3, 34x34, v1r2, Pascal, in-block-copy1 - constexpr index_t BlockSize = 128; - - constexpr index_t NPerBlock = 4; - constexpr index_t KPerBlock = 64; - constexpr index_t CPerBlock = 8; - constexpr index_t HoPerBlock = 4; - constexpr index_t WoPerBlock = 8; - - constexpr index_t NPerThread = 4; - constexpr index_t KPerThread = 8; - constexpr index_t HoPerThread = 1; - constexpr index_t WoPerThread = 2; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 2; - constexpr index_t GemmMLevel1Cluster = 2; - constexpr index_t GemmNLevel1Cluster = 2; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockCopyClusterLengths_CHWN = Sequence<0, 0, 0, 0>; // not used - constexpr index_t InBlockCopyDataPerAccess_N = 4; - - constexpr index_t WeiBlockCopyDataPerAccess_K = 4; - - constexpr index_t OutThreadCopyDataPerAccess_N = 2; -#elif 0 +#elif 1 // for 3x3, 34x34, v1r3, Pascal // for 3x3, 28x28, v1r3, Pascal // for 3x3, 14x14, v1r3, Pascal @@ -179,37 +148,6 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerAccess_K = 4; constexpr index_t OutThreadCopyDataPerAccess_N = 2; -#elif 0 - // for 3x3, 34x34, v1r3, Pascal, bad - constexpr index_t BlockSize = 128; - - constexpr index_t NPerBlock = 1; - constexpr index_t KPerBlock = 128; - constexpr index_t CPerBlock = 8; - constexpr index_t HoPerBlock = 2; - constexpr index_t WoPerBlock = 32; - - constexpr index_t NPerThread = 1; - constexpr index_t KPerThread = 8; - constexpr index_t HoPerThread = 1; - constexpr index_t WoPerThread = 8; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 2; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 2; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockCopyClusterLengths_CHWN = Sequence<2, 2, 32, 1>; - constexpr index_t InBlockCopyDataPerAccess_N = 1; - - constexpr index_t WeiBlockCopyDataPerAccess_K = 2; - - constexpr index_t OutThreadCopyDataPerAccess_N = 1; #elif 0 // for 3x3, 34x34, v1r1, Vega 20 constexpr index_t BlockSize = 256; diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp index f73be2ffa5..9649caea0a 100644 --- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp @@ -81,9 +81,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc, #if 1 // v1r3, 3x3, 32x32, 1x1 pad - constexpr index_t BlockSize = 128; + constexpr index_t BlockSize = 256; - constexpr index_t NPerBlock = 16; + constexpr index_t NPerBlock = 32; constexpr index_t KPerBlock = 128; constexpr index_t CPerBlock = 8; constexpr index_t HoPerBlock = 2; @@ -97,7 +97,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc, constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 2; + constexpr index_t GemmNLevel0Cluster = 4; constexpr index_t GemmMLevel1Cluster = 4; constexpr index_t GemmNLevel1Cluster = 2; constexpr index_t GemmKPerThreadLoop = 1; @@ -105,14 +105,14 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc, constexpr index_t GemmDataPerReadB = 4; using InBlockCopySubLengths_CHWN = Sequence<1, 1, 1, 4>; - using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 2, 4>; + using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 2, 8>; constexpr index_t InBlockCopyDataPerAccess_N = 4; - using WeiBlockCopySubLengths_CK = Sequence<2, 4>; - using WeiBlockCopyClusterLengths_CK = Sequence<4, 32>; + using WeiBlockCopySubLengths_CK = Sequence<1, 4>; + using WeiBlockCopyClusterLengths_CK = Sequence<8, 32>; constexpr index_t WeiBlockCopyDataPerAccess_K = 4; - constexpr index_t OutThreadCopyDataPerAccess_N = 2; + constexpr index_t OutThreadCopyDataPerAccess_N = 4; #endif constexpr index_t GridSize = diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 96ec3ff3bc..69a82c1268 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -91,43 +91,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; -#elif 1 - // each thread hold 64 data - constexpr index_t BlockSize = 256; - - constexpr index_t BPerBlock = 16; - constexpr index_t KPerBlock = 128; - constexpr index_t EPerBlock = 8; - - constexpr index_t GemmNRepeat = 2; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 4; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 4; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 2, 2>; - using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 8, 2>; - using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] - using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] - using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] - - constexpr index_t InBlockCopySrcDataPerRead_B = 2; - constexpr index_t InBlockCopyDstDataPerWrite_N2 = 2; - - using WeiBlockCopySubLengths_E_K = Sequence<2, 2>; - using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>; - using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] - using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] - using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] - - constexpr index_t WeiBlockCopySrcDataPerRead_E = 2; - constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2; #elif 0 // each thread hold 32 data constexpr index_t BlockSize = 256; diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 79b7da25f5..67d2bf7dcf 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -54,7 +54,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 0 +#if 1 constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 128; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index ea961d3564..5046fbdbd6 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -229,7 +229,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128;