diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
index 1455e1797b..2d1cd532de 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
@@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock);
         constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock);
 
-        constexpr auto block_work_desc = make_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
             Sequence{});
 
         const auto block_work_multi_id =
@@ -109,8 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
             GemmDataPerReadB);
 
         constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence{},
-            Number{});
+            Sequence{}, Number{});
 
         // this check is ad-hoc
         // TODO: need to properly implement tensor descriptor with alignment
@@ -118,11 +117,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                       "GemmDataPerReadB alignment requirement is not meet");
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence{},
-            Number{});
+            Sequence{}, Number{});
 
         // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
             Sequence{});
 
         // blockwise copy
@@ -144,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                                decltype(wei_c_k_global_desc),
                                                decltype(wei_c_k_block_desc),
                                                decltype(wei_c_k_block_desc.GetLengths()),
-                                               WeiBlockCopyDataPerRead_K>{};
+                                               WeiBlockCopyDataPerRead_K>({0, 0}, {0, 0});
 
         // a series of blockwise batched GEMM
         // C_matrix += transpose(A_matrix) * B_matrix
@@ -186,22 +184,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                                           GemmDataPerReadA,
                                                           GemmDataPerReadB>{};
 
-        // choose GEMM implementation here
-        const auto run_blockwise_batch_gemm = [&](auto... Xs) {
-#if 0
-            return blockwise_batch_gemm.Run(Xs...);
-#elif 0
-            return blockwise_batch_gemm.Run_amd_asm(Xs...);
-#else
-            return blockwise_batch_gemm.Run_asm_v2(Xs...);
-#endif
-        };
-
-        // LDS: be careful of alignment
-        // TODO:: need to properly implement tensor descriptor with alignment
-        constexpr index_t in_block_space =
-            in_c_h_w_n_block_desc.GetElementSpace(Number{});
-        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number{});
+        constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace();
+        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();
 
         __shared__ Float p_in_block[in_block_space];
         __shared__ Float p_wei_block[wei_block_space];
 
@@ -225,7 +210,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #endif
 
         // set threadwise output tensor to 0
-        threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
+        threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread);
 
 #if 1
         const Float* p_in_global_block_offset =
@@ -258,7 +243,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 
             __syncthreads();
 
-            run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
+            blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
 
             __syncthreads();
         }
@@ -291,7 +276,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 
            __syncthreads();
 
-            run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
+            blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
 
             __syncthreads();
         }
@@ -308,13 +293,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
         const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock;
 
-        static_if{}([&](auto f_dummy) { // f_dummy do nothing but
-                                        // perfect forwarding.
-                                        // Using this trick to
-            // make this lambda a generic lambda, so it won't be compiled until
-            // instantiated
+        static_if{}([&](auto fwd) {
+            // fwd does nothing but perfect forwarding.
+            // This trick makes this lambda a generic lambda, so it won't be compiled until
+            // it is instantiated here
             static_assert(
-                (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
+                (fwd(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
                 "wrong!");
 
             // output is a 10d tensor
@@ -322,38 +306,33 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
             constexpr index_t N1 = NPerBlock / N2;
 
             constexpr index_t W2 =
-                (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC);
+                (GemmNLevel0Cluster * GemmNLevel1Cluster) / fwd(NPerBlock / GemmNPerThreadSubC);
             constexpr index_t W1 = WoPerBlock / W2;
 
             constexpr index_t K2 = GemmMPerThreadSubC;
             constexpr index_t K1 = KPerBlock / KPerThread;
 
-            constexpr auto out_10d_global_desc =
-                make_ConstantTensorDescriptor(Sequence{});
+            constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc)
+                                                     .Fold(I3, Number{}, Number{})
+                                                     .Fold(I2, Number{}, Number{})
+                                                     .Fold(I0, Number{}, Number{});
 
-            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
-                Sequence{});
+            constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc)
+                                                     .Fold(I3, Number<1>{}, Number{})
+                                                     .Fold(I2, Number{}, Number<1>{})
+                                                     .Fold(I0, Number<1>{}, Number{});
 
 #if 0
-            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-            {
-                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
-                                               "out_k_h_w_n_thread_desc");
-                print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");
+            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+            {
+                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
+                                               "a: out_k_h_w_n_thread_desc");
+                print_ConstantTensorDescriptor(out_10d_thread_desc, "a: out_10d_thread_desc");
 
-                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
-                                               "out_k_h_w_n_global_desc");
-                print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
-            }
+                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
+                                               "a: out_k_h_w_n_global_desc");
+                print_ConstantTensorDescriptor(out_10d_global_desc, "a: out_10d_global_desc");
+            }
 #endif
 
             threadwise_tensor_slice_copy(out_10d_thread_desc,
@@ -367,8 +346,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                              n_block_data_begin + n_thread_data_begin),
                                          out_10d_thread_desc.GetLengths(),
                                          Number{});
-        }).Else([&](auto f_dummy) {
-            static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
+        }).Else([&](auto fwd) {
+            static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                               GemmNPerThreadSubC % NPerThread == 0,
                           "wrong!");
 
@@ -377,33 +356,34 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
             constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock;
             constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster;
-            constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3);
+            constexpr index_t W1 = WoPerBlock / fwd(W2 * W3);
 
             constexpr index_t K2 = GemmMPerThreadSubC;
             constexpr index_t K1 = KPerBlock / KPerThread;
 
-            constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(
-                Sequence{});
+            constexpr auto out_10d_global_desc =
+                fwd(out_k_h_w_n_global_desc)
+                    .Fold(I3, Number{})
+                    .Fold(I2, Number{}, Number{}, Number{})
+                    .Fold(I0, Number{}, Number{});
 
-            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
-                Sequence{});
+            constexpr auto out_10d_thread_desc =
+                fwd(out_k_h_w_n_thread_desc)
+                    .Fold(I3, Number{})
+                    .Fold(I2, Number{}, Number<1>{}, Number{})
+                    .Fold(I0, Number<1>{}, Number{});
 
 #if 0
-            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-            {
-                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
-                                               "out_k_h_w_n_thread_desc");
-                print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");
+            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+            {
+                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
+                                               "b: out_k_h_w_n_thread_desc");
+                print_ConstantTensorDescriptor(out_10d_thread_desc, "b: out_10d_thread_desc");
 
-                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
-                                               "out_k_h_w_n_global_desc");
-                print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
-
-                for(index_t i = 0; i < 64; ++i)
-                {
-                    printf("out %f, ", p_out_thread[i]);
-                }
-            }
+                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
+                                               "b: out_k_h_w_n_global_desc");
+                print_ConstantTensorDescriptor(out_10d_global_desc, "b: out_10d_global_desc");
+            }
 #endif
 
             threadwise_tensor_slice_copy(out_10d_thread_desc,
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
index c5e1712332..0df27009ad 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -114,8 +114,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
             GemmDataPerReadB);
 
         constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence{},
-            Number{});
+            Sequence{}, Number{});
 
         // this check is ad-hoc
         // TODO: need to properly implement tensor descriptor with alignment
@@ -123,8 +122,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                       "GemmDataPerReadB alignment requirement is not meet");
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence{},
-            Number{});
+            Sequence{}, Number{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
@@ -201,21 +199,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                           GemmDataPerReadA,
                                                           GemmDataPerReadB>{};
 
-        // choose GEMM implementation here
-        const auto run_blockwise_batch_gemm = [&](auto... Xs) {
-#if 1
-            return blockwise_batch_gemm.Run(Xs...);
-#elif 0
-            return blockwise_batch_gemm.Run_amd_asm(Xs...);
-#else
-            return blockwise_batch_gemm.Run_asm_v2(Xs...);
-#endif
-        };
-
-        // LDS: be careful of alignment
-        constexpr index_t in_block_space =
-            in_c_h_w_n_block_desc.GetElementSpace(Number{});
-        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number{});
+        constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace();
+        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();
 
         // LDS double buffer
         __shared__ Float p_in_block_double[2 * in_block_space];
         __shared__ Float p_wei_block_double[2 * wei_block_space];
@@ -307,7 +293,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
             blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                         p_wei_register_clipboard);
 
-            run_blockwise_batch_gemm(p_wei_block_now, p_in_block_now, p_out_thread);
+            blockwise_batch_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
 
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -335,7 +321,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                         p_wei_register_clipboard);
 
             // LDS double buffer: GEMM on current data
-            run_blockwise_batch_gemm(p_wei_block_double, p_in_block_double, p_out_thread);
+            blockwise_batch_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
 
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -347,7 +333,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
         __syncthreads();
 
         // LDS double buffer: GEMM on current data
-        run_blockwise_batch_gemm(p_wei_block_double + wei_block_space,
+        blockwise_batch_gemm.Run(p_wei_block_double + wei_block_space,
                                  p_in_block_double + in_block_space,
                                  p_out_thread);
     }
diff --git a/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
index 36d3060b23..a6aebf0fd0 100644
--- a/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
@@ -170,9 +170,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
     }
 
     template
-    __device__ void Run(const FloatA* __restrict__ p_a_block,
-                        const FloatB* __restrict__ p_b_block,
-                        FloatC* __restrict__ p_c_thread) const
+    __device__ void Run_source(const FloatA* __restrict__ p_a_block,
+                               const FloatB* __restrict__ p_b_block,
+                               FloatC* __restrict__ p_c_thread) const
     {
         constexpr auto True = integral_constant{};
         constexpr auto False = integral_constant{};
@@ -189,10 +189,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         // thread A, B for GEMM
        // A is transposed, b is not
         constexpr auto a_thread_mtx =
-            make_ConstantMatrixDescriptor(Number{}, Number{});
+            make_ConstantMatrixDescriptor_packed(Number{}, Number{});
 
         constexpr auto b_thread_mtx =
-            make_ConstantMatrixDescriptor(Number{}, Number{});
+            make_ConstantMatrixDescriptor_packed(Number{}, Number{});
 
         // thread A-sub, B-sub for copy
         constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
@@ -480,6 +480,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
     }
 #endif
 
+    template
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread) const
+    {
+#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+        Run_amd_asm(p_a_block, p_b_block, p_c_thread);
+#else
+        Run_source(p_a_block, p_b_block, p_c_thread);
+#endif
+    }
+
     template
     __device__ void CopyThreadMatrixCToBlockMatrixC(const FloatC* __restrict__ p_c_thread,
                                                     FloatC* __restrict__ p_c_block) const
diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
index de82858288..0f5305d196 100644
--- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -143,7 +143,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
     constexpr index_t OutThreadCopyDataPerWrite_N = 2;
 
-#elif 0
+#elif 1
     // for 3x3, 34x34, v1r3, Pascal
     // for 3x3, 28x28, v1r3, Pascal
     // for 3x3, 14x14, v1r3, Pascal
@@ -478,9 +478,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
         GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 #elif 0
         GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
-#elif 0
-        GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #elif 1
+        GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
+#elif 0
         GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #endif
         ;
 
+    using ConvStrides = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;
 
     constexpr index_t HPad = 0;
@@ -519,19 +519,19 @@ int main(int argc, char* argv[])
 #if 0
     device_convolution_direct_v2_nchw_kcyx_nkhw
+    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
+#elif 1
+    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
+    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
+    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
+        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
diff --git a/script/cmake-cuda.sh b/script/cmake-cuda.sh
index 7feb67e213..4828c22fc8 100755
--- a/script/cmake-cuda.sh
+++ b/script/cmake-cuda.sh
@@ -13,11 +13,11 @@
 cmake -D CMAKE_BUILD_TYPE=Release \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D DEVICE_BACKEND=NVIDIA \
--D BOOST_ROOT="/package/install/boost_1.67.0" \
--D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \
--D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
+-D CUDA_COMMON_INCLUDE_DIR="/package/install/cuda/10.1/NVIDIA_CUDA-10.1_Samples/common/inc" \
+-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
 ${MY_PROJECT_SOURCE}
+#-D BOOST_ROOT="/package/install/boost_1.67.0" \
 #-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \
 #-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
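
Note on the `Run` dispatch change above: both gridwise kernels previously picked among `Run`, `Run_amd_asm`, and `Run_asm_v2` through a per-kernel `run_blockwise_batch_gemm` lambda with hard-coded `#if`/`#elif` branches, and the patch moves that choice into a single macro-guarded `Run` member in `blockwise_batched_gemm.hpp`, so every call site simply calls `blockwise_batch_gemm.Run(...)`. The following is a minimal standalone sketch of that pattern; it is illustrative only, and every name except the two `CK_*` macros is a stand-in rather than the real CK type or member (`__device__` is also dropped so the sketch runs as plain host C++).

// Sketch of the compile-time dispatch pattern introduced by this patch.
// Assumption: the CK_* macros default to 0 when the build system does not set them.
#include <cstdio>

#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 0
#endif
#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 0
#endif

struct BlockwiseGemmSketch // stand-in for the real blockwise GEMM struct
{
    template <class FloatA, class FloatB, class FloatC>
    void Run_amd_asm(const FloatA*, const FloatB*, FloatC*) const
    {
        std::printf("inline-asm path\n"); // placeholder for the hand-tuned path
    }

    template <class FloatA, class FloatB, class FloatC>
    void Run_source(const FloatA*, const FloatB*, FloatC*) const
    {
        std::printf("generic C++ path\n"); // placeholder for the portable path
    }

    // Single entry point: the implementation is selected once, at compile time,
    // instead of at every call site.
    template <class FloatA, class FloatB, class FloatC>
    void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) const
    {
#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
        Run_amd_asm(p_a, p_b, p_c);
#else
        Run_source(p_a, p_b, p_c);
#endif
    }
};

int main()
{
    float a[4] = {}, b[4] = {}, c[4] = {};
    BlockwiseGemmSketch{}.Run(a, b, c); // prints "generic C++ path" unless both macros are 1
    return 0;
}

A side benefit of this arrangement is that the unselected branch is discarded by the preprocessor, so the inline-asm path does not even have to compile on non-AMD backends.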