From 1f70524471c2e198f5372a3952252a7b042289ac Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 12 Sep 2019 01:12:08 -0500 Subject: [PATCH] padding for chwn is functional --- ...plicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp | 288 +++++------------- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 4 +- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 4 +- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 9 +- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 9 +- .../multi_index_transform.hpp | 6 + .../blockwise_generic_tensor_slice_copy.hpp | 23 +- .../threadwise_generic_tensor_slice_copy.hpp | 11 +- driver/src/driver.cpp | 36 ++- 9 files changed, 118 insertions(+), 272 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp index cfa8e9f02d..23a7f5b05e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp @@ -47,27 +47,18 @@ template struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded { - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; - static constexpr auto I5 = Number<5>{}; - static constexpr auto I6 = Number<6>{}; - static constexpr auto I7 = Number<7>{}; - static constexpr auto I8 = Number<8>{}; - static constexpr auto I9 = Number<9>{}; - static constexpr auto I10 = Number<10>{}; - static constexpr auto I11 = Number<11>{}; - - static constexpr auto True = integral_constant{}; - static constexpr auto False = integral_constant{}; - -#if 1 __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const { + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto True = integral_constant{}; + static constexpr auto False = integral_constant{}; + // be careful of this assertion static_assert( NPerBlock % NPerThread == 0 && @@ -122,8 +113,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded const index_t wo_block_data_begin = block_work_multi_id[2] * WoPerBlock; const index_t n_block_data_begin = block_work_multi_id[3] * NPerBlock; - const index_t hi_block_data_begin = ho_block_data_begin - LeftPads{}[0]; - const index_t wi_block_data_begin = wo_block_data_begin - LeftPads{}[1]; + const index_t hp_block_data_begin = ho_block_data_begin; + const index_t wp_block_data_begin = wo_block_data_begin; // input global tensor view constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor( @@ -133,12 +124,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); - // global tensor view - constexpr auto wei_c_k_global_desc_old = wei_c_y_x_k_global_desc_old.Extract(I0, I3); - - constexpr auto wei_c_k_global_desc = make_native_tensor_descriptor( - wei_c_k_global_desc_old.GetLengths(), wei_c_k_global_desc_old.GetStrides()); - // LDS tensor view // be careful of alignment constexpr index_t max_align = math::lcm(InBlockCopyDataPerAccess_N, @@ -158,15 +143,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc_old = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); + constexpr auto wei_c_1_1_k_block_desc_old = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); - constexpr auto wei_c_k_block_desc = make_native_tensor_descriptor( - wei_c_k_block_desc_old.GetLengths(), wei_c_k_block_desc_old.GetStrides()); + constexpr auto wei_c_1_1_k_block_desc = make_native_tensor_descriptor( + wei_c_1_1_k_block_desc_old.GetLengths(), wei_c_1_1_k_block_desc_old.GetStrides()); // LDS: be careful of alignment constexpr index_t in_block_space = in_c_h_w_n_block_desc_old.GetElementSpace(); - constexpr index_t wei_block_space = wei_c_k_block_desc_old.GetElementSpace(); + constexpr index_t wei_block_space = wei_c_1_1_k_block_desc_old.GetElementSpace(); __shared__ Float p_in_block[in_block_space]; __shared__ Float p_wei_block[wei_block_space]; @@ -181,46 +166,45 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded // blockwise input copy // format is [C, Hi, Wi, N] auto blockwise_in_copy = -#if 0 - BlockwiseGenericTensorSliceCopy_v2 -#else - BlockwiseGenericTensorSliceCopy_v4 -#endif - , - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - InBlockCopyDataPerAccess_N, - InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0}); + BlockwiseGenericTensorSliceCopy_v4, + Sequence<0, 1, 2, 3>, + Sequence<0, 1, 2, 3>, + 3, + 3, + InBlockCopyDataPerAccess_N, + InBlockCopyDataPerAccess_N>( + {0, hp_block_data_begin, wp_block_data_begin, n_block_data_begin}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, KPerBlock] - const auto blockwise_wei_copy = -#if 0 - BlockwiseGenericTensorSliceCopy_v2 -#else - BlockwiseGenericTensorSliceCopy_v4 -#endif - , - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); + using WeiBlockCopySubLengths_CYXK = + Sequence; + using WeiBlockCopyClusterLengths_CYXK = Sequence; + + auto blockwise_wei_copy = + BlockwiseGenericTensorSliceCopy_v4, + Sequence<0, 1, 2, 3>, + Sequence<0, 1, 2, 3>, + 3, + 3, + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>( + {0, 0, 0, k_block_data_begin}, {0, 0, 0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -228,8 +212,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded // A_matrix[C,K] is a sub-matrix of wei_block[C,K] // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] - constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); + constexpr auto a_c_k_block_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, + Number{}, + Number{}); constexpr auto b_c_wn_block_mtx_desc = make_ConstantMatrixDescriptor(Number{}, @@ -270,39 +256,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded // set threadwise output tensor to 0 threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread); -#if 1 - for(index_t y = 0; y < Y; ++y) - { - for(index_t x = 0; x < X; ++x) - { - const Float* p_in_global_block_offset = - p_in_global + - in_c_h_w_n_global_desc.CalculateOffset( - {0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin}); - - const Float* p_wei_global_block_offset = - p_wei_global + - wei_c_y_x_k_global_desc.CalculateOffset({0, y, x, k_block_data_begin}); - - for(index_t c_block_data_begin = 0; c_block_data_begin < C; - c_block_data_begin += CPerBlock, - p_in_global_block_offset += - CPerBlock * in_c_h_w_n_global_desc.GetStride(I0), - p_wei_global_block_offset += - CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) - { - blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); - blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); - - __syncthreads(); - - blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread); - - __syncthreads(); - } - } - } -#else for(index_t y = 0; y < Y; ++y) { for(index_t x = 0; x < X; ++x) @@ -310,8 +263,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock) { - blockwise_in_copy.Run(); - blockwise_wei_copy.Run(); + blockwise_in_copy.Run(p_in_global, p_in_block); + blockwise_wei_copy.Run(p_wei_global, p_wei_block); __syncthreads(); @@ -320,28 +273,29 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded __syncthreads(); // move along C - blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(CPerBlock, 0, 0, 0), + True); + blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(CPerBlock, 0, 0, 0), + True); } // reset C - blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, False); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, False); + blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(C, 0, 0, 0), False); + blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(C, 0, 0, 0), False); // move aling X - blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 0, 1, 0>{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 0, 1, 0>{}, True); + blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(0, 0, 1, 0), True); + blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(0, 0, 1, 0), True); } // reset X - blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 0, X, 0>{}, False); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 0, X, 0>{}, False); + blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(0, 0, X, 0), False); + blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(0, 0, X, 0), False); // move along Y - blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 1, 0, 0>{}, False); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 1, 0, 0>{}, False); + blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(0, 1, 0, 0), True); + blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(0, 1, 0, 0), True); } -#endif // output: register to global mem const auto c_thread_mtx_begin = @@ -454,110 +408,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded .Run(p_out_thread, p_out_thread_on_global); }); } -#elif 0 - __device__ void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) const - { - // create a native tensor descriptor - constexpr auto in_c_h_w_n_global_desc = - make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); - - constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0); - constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1); - constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2); - constexpr index_t N = in_c_h_w_n_global_desc.GetLength(I3); - - // transformation: {c, h, w, n} --> {n, c, hp, wp} - // {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n} - constexpr auto in_n_c_hp_wp_global_desc = transform_tensor_descriptor( - in_c_h_w_n_global_desc, - make_tuple( - Pad, LeftPads, RightPads>{}, PassThrough{}, PassThrough{}), - make_tuple(Sequence<1, 2>{}, Sequence<0>{}, Sequence<3>{}), - make_tuple(Sequence<2, 3>{}, Sequence<1>{}, Sequence<0>{})); - - // transformation: {n, c, hp, wp} --> {c, b} - // {n, hp, wp} --> {b}, {c} --> {c} - constexpr auto in_c_b_global_desc = transform_tensor_descriptor( - in_n_c_hp_wp_global_desc, - make_tuple(Merge{}, - PassThrough{}), - make_tuple(Sequence<0, 2, 3>{}, Sequence<1>{}), - make_tuple(Sequence<1>{}, Sequence<0>{})); - - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - // 0 - print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc); - - // 1 - print_tensor_descriptor("in_n_c_hp_wp_global_desc", in_n_c_hp_wp_global_desc); - - // 2 - print_tensor_descriptor("in_c_b_global_desc", in_c_b_global_desc); - - constexpr auto idx2 = MultiIndex<2>{1, 4 * (16 * 16) + 5 * 16 + 6}; - auto idx1 = in_c_b_global_desc.CalculateLowerIndex(idx2); - auto idx0 = in_c_b_global_desc.GetLowerTensorDescriptor().CalculateLowerIndex(idx1); - - print_array("idx2: ", idx2); - print_array("idx1: ", idx1); - print_array("idx0: ", idx0); - - printf("in_c_b_global_desc offset: %lu\n", in_c_b_global_desc.CalculateOffset(idx2)); - } - } -#else - __device__ void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) const - { - // create a native tensor descriptor - constexpr auto in_c_h_w_n_global_desc = - make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); - - constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0); - constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1); - constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2); - constexpr index_t N = in_c_h_w_n_global_desc.GetLength(I3); - - // transformation: {c, h, w, n} --> {n, c, hp, wp} - // {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n} - constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor( - in_c_h_w_n_global_desc, - make_tuple( - PassThrough{}, Pad, LeftPads, RightPads>{}, PassThrough{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); - - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - // 0 - print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc); - - // 1 - print_tensor_descriptor("in_c_hp_wp_n_global_desc", in_c_hp_wp_n_global_desc); - - constexpr auto idx1 = MultiIndex<4>{1, 2, 3, 4}; - auto idx0 = in_c_hp_wp_n_global_desc.CalculateLowerIndex(idx1); - - print_array("idx1: ", idx1); - print_array("idx0: ", idx0); - - auto coord1 = make_tensor_coordinate_v2(in_c_hp_wp_n_global_desc, idx1); - - print_array("1: ", coord1.GetIndex()); - print_array("0: ", coord1.GetLowerCoordinate().GetIndex()); - - printf("in_c_hp_wp_n_global_desc is_in_pad: %d\n", - coord1.IsAnyLevelIndexInPaddingArea()); - - printf("in_c_hp_wp_n_global_desc offset: %lu\n", - in_c_hp_wp_n_global_desc.CalculateOffset(idx1)); - } - } -#endif }; } // namespace ck diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index d7199bce57..5fb465c519 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -304,8 +304,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); #else - blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true); - blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true); + blockwise_in_copy.MoveSrcSliceWindow({EPerBlock, 0, 0, 0}, true); + blockwise_wei_copy.MoveSrcSliceWindow({EPerBlock, 0}, true); #endif } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index a5ea753dd0..629e822dcb 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -303,7 +303,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; - blockwise_in_copy.MoveSrcSlicingWindow(Sequence{}, True); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); __syncthreads(); @@ -328,7 +328,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; - blockwise_in_copy.MoveSrcSlicingWindow(Sequence{}, True); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); __syncthreads(); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 38fc225f23..81f5e87960 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -240,8 +240,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw __syncthreads(); - blockwise_in_copy.MoveSrcSlicingWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSlicingWindow(Sequence{}, True); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); } // copy output: register to global memory @@ -297,9 +297,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw { threadwise_out_copy.Run(p_out_thread, p_out_global); - threadwise_out_copy.MoveSrcSlicingWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, - True); - threadwise_out_copy.MoveDstSlicingWindow(Sequence<0, 0, B1>{}, True); + threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); + threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); } } } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index eddd759b87..eeb63bf8c8 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -269,7 +269,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; - blockwise_in_copy.MoveSrcSlicingWindow(Sequence{}, True); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0]; __syncthreads(); @@ -294,7 +294,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; // even iteration - blockwise_in_copy.MoveSrcSlicingWindow(Sequence{}, True); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0]; __syncthreads(); @@ -379,9 +379,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer { threadwise_out_copy.Run(p_out_thread, p_out_global); - threadwise_out_copy.MoveSrcSlicingWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, - True); - threadwise_out_copy.MoveDstSlicingWindow(Sequence<0, 0, B1>{}, True); + threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); + threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); } } } diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index d26b4f1efa..7becf3093e 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -8,6 +8,12 @@ namespace ck { template using MultiIndex = Array; +template +__host__ __device__ constexpr auto make_multi_index(Xs... xs) +{ + return MultiIndex(xs...); +} + template struct PassThrough { diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index f3085ef0c2..071996fa60 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -408,8 +408,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 template __device__ void - MoveSrcSlicingWindow(T step_sizes, - integral_constant positive_direction) + MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) { static_for<0, nDim, 1>{}([&](auto idim) { if(step_sizes[idim] != 0) @@ -506,18 +505,16 @@ struct BlockwiseGenericTensorSliceCopy_v2 template __device__ void - MoveSrcSlicingWindow(T step_sizes, - integral_constant positive_direction) + MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) { - mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction); + mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); } template __device__ void - MoveDstSlicingWindow(T step_sizes, - integral_constant positive_direction) + MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) { - mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction); + mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction); } private: @@ -753,18 +750,16 @@ struct BlockwiseGenericTensorSliceCopy_v4 template __device__ void - MoveSrcSlicingWindow(T step_sizes, - integral_constant positive_direction) + MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) { - mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction); + mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); } template __device__ void - MoveDstSlicingWindow(T step_sizes, - integral_constant positive_direction) + MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) { - mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction); + mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction); } private: diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 3fba453a36..723f6766da 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -757,7 +757,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // T can be Sequence or Array template - __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant) + __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) { static_if{}([&](auto) { mSrcSliceOrigin += step_sizes; @@ -765,7 +765,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 } template - __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant) + __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) { static_if{}([&](auto) { mDstSliceOrigin += step_sizes; @@ -1045,8 +1045,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO: still kind of messy if(!src_coord.IsAnyLevelIndexInPaddingArea()) { - const index_t src_offset = - (mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset(); + const index_t src_offset = src_coord.GetOffset(); const index_t buffer_offset = i * src_data_per_access; @@ -1073,7 +1072,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 } template - __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant) + __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) { static_if{}([&](auto) { mSrcSliceOrigin += step_sizes; @@ -1081,7 +1080,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 } template - __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant) + __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) { static_if{}([&](auto) { mDstSliceOrigin += step_sizes; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 00ce2079f6..177386ab8b 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -72,20 +72,20 @@ int main(int argc, char* argv[]) { using namespace ck; -#if 1 +#if 0 constexpr index_t N = 32; constexpr index_t C = 8; - constexpr index_t HI = 2; - constexpr index_t WI = 2; + constexpr index_t HI = 1; + constexpr index_t WI = 1; constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; + constexpr index_t Y = 1; + constexpr index_t X = 1; using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - constexpr index_t HPad = 1; - constexpr index_t WPad = 1; + using LeftPads = Sequence<1, 1>; + using RightPads = Sequence<0, 0>; #elif 1 // 3x3, 34x34 constexpr index_t N = 64; @@ -99,8 +99,8 @@ int main(int argc, char* argv[]) using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - constexpr index_t HPad = 1; - constexpr index_t WPad = 1; + using LeftPads = Sequence<1, 1>; + using RightPads = Sequence<1, 1>; #elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% @@ -311,13 +311,10 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #endif - auto lower_pads = Sequence{}; - auto upper_pads = Sequence{}; - auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence{}); auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence{}); auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( - in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, lower_pads, upper_pads); + in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); @@ -378,8 +375,8 @@ int main(int argc, char* argv[]) wei_kcyx, out_nkhw_desc, out_nkhw_device, - lower_pads, - upper_pads, + LeftPads{}, + RightPads{}, nrepeat); #elif 0 device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw( @@ -434,11 +431,12 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 0 +#if 1 if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 && ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1) { - host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); + host_winograd_3x3_convolution( + in_nchw, wei_kcyx, out_nkhw_host, LeftPads{}, RightPads{}); } else #endif @@ -448,8 +446,8 @@ int main(int argc, char* argv[]) out_nkhw_host, ConvStrides{}, ConvDilations{}, - lower_pads, - upper_pads); + LeftPads{}, + RightPads{}); } check_error(out_nkhw_host, out_nkhw_device);