From 8bdaba51f89b06032a402fa7faedae583cbc3131 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 13 Aug 2019 00:37:23 -0500 Subject: [PATCH] clean up --- ...tion_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp | 15 +- ...tion_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp | 141 +++++++------- ..._v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp | 182 ++++++++++-------- ..._v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 63 +++--- ...lution_implicit_gemm_v2_chwn_cyxk_khwn.hpp | 15 +- ...mm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp | 53 +++-- ...mm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 41 ++-- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 64 +++--- ..._v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp | 33 ++-- ..._v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp | 41 ++-- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 62 +++--- .../tensor_description/tensor_coordinate.hpp | 22 +++ .../blockwise_2d_tensor_op.hpp | 18 +- .../blockwise_3d_tensor_op.hpp | 10 +- .../blockwise_4d_tensor_op.hpp | 10 +- .../blockwise_generic_tensor_slice_copy.hpp | 47 ++--- .../blockwise_tensor_slice_copy.hpp | 16 +- .../threadwise_generic_tensor_slice_copy.hpp | 173 +---------------- ...lution_implicit_gemm_v1_chwn_cyxk_khwn.hpp | 140 +++++++------- driver/src/driver.cpp | 10 +- 20 files changed, 505 insertions(+), 651 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp index 5dcf4f415e..ce6965ec62 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp @@ -241,16 +241,15 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); #else - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global_block_offset, p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block); #endif __syncthreads(); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp index ef608e6061..619faaf094 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp @@ -4,11 +4,8 @@ #include "common_header.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_2d_tensor_op.hpp" -#include "blockwise_4d_tensor_op.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" -#include "threadwise_tensor_slice_copy.hpp" -#include "threadwise_4d_tensor_op.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_batched_gemm.hpp" namespace ck { @@ -37,10 +34,13 @@ template + index_t InBlockCopyDataPerAccess_N, + class WeiBlockCopySubLengths_CK, + class WeiBlockCopyClusterLengths_CK, + index_t WeiBlockCopyDataPerAccess_K, + index_t OutThreadCopyDataPerAccess_N> struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn { __device__ void Run(const Float* const __restrict__ p_in_global, @@ -103,8 +103,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = math::lcm(InBlockCopyDataPerRead_N, - WeiBlockCopyDataPerRead_K, + constexpr index_t max_align = math::lcm(InBlockCopyDataPerAccess_N, + WeiBlockCopyDataPerAccess_K, GemmDataPerReadA, GemmDataPerReadB); @@ -123,24 +123,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( Sequence{}); -// blockwise copy -// input: format is [C, Hi, Wi, N] -#if 0 - const auto blockwise_in_copy = - Blockwise4dTensorCopy3{}; -#elif 0 - using InBlockCopySubLengths_CHWN = - decltype(in_c_h_w_n_block_desc.GetLengths() / InBlockCopyClusterLengths_CHWN{}); - + // blockwise copy + // input: format is [C, Hi, Wi, N] auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v1, Sequence<0, 1, 2, 3>, Sequence<0, 1, 2, 3>, - 1, - 1>({0, 0, 0, 0}, {0, 0, 0, 0}); -#elif 1 - using InBlockCopySubLengths_CHWN = - decltype(in_c_h_w_n_block_desc.GetLengths() / InBlockCopyClusterLengths_CHWN{}); - - auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2< - BlockSize, - decltype(in_c_h_w_n_global_desc), - decltype(in_c_h_w_n_block_desc), - NormalTensorCoordinate, - NormalTensorCoordinate, - decltype(in_c_h_w_n_block_desc.GetLengths()), - InBlockCopySubLengths_CHWN, - InBlockCopyClusterLengths_CHWN, - Sequence<0, 1, 2, 3>>({0, 0, 0, 0}, {0, 0, 0, 0}); -#endif + 3, + 3, + InBlockCopyDataPerAccess_N, + InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, + {0, 0, 0, 0}); // blockwise wei copy - // format is [CPerBlock, X * KPerBlock] + // format is [CPerBlock, KPerBlock] const auto blockwise_wei_copy = - Blockwise2dTensorCopy3({0, 0}, {0, 0}); + BlockwiseGenericTensorSliceCopy_v2, + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -278,7 +259,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn } } - // output: register to global mem, + // output: register to global mem const auto c_thread_mtx_begin = blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); @@ -329,17 +310,24 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + Float* p_out_thread_on_global = p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin); + + ThreadwiseGenericTensorSliceCopy_v2r1::type, + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>( + make_zero_array(), make_zero_array()) + .Run(p_out_thread, p_out_thread_on_global); }).Else([&](auto fwd) { static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -380,17 +368,24 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + Float* p_out_thread_on_global = p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin); + + ThreadwiseGenericTensorSliceCopy_v2r1::type, + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>( + make_zero_array(), make_zero_array()) + .Run(p_out_thread, p_out_thread_on_global); }); } }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp index 8fa2aeb89f..d9d948415f 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -4,10 +4,8 @@ #include "common_header.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_2d_tensor_op.hpp" -#include "blockwise_4d_tensor_op.hpp" -#include "threadwise_tensor_slice_copy.hpp" -#include "threadwise_4d_tensor_op.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_batched_gemm.hpp" namespace ck { @@ -36,10 +34,13 @@ template + index_t InBlockCopyDataPerAccess_N, + class WeiBlockCopySubLengths_CK, + class WeiBlockCopyClusterLengths_CK, + index_t WeiBlockCopyDataPerAccess_K, + index_t OutThreadCopyDataPerAccess_N> struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer { __device__ void Run(const Float* const __restrict__ p_in_global, @@ -108,8 +109,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer // LDS tensor view // be careful of alignment - constexpr index_t max_align = math::lcm(InBlockCopyDataPerRead_N, - WeiBlockCopyDataPerRead_K, + constexpr index_t max_align = math::lcm(InBlockCopyDataPerAccess_N, + WeiBlockCopyDataPerAccess_K, GemmDataPerReadA, GemmDataPerReadB); @@ -130,24 +131,38 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer // blockwise copy // input: format is [C, Hi, Wi, N] - const auto blockwise_in_copy = - Blockwise4dTensorCopy3{}; + auto blockwise_in_copy = + BlockwiseGenericTensorSliceCopy_v2, + Sequence<0, 1, 2, 3>, + Sequence<0, 1, 2, 3>, + 3, + 3, + InBlockCopyDataPerAccess_N, + InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, + {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, X * KPerBlock] const auto blockwise_wei_copy = - Blockwise2dTensorCopy3({0, 0}, {0, 0}); + BlockwiseGenericTensorSliceCopy_v2, + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -233,18 +248,18 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer // LDS double buffer: preload data into LDS { - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global_block_offset, + p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_double); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_double); } // LDS double buffer: main body @@ -266,9 +281,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer Float* p_wei_block_next = even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double; - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float - p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0); @@ -278,25 +292,25 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global_block_offset, + p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); blockwise_batch_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_next); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_next); } } // LDS double buffer: tail { - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; // even iteration p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0); @@ -305,19 +319,19 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global_block_offset, + p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_batch_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard( - p_wei_register_clipboard, p_wei_block_double + wei_block_space); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); @@ -330,7 +344,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer } } - // output: register to global mem, + // output: register to global mem const auto c_thread_mtx_begin = blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); @@ -381,17 +395,24 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer } #endif - threadwise_tensor_slice_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + Float* p_out_thread_on_global = p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin); + + ThreadwiseGenericTensorSliceCopy_v2r1::type, + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>( + make_zero_array(), make_zero_array()) + .Run(p_out_thread, p_out_thread_on_global); }).Else([&](auto fwd) { static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -432,17 +453,24 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer } #endif - threadwise_tensor_slice_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + Float* p_out_thread_on_global = p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin); + + ThreadwiseGenericTensorSliceCopy_v2r1::type, + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>( + make_zero_array(), make_zero_array()) + .Run(p_out_thread, p_out_thread_on_global); }); } }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp index ff5246435b..5b9216a8dc 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -254,19 +254,18 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - Float p_in_register_clipboard[blockwise_in_copy_reorder - .GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy_reorder.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; - blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy_reorder.RunLoadRegisterBuffer(p_in_global_block_offset, + p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); - blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double); + blockwise_in_copy_reorder.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_double); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_double); } // LDS double buffer: main body @@ -288,10 +287,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer Float* p_wei_block_next = even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double; - Float p_in_register_clipboard[blockwise_in_copy_reorder - .GetRegisterClipboardSize()]; Float - p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + p_in_register_buffer[blockwise_in_copy_reorder.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1); @@ -301,27 +299,26 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy_reorder.RunLoadRegisterBuffer(p_in_global_block_offset, + p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); // LDS double buffer: GEMM on current data run_blockwise_batch_gemm(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); + blockwise_in_copy_reorder.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_next); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_next); } } // LDS double buffer: tail { - Float p_in_register_clipboard[blockwise_in_copy_reorder - .GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy_reorder.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; // even iteration p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1); @@ -330,19 +327,19 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy_reorder.RunLoadRegisterBuffer(p_in_global_block_offset, + p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); // LDS double buffer: GEMM on current data run_blockwise_batch_gemm(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy_reorder.RunStoreRegisterClipboard( - p_in_register_clipboard, p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard( - p_wei_register_clipboard, p_wei_block_double + wei_block_space); + blockwise_in_copy_reorder.RunStoreRegisterBuffer( + p_in_register_buffer, p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp index b39bb66a2c..19defcba6c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp @@ -214,16 +214,15 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn __syncthreads()) { // load data - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global_block_offset, p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block); __syncthreads(); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp index 52abeab530..a9e432d9f0 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -209,17 +209,15 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer // preload data into LDS { - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global_block_offset, p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_double); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_double); } // register @@ -247,18 +245,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double; // load next data - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); __syncthreads(); - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global_block_offset, + p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); // compute on current data // a series of GEMM @@ -280,10 +278,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } } - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); } } @@ -295,14 +291,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer __syncthreads(); - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global_block_offset, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global_block_offset, + p_wei_register_buffer); for(index_t y = 0; y < Y; ++y) { @@ -322,10 +317,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } } - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double + wei_block_space); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_double + wei_block_space); // odd __syncthreads(); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp index fb4571ffdb..25d73df497 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -267,9 +267,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer Float* p_wei_block_next = even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double; - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float - p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; p_in_block_on_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1); p_wei_block_on_global += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0); @@ -277,26 +276,26 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterClipboard(p_in_block_on_global, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_block_on_global, + p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, + p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_next); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_next); } } // LDS double buffer: tail { - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; // even iteration p_in_block_on_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1); @@ -305,19 +304,19 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterClipboard(p_in_block_on_global, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_block_on_global, + p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, + p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard( - p_wei_register_clipboard, p_wei_block_double + wei_block_space); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index a8b330458d..0efac06ed8 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -176,22 +176,21 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer InBlockCopyDstDataPerWrite_N2>( {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); #else - auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2< - BlockSize, - decltype(in_e_n1_b_n2_global_merged_desc), - decltype(in_e_n1_b_n2_block_desc), - MergedTensorCoordinate, - NormalTensorCoordinate, - decltype(in_e_n1_b_n2_block_desc.GetLengths()), - InBlockCopySubLengths_E_N1_B_N2, - InBlockCopyClusterLengths_E_N1_B_N2, - InBlockCopyThreadClusterArrangeOrder, - InBlockCopySrcAccessOrder, - InBlockCopyDstAccessOrder, - 2, - 3, - InBlockCopySrcDataPerRead_B, - InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + auto blockwise_in_copy = + BlockwiseGenericTensorSliceCopy_v2( + {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); #endif // weight tensor @@ -225,22 +224,21 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer WeiBlockCopyDstDataPerWrite_K>( {0, k_block_data_on_global}, {0, 0}); #else - auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2< - BlockSize, - decltype(wei_e_k_global_desc), - decltype(wei_e_k_block_desc), - NormalTensorCoordinate, - NormalTensorCoordinate, - decltype(wei_e_k_block_desc.GetLengths()), - WeiBlockCopySubLengths_E_K, - WeiBlockCopyClusterLengths_E_K, - WeiBlockCopyThreadClusterArrangeOrder, - WeiBlockCopySrcAccessOrder, - WeiBlockCopyDstAccessOrder, - 0, - 1, - WeiBlockCopySrcDataPerRead_E, - WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0}); + auto blockwise_wei_copy = + BlockwiseGenericTensorSliceCopy_v2( + {0, k_block_data_on_global}, {0, 0}); #endif // GEMM definition @@ -448,8 +446,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer ThreadwiseGenericTensorSliceCopy_v2r1< decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc), - NormalTensorCoordinate, - MergedTensorCoordinate, decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()), arithmetic_sequence_gen<0, 8, 1>::type, arithmetic_sequence_gen<0, 8, 1>::type, diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp index bf52b30438..edfe828087 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -313,8 +313,8 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer Float* p_wei_block_next = even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double; - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); @@ -322,25 +322,23 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, + p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); } } // LDS double buffer: tail { - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; // even iteration blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); @@ -349,18 +347,17 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double + wei_block_space); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp index 85b2d7894c..fdc29a2e71 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -319,8 +319,8 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer Float* p_wei_block_next = even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double; - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); @@ -328,9 +328,9 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, + p_wei_register_buffer); #if 0 if(get_block_1d_id() == 0) @@ -338,10 +338,10 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer printf("tid (%d %d), %f %f %f %f\n", get_block_1d_id(), get_thread_local_1d_id(), - p_wei_register_clipboard[0], - p_wei_register_clipboard[1], - p_wei_register_clipboard[2], - p_wei_register_clipboard[3]); + p_wei_register_buffer[0], + p_wei_register_buffer[1], + p_wei_register_buffer[2], + p_wei_register_buffer[3]); } #endif @@ -349,17 +349,15 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); } } // LDS double buffer: tail { - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; + Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; // even iteration blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); @@ -368,18 +366,17 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, - p_wei_register_clipboard); + blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double + wei_block_space); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 5241086a1a..e49df5b4bb 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -134,8 +134,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer BlockwiseGenericTensorSliceCopy_v2, - NormalTensorCoordinate, decltype(in_e_b_block_desc.GetLengths()), InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B, @@ -162,22 +160,21 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in - auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2< - BlockSize, - decltype(wei_e_k_global_desc), - decltype(wei_e_k_block_desc), - NormalTensorCoordinate, - NormalTensorCoordinate, - decltype(wei_e_k_block_desc.GetLengths()), - WeiBlockCopySubLengths_E_K, - WeiBlockCopyClusterLengths_E_K, - WeiBlockCopyThreadClusterArrangeOrder, - WeiBlockCopySrcAccessOrder, - WeiBlockCopyDstAccessOrder, - 0, - 1, - WeiBlockCopySrcDataPerRead_E, - WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0}); + auto blockwise_wei_copy = + BlockwiseGenericTensorSliceCopy_v2( + {0, k_block_data_on_global}, {0, 0}); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx @@ -365,21 +362,20 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer using OutThreadCopySliceLengths = Sequence; - auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1< - decltype(out_k0_k1_b_thread_desc), - decltype(out_k0_k1_b_global_desc), - NormalTensorCoordinate, - MergedTensorCoordinate, - OutThreadCopySliceLengths, - arithmetic_sequence_gen<0, 3, 1>::type, - arithmetic_sequence_gen<0, 3, 1>::type, - 2, - 2, - OutThreadCopyDataPerAccess_B, - OutThreadCopyDataPerAccess_B>({0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - b_thread_data_on_global}); + auto threadwise_out_copy = + ThreadwiseGenericTensorSliceCopy_v2r1::type, + arithmetic_sequence_gen<0, 3, 1>::type, + 2, + 2, + OutThreadCopyDataPerAccess_B, + OutThreadCopyDataPerAccess_B>( + {0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + b_thread_data_on_global}); for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) { diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index eee8c27502..77ed7c052b 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -295,5 +295,27 @@ struct MergedTensorCoordinate index_t mOffset; }; +template +struct TensorCoordinate +{ + private: + template + __host__ __device__ static constexpr auto + MakeDummyTensorCoordinate(ConstantTensorDescriptor) + { + return NormalTensorCoordinate>(); + } + + template + __host__ __device__ static constexpr auto + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + { + return MergedTensorCoordinate>(); + } + + public: + using type = decltype(MakeDummyTensorCoordinate(TensorDesc{})); +}; + } // namespace ck #endif diff --git a/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp index 527c84b67a..08e8bbd74a 100644 --- a/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp @@ -563,7 +563,7 @@ struct Blockwise2dTensorCopy3 } } - __device__ constexpr index_t GetRegisterClipboardSize() const + __device__ constexpr index_t GetRegisterBufferSize() const { static_assert(is_same{}, "wrong! only support float!\n"); @@ -579,8 +579,8 @@ struct Blockwise2dTensorCopy3 return DataPerRead * (L0 + thread_per_d0 - 1) / thread_per_d0; } - __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, - Float* __restrict__ p_clipboard) const + __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src, + Float* __restrict__ p_clipboard) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -630,8 +630,8 @@ struct Blockwise2dTensorCopy3 } } - __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, - Float* __restrict__ p_dst) const + __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard, + Float* __restrict__ p_dst) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -681,8 +681,8 @@ struct Blockwise2dTensorCopy3 } #if CK_USE_AMD_INLINE_ASM - __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src, - Float* p_clipboard) const + __device__ void RunLoadRegisterBuffer_asm(const Float* __restrict__ p_src, + Float* p_clipboard) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -741,8 +741,8 @@ struct Blockwise2dTensorCopy3 } } - __device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard, - Float* __restrict__ p_dst) const + __device__ void RunStoreRegisterBuffer_asm(const Float* __restrict__ p_clipboard, + Float* __restrict__ p_dst) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp index e328caf495..0c4ed9d5c1 100644 --- a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp @@ -237,7 +237,7 @@ struct Blockwise3dTensorCopy3 } } - __device__ static constexpr index_t GetRegisterClipboardSize() + __device__ static constexpr index_t GetRegisterBufferSize() { static_assert(is_same{}, "wrong! only support float!\n"); @@ -260,8 +260,8 @@ struct Blockwise3dTensorCopy3 return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2; } - __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, - Float* __restrict__ p_clipboard) const + __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src, + Float* __restrict__ p_clipboard) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -316,8 +316,8 @@ struct Blockwise3dTensorCopy3 } } - __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, - Float* __restrict__ p_dst) const + __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard, + Float* __restrict__ p_dst) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp index 95fcd28023..4185e066fb 100644 --- a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp @@ -596,7 +596,7 @@ struct Blockwise4dTensorCopy3 } } - __device__ constexpr index_t GetRegisterClipboardSize() const + __device__ constexpr index_t GetRegisterBufferSize() const { static_assert(is_same{}, "wrong! only support float!\n"); @@ -623,8 +623,8 @@ struct Blockwise4dTensorCopy3 return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2 * nloop_d3; } - __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, - Float* __restrict__ p_clipboard) const + __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src, + Float* __restrict__ p_clipboard) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -690,8 +690,8 @@ struct Blockwise4dTensorCopy3 } } - __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, - Float* __restrict__ p_dst) const + __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard, + Float* __restrict__ p_dst) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index cc58e44fb7..95d9e92daa 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -420,8 +420,6 @@ struct BlockwiseGenericTensorSliceCopy_v1 template ::type; + using DstCoordinate = typename TensorCoordinate::type; + __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin, DstCoordinate dst_block_slice_origin) { @@ -515,31 +516,25 @@ struct BlockwiseGenericTensorSliceCopy_v2 private: using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); - using ThreadwiseLoad = - ThreadwiseGenericTensorSliceCopy_v2r1, - SubLengths, - SrcDimAccessOrder, - SrcDimAccessOrder, - SrcVectorAccessDim, - SrcVectorAccessDim, - SrcDataPerAccess, - 1>; + using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1; - using ThreadwiseStore = - ThreadwiseGenericTensorSliceCopy_v2r1, - DstCoordinate, - SubLengths, - DstDimAccessOrder, - DstDimAccessOrder, - DstVectorAccessDim, - DstVectorAccessDim, - 1, - DstDataPerAccess>; + using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1; ThreadwiseLoad mThreadwiseLoad; ThreadwiseStore mThreadwiseStore; diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp index 55e7fb8f69..8ecdfc76ba 100644 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp @@ -165,7 +165,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 #endif } - __device__ static constexpr index_t GetRegisterClipboardSize() + __device__ static constexpr index_t GetRegisterBufferSize() { constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; @@ -183,8 +183,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 return thread_tensor_desc.GetElementSpace(); } - __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, - Float* __restrict__ p_clipboard) const + __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src, + Float* __restrict__ p_clipboard) const { constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; @@ -219,8 +219,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 }); } - __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, - Float* __restrict__ p_dst) const + __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard, + Float* __restrict__ p_dst) const { constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; @@ -274,10 +274,10 @@ struct BlockwiseTensorSliceReorderCopy_v3 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - Float p_clipboard[GetRegisterClipboardSize()]; + Float p_clipboard[GetRegisterBufferSize()]; - RunLoadRegisterClipboard(p_src, p_clipboard); - RunStoreRegisterClipboard(p_clipboard, p_dst); + RunLoadRegisterBuffer(p_src, p_clipboard); + RunStoreRegisterBuffer(p_clipboard, p_dst); } // this function doesn't do santiy check on whether the slicing window is out of the boundary diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 859d5fc164..9b5c138abf 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -14,10 +14,6 @@ #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #endif -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0 -#endif - #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 #endif @@ -430,170 +426,6 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 Array mDstSliceOrigin; }; -template -struct ThreadwiseGenericTensorSliceCopy_v2 -{ - static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2(SrcCoordinate src_slice_origin, - DstCoordinate dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2() - : ThreadwiseGenericTensorSliceCopy_v2(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - struct IsolateMergedDimSliceLengthsHack - { - template - __device__ constexpr index_t operator()(IDim idim) const - { - return TDesc::ContainMultipleOriginalDimensions(idim) ? Seq{}[idim] : 1; - } - }; - - template - __device__ void Run(const TData* p_src, TData* p_dst) const - { - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); - - TData p_buffer_[buffer_desc.GetElementSpace()]; - TData* p_buffer = p_buffer_; - - // hacks to isolate merged dimension from normal dimensions, and calculate their offset - // seperately - // SrcMergedDimSliceLengthsHack has entry same as SliceLengths on src merged dimensions, - // but 1 on normal dimensions; - // SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions, - // but 1 on merged dimensions; - using SrcMergedDimSliceLengthsHack = - typename sequence_gen>::type; - - using SrcNormalDimSliceLengthsHack = - decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{}); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 - static_ford{}([&](auto merged_dim_data_id_) { - constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){}; - - const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset(); - - static_ford{}([&](auto normal_dim_data_id_) { - constexpr auto normal_dim_data_id = decltype(normal_dim_data_id_){}; - - constexpr index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id); - - constexpr index_t src_normal_offset = - SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id); - - p_buffer[buffer_offset] = p_src_tmp[src_normal_offset]; - }); - }); -#else - ford{}([&](auto merged_dim_data_id) { - const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset(); - - ford{}([&](auto normal_dim_data_id) { - const index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id); - - const index_t src_normal_offset = - SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id); - - p_buffer[buffer_offset] = p_src_tmp[src_normal_offset]; - }); - }); -#endif - - // DstMergedDimSliceLengthsHack has entry same as SliceLengths on dst merged dimensions, - // but 1 on normal dimensions; - // DstNormalDimSliceLengthsHack has entry same as SliceLengths on dst normal dimensions, - // but 1 on merged dimensions; - using DstMergedDimSliceLengthsHack = - typename sequence_gen>::type; - - using DstNormalDimSliceLengthsHack = - decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{}); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 - static_ford{}([&](auto merged_dim_data_id_) { - constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){}; - - TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset(); - - static_ford{}([&](auto normal_dim_data_id_) { - constexpr auto normal_dim_data_id = decltype(normal_dim_data_id_){}; - - constexpr index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id); - - constexpr index_t dst_normal_offset = - DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id); - - p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset]; - }); - }); -#else - ford{}([&](auto merged_dim_data_id) { - TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset(); - - ford{}([&](auto normal_dim_data_id) { - const index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id); - - const index_t dst_normal_offset = - DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id); - - p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset]; - }); - }); -#endif - } - - // T can be Sequence or Array - template - __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant) - { - static_if{}([&](auto) { - mSrcSliceOrigin += step_sizes; - }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); - } - - template - __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant) - { - static_if{}([&](auto) { - mDstSliceOrigin += step_sizes; - }).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); - } - - private: - SrcCoordinate mSrcSliceOrigin; - DstCoordinate mDstSliceOrigin; -}; - // This threadwise copy allow vector access of src and dst. // It allows the dimensions of vector access to be different on src and dst. // It also allows the vector size to be different on src and dst. @@ -605,8 +437,6 @@ struct ThreadwiseGenericTensorSliceCopy_v2 // used for the buffer. template ::type; + using DstCoordinate = typename TensorCoordinate::type; + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin, DstCoordinate dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 0f5305d196..11908b0816 100644 --- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -107,11 +107,11 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadB = 4; using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 4>; - constexpr index_t InBlockCopyDataPerRead_N = 4; + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; + constexpr index_t OutThreadCopyDataPerAccess_N = 2; #elif 0 // for 3x3, 34x34, v1r2, Pascal, in-block-copy1 constexpr index_t BlockSize = 128; @@ -137,12 +137,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - using InBlockCopyClusterLengths_CHWN = Sequence<0, 0, 0, 0>; // not used - constexpr index_t InBlockCopyDataPerRead_N = 4; + using InBlockCopyClusterLengths_CHWN = Sequence<0, 0, 0, 0>; // not used + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; + constexpr index_t OutThreadCopyDataPerAccess_N = 2; #elif 1 // for 3x3, 34x34, v1r3, Pascal // for 3x3, 28x28, v1r3, Pascal @@ -170,12 +170,15 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 2, 4>; - constexpr index_t InBlockCopyDataPerRead_N = 4; + using InBlockCopySubLengths_CHWN = Sequence<1, 1, 1, 4>; + using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 2, 4>; + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; + using WeiBlockCopySubLengths_CK = Sequence<2, 4>; + using WeiBlockCopyClusterLengths_CK = Sequence<4, 32>; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; + constexpr index_t OutThreadCopyDataPerAccess_N = 2; #elif 0 // for 3x3, 34x34, v1r3, Pascal, bad constexpr index_t BlockSize = 128; @@ -201,12 +204,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - using InBlockCopyClusterLengths_CHWN = Sequence<2, 2, 32, 1>; - constexpr index_t InBlockCopyDataPerRead_N = 1; + using InBlockCopyClusterLengths_CHWN = Sequence<2, 2, 32, 1>; + constexpr index_t InBlockCopyDataPerAccess_N = 1; - constexpr index_t WeiBlockCopyDataPerRead_K = 2; + constexpr index_t WeiBlockCopyDataPerAccess_K = 2; - constexpr index_t OutThreadCopyDataPerWrite_N = 1; + constexpr index_t OutThreadCopyDataPerAccess_N = 1; #elif 0 // for 3x3, 34x34, v1r1, Vega 20 constexpr index_t BlockSize = 256; @@ -232,12 +235,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 8>; - constexpr index_t InBlockCopyDataPerRead_N = 2; + using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 8>; + constexpr index_t InBlockCopyDataPerAccess_N = 2; - constexpr index_t WeiBlockCopyDataPerRead_K = 2; + constexpr index_t WeiBlockCopyDataPerAccess_K = 2; - constexpr index_t OutThreadCopyDataPerWrite_N = 4; + constexpr index_t OutThreadCopyDataPerAccess_N = 4; #elif 1 // for 3x3, 34x34, v1r3, Vega 20 constexpr index_t BlockSize = 256; @@ -263,12 +266,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 4, 4>; - constexpr index_t InBlockCopyDataPerRead_N = 4; + using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 4, 4>; + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 4; + constexpr index_t OutThreadCopyDataPerAccess_N = 4; #elif 0 // for 3x3, 56x56, v1r1, Pascal constexpr index_t NPerBlock = 32; @@ -282,13 +285,13 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t HoPerThread = 1; constexpr index_t WoPerThread = 2; - constexpr index_t InBlockCopy_ThreadPerDimC = 1; - constexpr index_t InBlockCopy_ThreadPerDimH = 4; - constexpr index_t InBlockCopy_ThreadPerDimW = 4; - constexpr index_t InBlockCopy_ThreadPerDimN = 8; - constexpr index_t InBlockCopyDataPerRead_N = 4; + constexpr index_t InBlockCopy_ThreadPerDimC = 1; + constexpr index_t InBlockCopy_ThreadPerDimH = 4; + constexpr index_t InBlockCopy_ThreadPerDimW = 4; + constexpr index_t InBlockCopy_ThreadPerDimN = 8; + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; @@ -298,7 +301,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmNLevel1Cluster = 4; constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; + constexpr index_t OutThreadCopyDataPerAccess_N = 2; constexpr index_t BlockSize = 128; #elif 0 @@ -324,14 +327,14 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 1; constexpr index_t GemmDataPerReadB = 1; - constexpr index_t InBlockCopy_ThreadPerDimC = 1; - constexpr index_t InBlockCopy_ThreadPerDimH = 2; - constexpr index_t InBlockCopy_ThreadPerDimW = 4; - constexpr index_t InBlockCopy_ThreadPerDimN = 4; - constexpr index_t InBlockCopyDataPerRead_N = 4; + constexpr index_t InBlockCopy_ThreadPerDimC = 1; + constexpr index_t InBlockCopy_ThreadPerDimH = 2; + constexpr index_t InBlockCopy_ThreadPerDimW = 4; + constexpr index_t InBlockCopy_ThreadPerDimN = 4; + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 4; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; + constexpr index_t OutThreadCopyDataPerAccess_N = 4; constexpr index_t BlockSize = 128; #elif 0 @@ -347,13 +350,13 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t HoPerThread = 1; constexpr index_t WoPerThread = 2; - constexpr index_t InBlockCopy_ThreadPerDimC = 1; - constexpr index_t InBlockCopy_ThreadPerDimH = 4; - constexpr index_t InBlockCopy_ThreadPerDimW = 4; - constexpr index_t InBlockCopy_ThreadPerDimN = 8; - constexpr index_t InBlockCopyDataPerRead_N = 4; + constexpr index_t InBlockCopy_ThreadPerDimC = 1; + constexpr index_t InBlockCopy_ThreadPerDimH = 4; + constexpr index_t InBlockCopy_ThreadPerDimW = 4; + constexpr index_t InBlockCopy_ThreadPerDimN = 8; + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; @@ -365,7 +368,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; + constexpr index_t OutThreadCopyDataPerAccess_N = 2; constexpr index_t BlockSize = 128; #elif 0 @@ -393,12 +396,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - using InBlockCopyClusterLengths_CHWN = Sequence<4, 2, 4, 4>; - constexpr index_t InBlockCopyDataPerRead_N = 4; + using InBlockCopyClusterLengths_CHWN = Sequence<4, 2, 4, 4>; + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; + constexpr index_t OutThreadCopyDataPerAccess_N = 2; #elif 0 // for 1x1, 28x28, v1r1, Pascal constexpr index_t NPerBlock = 16; @@ -413,13 +416,13 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t HoPerThread = 1; constexpr index_t WoPerThread = 1; - constexpr index_t InBlockCopy_ThreadPerDimC = 8; - constexpr index_t InBlockCopy_ThreadPerDimH = 2; - constexpr index_t InBlockCopy_ThreadPerDimW = 2; - constexpr index_t InBlockCopy_ThreadPerDimN = 4; - constexpr index_t InBlockCopyDataPerRead_N = 4; + constexpr index_t InBlockCopy_ThreadPerDimC = 8; + constexpr index_t InBlockCopy_ThreadPerDimH = 2; + constexpr index_t InBlockCopy_ThreadPerDimW = 2; + constexpr index_t InBlockCopy_ThreadPerDimN = 4; + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; @@ -429,7 +432,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmNLevel1Cluster = 4; constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; + constexpr index_t OutThreadCopyDataPerAccess_N = 2; constexpr index_t BlockSize = 128; #elif 0 @@ -453,14 +456,14 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmNLevel1Cluster = 2; constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t InBlockCopy_ThreadPerDimC = 8; - constexpr index_t InBlockCopy_ThreadPerDimH = 2; - constexpr index_t InBlockCopy_ThreadPerDimW = 2; - constexpr index_t InBlockCopy_ThreadPerDimN = 4; - constexpr index_t InBlockCopyDataPerRead_N = 4; + constexpr index_t InBlockCopy_ThreadPerDimC = 8; + constexpr index_t InBlockCopy_ThreadPerDimH = 2; + constexpr index_t InBlockCopy_ThreadPerDimW = 2; + constexpr index_t InBlockCopy_ThreadPerDimN = 4; + constexpr index_t InBlockCopyDataPerAccess_N = 4; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; + constexpr index_t OutThreadCopyDataPerAccess_N = 2; constexpr index_t BlockSize = 128; #endif @@ -478,9 +481,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn #elif 0 GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn -#elif 1 - GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn #elif 0 + GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn +#elif 1 GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer #endif {}; + InBlockCopyDataPerAccess_N, + WeiBlockCopySubLengths_CK, + WeiBlockCopyClusterLengths_CK, + WeiBlockCopyDataPerAccess_K, + OutThreadCopyDataPerAccess_N>{}; float time = launch_kernel(run_gridwise_convolution_kernel, dim3(GridSize), diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 64892c74b2..7ea05e243e 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -9,7 +9,7 @@ #include "conv_common.hpp" #include "host_conv.hpp" #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp" -//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" +#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; @@ -367,7 +367,7 @@ int main(int argc, char* argv[]) #if 0 device_convolution_direct_v2_nchw_kcyx_nkhw (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v1_chwn_cyxk_khwn( in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); #elif 0 @@ -379,7 +379,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -409,7 +409,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc,