mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 08:50:17 +00:00
refactor
This commit is contained in:
@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr index_t BlockSize = 256;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// 1x1, 14x14, Pascal, enable lds_double_buffer, disable register double buffer
|
||||
constexpr index_t BPerBlock = 64;
|
||||
constexpr index_t KPerBlock = 128;
|
||||
@@ -216,9 +216,10 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
|
||||
constexpr index_t InBlockCopyDataPerRead = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerRead = 4;
|
||||
constexpr index_t OutThreadCopyDataPerWrite = 4;
|
||||
|
||||
constexpr index_t BlockSize = 128;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 1x1, 14x14, Vega 20, enable lds_double_buffer, disable register_double_buffer
|
||||
constexpr index_t BPerBlock = 128;
|
||||
constexpr index_t KPerBlock = 128;
|
||||
@@ -243,7 +244,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
|
||||
constexpr index_t InBlockCopyDataPerRead = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite = 4;
|
||||
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
@@ -169,20 +169,19 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn
|
||||
HoPerThread>{};
|
||||
|
||||
// LDS: be careful of alignment
|
||||
constexpr index_t in_block_element_size =
|
||||
in_chwn_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
|
||||
constexpr index_t max_align =
|
||||
mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
|
||||
|
||||
constexpr index_t wei_block_element_size =
|
||||
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
|
||||
constexpr index_t in_block_space =
|
||||
in_chwn_block_desc.GetElementSpace(Number<max_align>{});
|
||||
|
||||
constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
|
||||
? InBlockCopyDataPerRead
|
||||
: WeiBlockCopyDataPerRead;
|
||||
constexpr index_t wei_block_space =
|
||||
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
|
||||
|
||||
__shared__ Float
|
||||
p_in_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
|
||||
p_in_block[in_block_space];
|
||||
__shared__ Float
|
||||
p_wei_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];
|
||||
p_wei_block[wei_block_space];
|
||||
|
||||
// register
|
||||
Float p_out_thread[out_khwn_thread_desc.GetElementSpace()];
|
||||
|
||||
@@ -208,11 +208,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
|
||||
p_wei_register_clipboard);
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_double);
|
||||
#elif 1
|
||||
#else
|
||||
vmcnt(0);
|
||||
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
|
||||
p_in_block_double);
|
||||
@@ -266,11 +266,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
#elif 0
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
@@ -279,12 +279,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_next);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_next);
|
||||
#elif 1
|
||||
#else
|
||||
vmcnt(0);
|
||||
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
|
||||
p_in_block_next);
|
||||
@@ -315,11 +315,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
#elif 0
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
@@ -328,7 +328,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_double + in_block_space);
|
||||
|
||||
@@ -349,11 +349,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
#elif 0
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + wei_block_space +
|
||||
|
||||
Reference in New Issue
Block a user