mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
debugging
This commit is contained in:
@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr index_t BlockSize = 256;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
|
||||
constexpr index_t BPerBlock = 64;
|
||||
constexpr index_t KPerBlock = 128;
|
||||
@@ -208,9 +208,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmNLevel1Cluster = 4;
|
||||
constexpr index_t GemmKPerThreadLoop = 1;
|
||||
|
||||
constexpr index_t GemmThreadPerColumnPerCluster = 8;
|
||||
constexpr index_t GemmThreadPerRowPerCluster = 8;
|
||||
|
||||
constexpr index_t InBlockCopyThreadPerDim0 = 4;
|
||||
constexpr index_t InBlockCopyThreadPerDim1 = 16;
|
||||
|
||||
|
||||
@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
constexpr index_t HPad = 0;
|
||||
constexpr index_t WPad = 0;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 1x1 filter, 14x14 image, C = 2048
|
||||
constexpr index_t N = 128;
|
||||
constexpr index_t C = 2048;
|
||||
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
constexpr index_t HPad = 0;
|
||||
constexpr index_t WPad = 0;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// 1x1 filter, 14x14 image, C = 512
|
||||
constexpr index_t N = 128;
|
||||
constexpr index_t C = 512;
|
||||
|
||||
@@ -19,8 +19,6 @@ template <index_t GridSize,
|
||||
index_t CPerBlock,
|
||||
index_t BPerThread,
|
||||
index_t KPerThread,
|
||||
index_t GemmThreadPerColumnPerCluster,
|
||||
index_t GemmThreadPerRowPerCluster,
|
||||
index_t GemmMPerThreadSubC,
|
||||
index_t GemmNPerThreadSubC,
|
||||
index_t GemmMLevel0Cluster,
|
||||
@@ -95,34 +93,15 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
constexpr auto out_kb_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
|
||||
print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
|
||||
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
|
||||
|
||||
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
|
||||
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
|
||||
|
||||
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
|
||||
print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
|
||||
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
|
||||
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
|
||||
|
||||
printf("KPerBlock %u\n", KPerBlock);
|
||||
}
|
||||
#endif
|
||||
|
||||
// blockwise in copy
|
||||
// formmat is [CPerBlock,BPerBlock + BGhostRead]
|
||||
#if 0
|
||||
const auto blockwise_in_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(in_cb_global_desc),
|
||||
decltype(in_cb_block_desc),
|
||||
decltype(in_cb_block_desc.GetLengths())>{};
|
||||
const auto blockwise_in_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(in_cb_global_desc),
|
||||
decltype(in_cb_block_desc),
|
||||
decltype(in_cb_block_desc.GetLengths())>{};
|
||||
#elif 0
|
||||
const auto blockwise_in_copy =
|
||||
Blockwise2dTensorCopy2<BlockSize,
|
||||
@@ -145,12 +124,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
// blockwise wei copy
|
||||
// format is [CPerBlock*Y*X,KPerBlock]
|
||||
#if 0
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(wei_ek_global_desc),
|
||||
decltype(wei_ek_block_desc),
|
||||
decltype(wei_ek_block_desc.GetLengths())>{};
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(wei_ek_global_desc),
|
||||
decltype(wei_ek_block_desc),
|
||||
decltype(wei_ek_block_desc.GetLengths())>{};
|
||||
#elif 0
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy2<BlockSize,
|
||||
@@ -202,14 +181,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
constexpr index_t max_align =
|
||||
mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
|
||||
|
||||
constexpr index_t in_block_element_space =
|
||||
in_cb_block_desc.GetElementSpace(Number<max_align>{});
|
||||
constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
|
||||
|
||||
constexpr index_t wei_block_element_space =
|
||||
constexpr index_t wei_block_space =
|
||||
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
|
||||
|
||||
__shared__ Float p_in_block[in_block_element_space];
|
||||
__shared__ Float p_wei_block[wei_block_element_space];
|
||||
__shared__ Float p_in_block[in_block_space];
|
||||
__shared__ Float p_wei_block[wei_block_space];
|
||||
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
|
||||
@@ -229,7 +207,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
__syncthreads())
|
||||
{
|
||||
// load data
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
|
||||
#elif 0
|
||||
|
||||
@@ -67,6 +67,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
constexpr index_t B = N * Hi * Wi;
|
||||
constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
|
||||
|
||||
static_assert(C % (2 * CPerBlock) == 0, "C cannot be evenly divided");
|
||||
|
||||
// divide block work by 2d: [K, B]
|
||||
constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
|
||||
constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock;
|
||||
@@ -184,15 +186,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
constexpr index_t max_align =
|
||||
mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
|
||||
|
||||
constexpr index_t in_block_element_space =
|
||||
in_cb_block_desc.GetElementSpace(Number<max_align>{});
|
||||
constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
|
||||
|
||||
constexpr index_t wei_block_element_space =
|
||||
constexpr index_t wei_block_space =
|
||||
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
|
||||
|
||||
// LDS double buffer
|
||||
__shared__ Float p_in_block_double[2 * in_block_element_space];
|
||||
__shared__ Float p_wei_block_double[2 * wei_block_element_space];
|
||||
__shared__ Float p_in_block_double[2 * in_block_space];
|
||||
__shared__ Float p_wei_block_double[2 * wei_block_space];
|
||||
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
|
||||
@@ -202,10 +203,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
|
||||
// preload data into LDS
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double);
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double);
|
||||
#elif 1
|
||||
#elif 0
|
||||
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
@@ -237,22 +238,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
const bool even_loop = (iloop % 2 == 0);
|
||||
|
||||
Float* p_in_block_now =
|
||||
even_loop ? p_in_block_double : p_in_block_double + in_block_element_space;
|
||||
even_loop ? p_in_block_double : p_in_block_double + in_block_space;
|
||||
Float* p_wei_block_now =
|
||||
even_loop ? p_wei_block_double : p_wei_block_double + wei_block_element_space;
|
||||
even_loop ? p_wei_block_double : p_wei_block_double + wei_block_space;
|
||||
|
||||
Float* p_in_block_next =
|
||||
even_loop ? p_in_block_double + in_block_element_space : p_in_block_double;
|
||||
even_loop ? p_in_block_double + in_block_space : p_in_block_double;
|
||||
Float* p_wei_block_next =
|
||||
even_loop ? p_wei_block_double + wei_block_element_space : p_wei_block_double;
|
||||
|
||||
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
|
||||
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
|
||||
even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;
|
||||
|
||||
// load next data
|
||||
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
|
||||
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
|
||||
@@ -267,25 +268,25 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_now + y * Wi + x,
|
||||
p_out_thread);
|
||||
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_now + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_next);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_next);
|
||||
#elif 1
|
||||
#elif 0
|
||||
// if work with RunLoadRegisterClipboard_asm, need to wait
|
||||
vmcnt(0);
|
||||
|
||||
@@ -298,7 +299,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
}
|
||||
|
||||
// tail
|
||||
if(C % 2 == 0)
|
||||
{
|
||||
// even
|
||||
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
|
||||
@@ -319,34 +319,34 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + y * Wi + x,
|
||||
p_out_thread);
|
||||
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_double + in_block_element_space);
|
||||
p_in_block_double + in_block_space);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(
|
||||
p_wei_register_clipboard, p_wei_block_double + wei_block_element_space);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_double + wei_block_space);
|
||||
#else
|
||||
// if work with RunLoadRegisterClipboard_asm, need to wait
|
||||
vmcnt(0);
|
||||
|
||||
blockwise_in_copy.RunStoreRegisterClipboard_asm(
|
||||
p_in_register_clipboard, p_in_block_double + in_block_element_space);
|
||||
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
|
||||
p_in_block_double + in_block_space);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard_asm(
|
||||
p_wei_register_clipboard, p_wei_block_double + wei_block_element_space);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
|
||||
p_wei_block_double + wei_block_space);
|
||||
#endif
|
||||
|
||||
// odd
|
||||
@@ -356,25 +356,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + in_block_element_space +
|
||||
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + wei_block_element_space + y * Wi + x,
|
||||
p_out_thread);
|
||||
(p_wei_block_double + in_block_space +
|
||||
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + wei_block_space + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// not implemented
|
||||
assert(false);
|
||||
}
|
||||
|
||||
// output: register to global mem,
|
||||
const auto c_thread_mtx_begin =
|
||||
|
||||
@@ -14,29 +14,12 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
|
||||
|
||||
for(index_t i = 0; i < NRow; ++i)
|
||||
{
|
||||
// optimize for vector-4 load
|
||||
if(NCol % 4 == 0)
|
||||
for(index_t j = 0; j < NCol; ++j)
|
||||
{
|
||||
using vector_t = typename vector_type<Float, 4>::MemoryType;
|
||||
const index_t src_index = src_mtx.Get1dIndex(i, j);
|
||||
const index_t dst_index = dst_mtx.Get1dIndex(i, j);
|
||||
|
||||
for(index_t j = 0; j < NCol / 4; ++j)
|
||||
{
|
||||
const index_t src_index = src_mtx.Get1dIndex(i, 4 * j);
|
||||
const index_t dst_index = dst_mtx.Get1dIndex(i, 4 * j);
|
||||
|
||||
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
|
||||
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(index_t j = 0; j < NCol; ++j)
|
||||
{
|
||||
const index_t src_index = src_mtx.Get1dIndex(i, j);
|
||||
const index_t dst_index = dst_mtx.Get1dIndex(i, j);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
}
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user