Add another version of the blockwise 2D tensor copy

[ROCm/composable_kernel commit: 1f3870ca19]
This commit is contained in:
Chao Liu
2019-01-23 16:42:57 -06:00
parent a8ef877996
commit b30d00c0d8
7 changed files with 243 additions and 52 deletions

View File

@@ -376,7 +376,7 @@ int main()
constexpr unsigned K = 64;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0
#elif 1
constexpr unsigned N = 64;
constexpr unsigned C = 256;
constexpr unsigned HI = 36;
@@ -427,7 +427,7 @@ int main()
#endif
(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
#if 1
#if 0
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host);
check_error(out_nkhw_host, out_nkhw_device);
#elif 0

View File

@@ -103,19 +103,6 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 1;
constexpr unsigned BlockSize = 128;
#elif 0
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 128;
#endif

View File

@@ -75,10 +75,23 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr unsigned KPerThread = 1;
constexpr unsigned CPerThread = 1;
constexpr unsigned ThreadPerClusterRow = 1;
constexpr unsigned ThreadPerClusterColumn = 4;
constexpr unsigned GemmThreadPerClusterRow = 1;
constexpr unsigned GemmThreadPerClusterColumn = 4;
constexpr unsigned BlockSize = 32;
#elif 0
constexpr unsigned BPerBlock = 128;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2;
constexpr unsigned BPerThread = 8;
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 1;
constexpr unsigned GemmThreadPerClusterRow = 4;
constexpr unsigned GemmThreadPerClusterColumn = 4;
constexpr unsigned BlockSize = 128;
#elif 1
constexpr unsigned BPerBlock = 128;
constexpr unsigned KPerBlock = 64;
@@ -88,8 +101,11 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 1;
constexpr unsigned ThreadPerClusterRow = 4;
constexpr unsigned ThreadPerClusterColumn = 4;
constexpr unsigned GemmThreadPerClusterRow = 4;
constexpr unsigned GemmThreadPerClusterColumn = 4;
constexpr unsigned InBlockCopyThreadPerDim0 = 2;
constexpr unsigned InBlockCopyThreadPerDim1 = 64;
constexpr unsigned BlockSize = 128;
#endif
@@ -132,8 +148,10 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
BPerThread,
KPerThread,
CPerThread,
ThreadPerClusterRow,
ThreadPerClusterColumn>
GemmThreadPerClusterRow,
GemmThreadPerClusterColumn,
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>
<<<grid_dim, block_dim>>>(in_cnhw_desc,
static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()),
wei_srck_desc,