Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-17 03:19:48 +00:00)
Add another version of the blockwise 2D tensor copy
[ROCm/composable_kernel commit: 1f3870ca19]
This commit is contained in:
@@ -376,7 +376,7 @@ int main()
|
||||
constexpr unsigned K = 64;
|
||||
constexpr unsigned S = 3;
|
||||
constexpr unsigned R = 3;
|
||||
#elif 0
|
||||
#elif 1
|
||||
constexpr unsigned N = 64;
|
||||
constexpr unsigned C = 256;
|
||||
constexpr unsigned HI = 36;
|
||||
@@ -427,7 +427,7 @@ int main()
|
||||
#endif
|
||||
(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host);
|
||||
check_error(out_nkhw_host, out_nkhw_device);
|
||||
#elif 0
|
||||
|
||||
@@ -103,19 +103,6 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
|
||||
constexpr unsigned HoPerThread = 2;
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
constexpr unsigned NPerBlock = 2;
|
||||
constexpr unsigned KPerBlock = 32;
|
||||
constexpr unsigned CPerBlock = 4;
|
||||
constexpr unsigned HoPerBlock = 2;
|
||||
constexpr unsigned WoPerBlock = 32;
|
||||
|
||||
constexpr unsigned KPerThread = 4;
|
||||
constexpr unsigned CPerThread = 2;
|
||||
constexpr unsigned HoPerThread = 2;
|
||||
constexpr unsigned WoPerThread = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#endif
|
||||
|
||||
|
||||
@@ -75,10 +75,23 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
|
||||
constexpr unsigned KPerThread = 1;
|
||||
constexpr unsigned CPerThread = 1;
|
||||
|
||||
constexpr unsigned ThreadPerClusterRow = 1;
|
||||
constexpr unsigned ThreadPerClusterColumn = 4;
|
||||
constexpr unsigned GemmThreadPerClusterRow = 1;
|
||||
constexpr unsigned GemmThreadPerClusterColumn = 4;
|
||||
|
||||
constexpr unsigned BlockSize = 32;
|
||||
#elif 0
|
||||
constexpr unsigned BPerBlock = 128;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 2;
|
||||
|
||||
constexpr unsigned BPerThread = 8;
|
||||
constexpr unsigned KPerThread = 8;
|
||||
constexpr unsigned CPerThread = 1;
|
||||
|
||||
constexpr unsigned GemmThreadPerClusterRow = 4;
|
||||
constexpr unsigned GemmThreadPerClusterColumn = 4;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 1
|
||||
constexpr unsigned BPerBlock = 128;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
@@ -88,8 +101,11 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
|
||||
constexpr unsigned KPerThread = 8;
|
||||
constexpr unsigned CPerThread = 1;
|
||||
|
||||
constexpr unsigned ThreadPerClusterRow = 4;
|
||||
constexpr unsigned ThreadPerClusterColumn = 4;
|
||||
constexpr unsigned GemmThreadPerClusterRow = 4;
|
||||
constexpr unsigned GemmThreadPerClusterColumn = 4;
|
||||
|
||||
constexpr unsigned InBlockCopyThreadPerDim0 = 2;
|
||||
constexpr unsigned InBlockCopyThreadPerDim1 = 64;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#endif
|
||||
@@ -132,8 +148,10 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
|
||||
BPerThread,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
ThreadPerClusterRow,
|
||||
ThreadPerClusterColumn>
|
||||
GemmThreadPerClusterRow,
|
||||
GemmThreadPerClusterColumn,
|
||||
InBlockCopyThreadPerDim0,
|
||||
InBlockCopyThreadPerDim1>
|
||||
<<<grid_dim, block_dim>>>(in_cnhw_desc,
|
||||
static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()),
|
||||
wei_srck_desc,
|
||||
|
||||
Reference in New Issue
Block a user