mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-19 12:30:16 +00:00
device_implicit_gemm_convolution_1_chwn_csrk_khwn: use tensor copy (instead of pointwise) for writing output, 3x3 increased from 78% to 84%, 5x5 from 80% to 84%
[ROCm/composable_kernel commit: a65ef90308]
This commit is contained in:
@@ -75,7 +75,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
|
||||
|
||||
#if 1
|
||||
// for 3x3, 34x34, try
|
||||
// for 3x3, 34x34
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 4;
|
||||
@@ -106,9 +106,46 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
constexpr unsigned GemmNLevel1Cluster = 4;
|
||||
constexpr unsigned GemmKPerThreadLoop = 1;
|
||||
|
||||
constexpr unsigned OutThreadCopyDataPerWrite = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
// for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
|
||||
// for 5x5, 36x36
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 2;
|
||||
constexpr unsigned HoPerBlock = 2;
|
||||
constexpr unsigned WoPerBlock = 4;
|
||||
|
||||
constexpr unsigned NPerThread = 8;
|
||||
constexpr unsigned KPerThread = 8;
|
||||
constexpr unsigned HoPerThread = 1;
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
|
||||
constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
|
||||
|
||||
constexpr unsigned InBlockCopy_ThreadPerDimC = 2;
|
||||
constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
|
||||
constexpr unsigned InBlockCopy_ThreadPerDimW = 4;
|
||||
constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
|
||||
constexpr unsigned InBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 2;
|
||||
|
||||
constexpr unsigned GemmMPerThreadSubC = 4;
|
||||
constexpr unsigned GemmNPerThreadSubC = 4;
|
||||
constexpr unsigned GemmMLevel0Cluster = 4;
|
||||
constexpr unsigned GemmNLevel0Cluster = 2;
|
||||
constexpr unsigned GemmMLevel1Cluster = 2;
|
||||
constexpr unsigned GemmNLevel1Cluster = 4;
|
||||
constexpr unsigned GemmKPerThreadLoop = 1;
|
||||
|
||||
constexpr unsigned OutThreadCopyDataPerWrite = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
// 3x3 58x58, NKC = 64, 64, 256
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 4;
|
||||
@@ -142,27 +179,6 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
constexpr unsigned HoPerThread = 1;
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
// for 5x5, 36x36
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 2;
|
||||
constexpr unsigned HoPerBlock = 2;
|
||||
constexpr unsigned WoPerBlock = 4;
|
||||
|
||||
constexpr unsigned NPerThread = 4;
|
||||
constexpr unsigned KPerThread = 16;
|
||||
constexpr unsigned CPerThread = 1;
|
||||
constexpr unsigned HoPerThread = 1;
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
|
||||
constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
|
||||
|
||||
constexpr unsigned InBlockCopyDataPerRead = 4; // not used, yet
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
// for 7x7, 38x38
|
||||
@@ -200,7 +216,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// for 1x1, 28x28
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 128;
|
||||
@@ -210,7 +226,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
|
||||
constexpr unsigned NPerThread = 4;
|
||||
constexpr unsigned KPerThread = 16;
|
||||
constexpr unsigned CPerThread = 2;
|
||||
constexpr unsigned CPerThread = 1;
|
||||
constexpr unsigned HoPerThread = 1;
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
@@ -225,6 +241,16 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr unsigned GemmMPerThreadSubC = 4;
|
||||
constexpr unsigned GemmNPerThreadSubC = 4;
|
||||
constexpr unsigned GemmMLevel0Cluster = 4;
|
||||
constexpr unsigned GemmNLevel0Cluster = 2;
|
||||
constexpr unsigned GemmMLevel1Cluster = 2;
|
||||
constexpr unsigned GemmNLevel1Cluster = 4;
|
||||
constexpr unsigned GemmKPerThreadLoop = 1;
|
||||
|
||||
constexpr unsigned OutThreadCopyDataPerWrite = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#endif
|
||||
|
||||
@@ -266,7 +292,8 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
GemmNLevel0Cluster,
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop>,
|
||||
GemmKPerThreadLoop,
|
||||
OutThreadCopyDataPerWrite>,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
|
||||
|
||||
@@ -571,16 +571,21 @@ int main()
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
bool do_verification = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
#if 0
|
||||
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
#elif 1
|
||||
in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
#elif 1
|
||||
in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
|
||||
wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
|
||||
wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
#endif
|
||||
}
|
||||
|
||||
unsigned nrepeat = 200;
|
||||
|
||||
@@ -614,22 +619,23 @@ int main()
|
||||
nrepeat);
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
if(S == 3 && R == 3)
|
||||
if(do_verification)
|
||||
{
|
||||
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
|
||||
}
|
||||
else
|
||||
{
|
||||
host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
|
||||
}
|
||||
check_error(out_nkhw_host, out_nkhw_device);
|
||||
#endif
|
||||
if(S == 3 && R == 3)
|
||||
{
|
||||
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
|
||||
}
|
||||
else
|
||||
{
|
||||
host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
|
||||
}
|
||||
check_error(out_nkhw_host, out_nkhw_device);
|
||||
|
||||
#if 0
|
||||
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
|
||||
LogRange(std::cout << "wei_kcsr: ", wei_kcsr.mData, ",") << std::endl;
|
||||
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
|
||||
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
|
||||
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
|
||||
LogRange(std::cout << "wei_kcsr: ", wei_kcsr.mData, ",") << std::endl;
|
||||
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
|
||||
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user