From 23e9f358bb3e358d40d193dffce9d9b5f957b3f7 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Tue, 5 Apr 2022 22:16:59 +0200 Subject: [PATCH] Common forward convolution utility refactor. (#141) * Convolution ND * Code unification across dimensions for generating tensor descriptors. * Example * Instances * Move convnd f32 instance file to comply with repo structure. * Conv 1D tensor layouts. * Formatting and use ReferenceConv * Reference ConvFwd supporting 1D and 2D convolution. * Debug printing TensorLayout name. * Conv fwd 1D instance f32 * Refactor conv ND example. Needed to support various conv dimensio. Needed to support various conv dimensions * Rename conv nd example director to prevent conflicts. * Refactor some common utility to single file. Plus some tests. * Refactor GetHostTensorDescriptor + UT. * Add 1D test case. * Test reference convolution 1d/2d * Remove some leftovers. * Fix convolution example error for 1D * Refactor test check errors utility function. * Test Conv2D Fwd XDL * More UT for 1D case. * Parameterize input & weight initializers. * Rename example to prevent conflicts. * Split convnd instance into separate files for 1d/2d * Address review comments. * Fix data type for flops/gbytes calculations. * Assign example number 11. * 3D cases for convolution utility functions. * 3D reference convolution. * Add support for 3D convolution. * Check for inputs bigger than 2GB. * Formatting * Support for bf16/f16/f32/i8 - conv instances + UT. * Use check_err from test_util.hpp. * Split convnd test into separate files for each dim. * Fix data generation and use proper instances. * Formatting * Skip tensor initialization if not necessary. * Fix CMakefiles. * Remove redundant conv2d_fwd test. * Lower problem size for conv3D UT. * 3D case for convnd example. * Remove leftovers after merge. * Add Conv Specialization string to GetTypeString * Skip instance causing numerical errors. * Small fixes. 
* Remove redundant includes. * Fix namespace name error. * Script for automatic testing and logging convolution fwd UTs * Comment out numactl cmd. * Refine weights initalization and relax rtol for fp16 * Move test_util.hpp to check_err.hpp * Refine weights initalization and relax rtol for fp16 * Refactor common part of test conv utils. * Move utility function to single common place. * Add additional common functions to utility. * Refactor convnd_fwd_xdl examples. * Remove redundant files. * Unify structure. * Add constructor to ConvParams. * And add input parameters validation. * Modify conv examples to use single utility file. * Remove check_error from host_tensor.hpp * Get rid of check_indices function. * Remove bf16_to_f32 function overload for scalars. * Fix namespace. * Add half_float::half for check_err. * Fix conv params size in UT. * Fix weights initialization for int8. * Fix weights initialization for int8. * Add type_convert when store output in ref conv 1D. * Get back old conv2d_fwd_xdl operation. * Silence conv debug print. * format * clean * clean * Fix merge. * Fix namespace for check_err * Formatting. * Fix merge artifacts. * Remove deleted header. * Fix some includes and use ck::utils::check_err. * Remove unused check_indices restored by previous merge. * Fix namespaces after merge. * Fix compilation error. * Small fixes. * Use common functions. * Fix filename * Fix namespaces. * Fix merge artifact - retrieve removed by accident fun. * Fix ConvForwardSpecialization. * Adhere to coding style rules. * Fix merge artifacts. 
Co-authored-by: Adam Osewski Co-authored-by: Chao Liu [ROCm/composable_kernel commit: abf4bdb9a9946c578d4801a79650e79938fb0e41] --- example/01_gemm/gemm_xdl_bf16.cpp | 4 +- example/01_gemm/gemm_xdl_fp16.cpp | 4 +- example/01_gemm/gemm_xdl_int8.cpp | 4 +- .../gemm_xdl_alpha_beta.cpp | 4 +- .../03_gemm_bias_relu/gemm_xdl_bias_relu.cpp | 4 +- .../gemm_xdl_bias_relu_add.cpp | 4 +- example/05_conv2d_fwd/CMakeLists.txt | 2 - example/05_conv2d_fwd/README.md | 24 - example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp | 274 --------- example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp | 275 --------- .../conv2d_fwd_xdl_bias_relu.cpp | 324 +++++----- .../conv2d_fwd_xdl_bias_relu_add.cpp | 342 ++++++----- example/08_conv3d_fwd/CMakeLists.txt | 1 - example/08_conv3d_fwd/README.md | 24 - example/08_conv3d_fwd/conv3d_fwd_xdl.cpp | 281 --------- example/09_convnd_fwd/CMakeLists.txt | 2 + example/09_convnd_fwd/convnd_fwd_xdl.cpp | 117 +--- example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp | 341 +++++++++++ example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp | 343 +++++++++++ .../conv2d_bwd_data_xdl.cpp | 4 +- .../conv2d_bwd_weight_xdl.cpp | 4 +- example/12_reduce/reduce_blockwise.cpp | 7 +- example/13_pool2d_fwd/pool2d_fwd.cpp | 7 +- .../gemm_xdl_requant_relu_requant_int8.cpp | 4 +- .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp | 5 +- .../convnd_bwd_data_xdl.cpp | 98 +--- example/CMakeLists.txt | 3 +- .../gpu/device/conv_utils.hpp | 242 -------- .../gpu/device/convolution_utility.hpp | 73 --- ...ice_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp | 40 +- .../ck/library/host_tensor/host_tensor.hpp | 27 - .../include/ck/library/utility/check_err.hpp | 75 +-- .../ck/library/utility/conv_fwd_util.hpp | 554 ++++++++++++++++++ library/src/host_tensor/host_tensor.cpp | 13 +- .../conv_add_fwd_driver_offline_nchwc.cpp | 4 +- .../conv_bwd_driver_offline.cpp | 4 +- .../conv_fwd_driver_offline.cpp | 4 +- .../conv_fwd_driver_offline_nchwc.cpp | 4 +- .../conv_maxpool_fwd_driver_offline_nchwc.cpp | 6 +- 
.../conv_wrw_driver_offline.cpp | 4 +- .../gemm_driver_offline.cpp | 4 +- profiler/CMakeLists.txt | 1 + .../include/profile_batched_gemm_impl.hpp | 2 +- .../include/profile_conv_bwd_data_impl.hpp | 5 +- .../profile_conv_fwd_bias_relu_add_impl.hpp | 5 +- ...ile_conv_fwd_bias_relu_atomic_add_impl.hpp | 4 +- .../profile_conv_fwd_bias_relu_impl.hpp | 4 +- profiler/include/profile_conv_fwd_impl.hpp | 5 +- .../include/profile_convnd_bwd_data_impl.hpp | 27 +- .../include/profile_gemm_bias_2d_impl.hpp | 4 +- .../profile_gemm_bias_relu_add_impl.hpp | 4 +- .../include/profile_gemm_bias_relu_impl.hpp | 4 +- profiler/include/profile_gemm_impl.hpp | 6 +- .../include/profile_grouped_gemm_impl.hpp | 4 +- profiler/include/profile_reduce_impl.hpp | 12 +- profiler/src/profile_convnd_bwd_data.cpp | 6 +- test/CMakeLists.txt | 1 + test/batched_gemm/batched_gemm_fp16.cpp | 4 +- test/conv2d_bwd_weight/conv2d_bwd_weight.cpp | 24 +- test/conv_util/conv_util.cpp | 149 ++--- test/convnd_bwd_data/convnd_bwd_data.cpp | 2 +- test/convnd_fwd/conv1d_fwd.cpp | 83 +-- test/convnd_fwd/conv2d_fwd.cpp | 73 +-- test/convnd_fwd/conv3d_fwd.cpp | 146 ++--- test/convnd_fwd/conv_util.hpp | 90 +++ test/gemm/gemm_bf16.cpp | 1 - test/gemm/gemm_fp32.cpp | 1 - test/gemm/gemm_int8.cpp | 1 - test/gemm/gemm_util.hpp | 14 +- test/grouped_gemm/grouped_gemm_fp16.cpp | 23 +- test/include/conv_test_util.hpp | 289 --------- .../magic_number_division.cpp | 33 +- test/reduce/reduce_no_index.cpp | 11 +- test/reduce/reduce_with_index.cpp | 23 +- .../reference_conv_fwd/reference_conv_fwd.cpp | 175 +++--- 75 files changed, 2278 insertions(+), 2518 deletions(-) delete mode 100644 example/05_conv2d_fwd/CMakeLists.txt delete mode 100644 example/05_conv2d_fwd/README.md delete mode 100644 example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp delete mode 100644 example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp delete mode 100644 example/08_conv3d_fwd/CMakeLists.txt delete mode 100644 example/08_conv3d_fwd/README.md delete mode 100644 
example/08_conv3d_fwd/conv3d_fwd_xdl.cpp create mode 100644 example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp create mode 100644 example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp delete mode 100644 include/ck/tensor_operation/gpu/device/conv_utils.hpp delete mode 100644 include/ck/tensor_operation/gpu/device/convolution_utility.hpp rename test/include/test_util.hpp => library/include/ck/library/utility/check_err.hpp (69%) create mode 100644 library/include/ck/library/utility/conv_fwd_util.hpp create mode 100644 test/convnd_fwd/conv_util.hpp delete mode 100644 test/include/conv_test_util.hpp diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp index 9be781454b..8f0631c1ce 100644 --- a/example/01_gemm/gemm_xdl_bf16.cpp +++ b/example/01_gemm/gemm_xdl_bf16.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -227,7 +229,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_f32_result); + ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData); } return 0; diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp index 3427d046ea..2d5a95e400 100644 --- a/example/01_gemm/gemm_xdl_fp16.cpp +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -196,7 +198,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); } return 0; diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp index aaad1397f7..724757565e 100644 --- a/example/01_gemm/gemm_xdl_int8.cpp +++ b/example/01_gemm/gemm_xdl_int8.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include 
"check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -219,7 +221,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); } return 0; diff --git a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp index bd937cdc07..2abebbbac4 100644 --- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp +++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -244,6 +246,6 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); } } diff --git a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp index b4739ed47a..f3ed2bad37 100644 --- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp +++ b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -230,6 +232,6 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); } } diff --git a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp index 671cfd014f..9405c36881 100644 --- a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp +++ b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -248,6 +250,6 @@ 
int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); } } diff --git a/example/05_conv2d_fwd/CMakeLists.txt b/example/05_conv2d_fwd/CMakeLists.txt deleted file mode 100644 index 5f0e118fd6..0000000000 --- a/example/05_conv2d_fwd/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_example_executable(example_conv2d_fwd_xdl_fp16 conv2d_fwd_xdl_fp16.cpp) -add_example_executable(example_conv2d_fwd_xdl_int8 conv2d_fwd_xdl_int8.cpp) diff --git a/example/05_conv2d_fwd/README.md b/example/05_conv2d_fwd/README.md deleted file mode 100644 index 08a7f0d56c..0000000000 --- a/example/05_conv2d_fwd/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Instructions for ```example_conv2d_fwd_xdl``` - -## Run ```example_conv2d_fwd_xdl``` -```bash -#arg1: verification (0=no, 1=yes) -#arg2: initialization (0=no init, 1=integer value, 2=decimal value) -#arg3: run kernel # of times (>1) -#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx -./bin/example_conv2d_fwd_xdl 0 1 5 -``` - -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) -``` -in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} -wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} -out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} -arg.a_grid_desc_k0_m_k1_{216, 165888, 8} -arg.b_grid_desc_k0_n_k1_{216, 256, 8} -arg.c_grid_desc_m_n_{ 165888, 256} -launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 5 times... 
-Perf: 1.43206 ms, 102.486 TFlops, 232.947 GB/s -``` diff --git a/example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp b/example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp deleted file mode 100644 index c1f5c3b169..0000000000 --- a/example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp +++ /dev/null @@ -1,274 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "config.hpp" -#include "print.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "device_tensor.hpp" -#include "tensor_layout.hpp" -#include "element_wise_operation.hpp" -#include "device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" -#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" -#include "reference_conv_fwd.hpp" -#include "convolution_utility.hpp" - -using InDataType = ck::half_t; -using WeiDataType = ck::half_t; -using OutDataType = ck::half_t; -using AccDataType = float; - -template -using S = ck::Sequence; - -using InLayout = ck::tensor_layout::convolution::NHWC; -using WeiLayout = ck::tensor_layout::convolution::KYXC; -using OutLayout = ck::tensor_layout::convolution::NHWK; - -using InElementOp = ck::tensor_operation::element_wise::PassThrough; -using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; -using OutElementOp = ck::tensor_operation::element_wise::PassThrough; - -static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - -using DeviceConvFwdInstance = ck::tensor_operation::device:: - DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< - InDataType, // InDataType - WeiDataType, // WeiDataType - OutDataType, // OutDataType - AccDataType, // AccDataType - InElementOp, // InElementwiseOperation - WeiElementOp, // WeiElementwiseOperation - OutElementOp, // OutElementwiseOperation - ConvFwdDefault, // ConvForwardSpecialization - 256, // BlockSize - 128, // MPerBlock - 256, // NPerBlock - 4, // K0PerBlock - 8, // K1 - 32, // MPerXdl - 32, // NPerXdl - 
2, // MXdlPerWave - 4, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 8, // ABlockTransferSrcScalarPerVector - 8, // ABlockTransferDstScalarPerVector_K1 - true, // ABlockLdsAddExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 8, // BBlockTransferSrcScalarPerVector - 8, // BBlockTransferDstScalarPerVector_K1 - true, // BBlockLdsAddExtraN - 7, // CThreadTransferSrcDstVectorDim - 1>; // CThreadTransferDstScalarPerVector - -using ReferenceConvFwdInstance = ck::tensor_operation::host:: - ReferenceConvFwd; - -int main(int argc, char* argv[]) -{ - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; - - // Conv shape - ck::index_t N = 128; - ck::index_t K = 256; - ck::index_t C = 192; - ck::index_t Y = 3; - ck::index_t X = 3; - ck::index_t Hi = 71; - ck::index_t Wi = 71; - ck::index_t conv_stride_h = 2; - ck::index_t conv_stride_w = 2; - ck::index_t conv_dilation_h = 1; - ck::index_t conv_dilation_w = 1; - ck::index_t in_left_pad_h = 1; - ck::index_t in_left_pad_w = 1; - ck::index_t in_right_pad_h = 1; - ck::index_t in_right_pad_w = 1; - - if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - } - else if(argc == 19) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - - N = std::stoi(argv[4]); - K = std::stoi(argv[5]); - C = std::stoi(argv[6]); - Y = std::stoi(argv[7]); - X = std::stoi(argv[8]); - Hi = std::stoi(argv[9]); - Wi = std::stoi(argv[10]); - conv_stride_h = std::stoi(argv[11]); - conv_stride_w = std::stoi(argv[12]); - conv_dilation_h = std::stoi(argv[13]); - conv_dilation_w = 
std::stoi(argv[14]); - in_left_pad_h = std::stoi(argv[15]); - in_left_pad_w = std::stoi(argv[16]); - in_right_pad_h = std::stoi(argv[17]); - in_right_pad_w = std::stoi(argv[18]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); - printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " - "RightPx\n"); - exit(0); - } - - const std::vector conv_filter_strides{conv_stride_h, conv_stride_w}; - const std::vector conv_filter_dilations{conv_dilation_h, conv_dilation_w}; - const std::vector input_left_pads{in_left_pad_h, in_left_pad_w}; - const std::vector input_right_pads{in_right_pad_h, in_right_pad_w}; - const auto output_spatial_lengths = - ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi}, - {Y, X}, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads); - - const ck::index_t Ho = output_spatial_lengths[0]; - const ck::index_t Wo = output_spatial_lengths[1]; - - // tensor layout - auto f_host_tensor_descriptor = [](std::size_t N_, - std::size_t C_, - std::size_t H, - std::size_t W, - auto layout) { - if constexpr(ck::is_same::value || - ck::is_same::value || - ck::is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, H * W, W, 1})); - } - else if constexpr(ck::is_same::value || - ck::is_same::value || - ck::is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, 1, W * C_, C_})); - } - }; - - Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); - Tensor wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{})); - Tensor out_n_k_ho_wo_host_result( - f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); - Tensor out_n_k_ho_wo_device_result( - f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); - - std::cout << 
"in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; - std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; - std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: - in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - break; - default: - in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } - - DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * - out_n_k_ho_wo_device_result.mDesc.GetElementSpace()); - - in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_device_buf.ToDevice(wei_k_c_y_x.mData.data()); - - // do GEMM - auto conv = DeviceConvFwdInstance{}; - auto invoker = conv.MakeInvoker(); - auto argument = conv.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); - - if(!conv.IsSupportedArgument(argument)) - { - throw std::runtime_error( - "wrong! 
device_conv with the specified compilation parameters does " - "not support this Conv problem"); - } - - float ave_time = invoker.Run(argument, nrepeat); - - std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; - - std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) + - sizeof(WeiDataType) * (K * C * Y * X) + - sizeof(OutDataType) * (N * K * Ho * Wo); - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; - - if(do_verification) - { - auto ref_conv = ReferenceConvFwdInstance{}; - auto ref_invoker = ref_conv.MakeInvoker(); - - auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi, - wei_k_c_y_x, - out_n_k_ho_wo_host_result, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); - - ref_invoker.Run(ref_argument); - - out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - - check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result); - } -} diff --git a/example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp b/example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp deleted file mode 100644 index ea5e7a1fd9..0000000000 --- a/example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp +++ /dev/null @@ -1,275 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "config.hpp" -#include "print.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "device_tensor.hpp" -#include "tensor_layout.hpp" -#include "device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" -#include "element_wise_operation.hpp" -#include "reference_conv_fwd.hpp" -#include "convolution_utility.hpp" - -using InDataType = int8_t; -using WeiDataType = int8_t; -using OutDataType = int8_t; -using AccDataType = int32_t; - -template -using S = ck::Sequence; - -using InLayout = 
ck::tensor_layout::convolution::NHWC; -using WeiLayout = ck::tensor_layout::convolution::KYXC; -using OutLayout = ck::tensor_layout::convolution::NHWK; - -using InElementOp = ck::tensor_operation::element_wise::PassThrough; -using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; -using OutElementOp = ck::tensor_operation::element_wise::PassThrough; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - -using DeviceConvFwdInstance = ck::tensor_operation::device:: - DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< - int8_t, // InDataType - int8_t, // WeiDataType - int8_t, // OutDataType - int32_t, // AccDataType - PassThrough, // InElementwiseOperation - PassThrough, // WeiElementwiseOperation - PassThrough, // OutElementwiseOperation - ConvFwdDefault, // ConvForwardSpecialization - 256, // BlockSize - 128, // MPerBlock - 256, // NPerBlock - 4, // K0PerBlock - 16, // K1 - 32, // MPerXdl - 32, // NPerXdl - 2, // MXdlPerWave - 4, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 16, // ABlockTransferSrcScalarPerVector - 16, // ABlockTransferDstScalarPerVector_K1 - true, // ABlockLdsAddExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 16, // BBlockTransferSrcScalarPerVector - 16, // BBlockTransferDstScalarPerVector_K1 - true, // BBlockLdsAddExtraN - 7, // CThreadTransferSrcDstVectorDim - 1>; // CThreadTransferDstScalarPerVector - -using ReferenceConvFwdInstance = ck::tensor_operation::host:: - ReferenceConvFwd; - -int main(int argc, char* argv[]) -{ - bool do_verification = 0; - int 
init_method = 0; - int nrepeat = 5; - - // Conv shape - ck::index_t N = 128; - ck::index_t K = 256; - ck::index_t C = 192; - ck::index_t Y = 3; - ck::index_t X = 3; - ck::index_t Hi = 71; - ck::index_t Wi = 71; - ck::index_t conv_stride_h = 2; - ck::index_t conv_stride_w = 2; - ck::index_t conv_dilation_h = 1; - ck::index_t conv_dilation_w = 1; - ck::index_t in_left_pad_h = 1; - ck::index_t in_left_pad_w = 1; - ck::index_t in_right_pad_h = 1; - ck::index_t in_right_pad_w = 1; - - if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - } - else if(argc == 19) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - - N = std::stoi(argv[4]); - K = std::stoi(argv[5]); - C = std::stoi(argv[6]); - Y = std::stoi(argv[7]); - X = std::stoi(argv[8]); - Hi = std::stoi(argv[9]); - Wi = std::stoi(argv[10]); - conv_stride_h = std::stoi(argv[11]); - conv_stride_w = std::stoi(argv[12]); - conv_dilation_h = std::stoi(argv[13]); - conv_dilation_w = std::stoi(argv[14]); - in_left_pad_h = std::stoi(argv[15]); - in_left_pad_w = std::stoi(argv[16]); - in_right_pad_h = std::stoi(argv[17]); - in_right_pad_w = std::stoi(argv[18]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); - printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " - "RightPx\n"); - exit(0); - } - - const std::vector conv_filter_strides{conv_stride_h, conv_stride_w}; - const std::vector conv_filter_dilations{conv_dilation_h, conv_dilation_w}; - const std::vector input_left_pads{in_left_pad_h, in_left_pad_w}; - const std::vector input_right_pads{in_right_pad_h, in_right_pad_w}; - const auto output_spatial_lengths = - ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi}, - {Y, X}, - 
conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads); - - const ck::index_t Ho = output_spatial_lengths[0]; - const ck::index_t Wo = output_spatial_lengths[1]; - - // tensor layout - auto f_host_tensor_descriptor = [](std::size_t N_, - std::size_t C_, - std::size_t H, - std::size_t W, - auto layout) { - if constexpr(ck::is_same::value || - ck::is_same::value || - ck::is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, H * W, W, 1})); - } - else if constexpr(ck::is_same::value || - ck::is_same::value || - ck::is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, 1, W * C_, C_})); - } - }; - - Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); - Tensor wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{})); - Tensor out_n_k_ho_wo_host_result( - f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); - Tensor out_n_k_ho_wo_device_result( - f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); - - std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; - std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; - std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: - in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2{-1, 1}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2{-1, 1}); - break; - default: - in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3{0, 1}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3{-1, 1}); - } - - DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * - out_n_k_ho_wo_device_result.mDesc.GetElementSpace()); - - in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_device_buf.ToDevice(wei_k_c_y_x.mData.data()); 
- - // do GEMM - auto conv = DeviceConvFwdInstance{}; - auto invoker = conv.MakeInvoker(); - auto argument = conv.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); - - if(!conv.IsSupportedArgument(argument)) - { - throw std::runtime_error( - "wrong! device_conv with the specified compilation parameters does " - "not support this Conv problem"); - } - - float ave_time = invoker.Run(argument, nrepeat); - - std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; - - std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) + - sizeof(WeiDataType) * (K * C * Y * X) + - sizeof(OutDataType) * (N * K * Ho * Wo); - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; - - if(do_verification) - { - auto ref_conv = ReferenceConvFwdInstance{}; - auto ref_invoker = ref_conv.MakeInvoker(); - - auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi, - wei_k_c_y_x, - out_n_k_ho_wo_host_result, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); - - ref_invoker.Run(ref_argument); - - out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - - check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result); - } -} diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp index 0b3e15a25e..751ce16b90 100644 --- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp +++ 
b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp @@ -4,17 +4,20 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" -#include "print.hpp" +#include "conv_fwd_util.hpp" #include "device.hpp" +#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp" +#include "device_tensor.hpp" +#include "element_wise_operation.hpp" #include "host_tensor.hpp" #include "host_tensor_generator.hpp" -#include "device_tensor.hpp" -#include "tensor_layout.hpp" -#include "element_wise_operation.hpp" -#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp" #include "reference_conv_fwd_bias_activation.hpp" -#include "convolution_utility.hpp" +#include "tensor_layout.hpp" + +namespace { using InDataType = ck::half_t; using WeiDataType = ck::half_t; @@ -86,146 +89,157 @@ using ReferenceConvFwdInstance = WeiElementOp, OutElementOp>; -int main(int argc, char* argv[]) +void PrintUseMsg() { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "Following arguments:\n" + << " N, K, C, \n" + << " , (ie Y, X for 2D)\n" + << " , (ie Hi, Wi for 2D)\n" + << " , (ie Sy, Sx for 2D)\n" + << " , (ie Dy, Dx for 2D)\n" + << " , (ie LeftPy, LeftPx for 2D)\n" + << " , (ie RightPy, RightPx for 2D)\n" + << std::endl; +} - // Conv shape - ck::index_t N = 128; - ck::index_t K = 256; - ck::index_t C = 192; - ck::index_t Y = 3; - ck::index_t X = 3; - ck::index_t Hi = 71; - ck::index_t Wi = 71; - ck::index_t conv_stride_h = 2; - ck::index_t conv_stride_w = 2; - ck::index_t conv_dilation_h = 1; - ck::index_t conv_dilation_w = 1; - ck::index_t in_left_pad_h = 1; - ck::index_t in_left_pad_w = 1; - ck::index_t in_right_pad_h = 1; - ck::index_t in_right_pad_w = 1; - - if(argc == 4) +ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[]) +{ 
+ // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) + int num_dim_spatial = 2; + int conv_args = 3 + num_dim_spatial * 6; + int cmdline_nargs = conv_args + 4; + if(cmdline_nargs != argc) { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - } - else if(argc == 19) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - - N = std::stoi(argv[4]); - K = std::stoi(argv[5]); - C = std::stoi(argv[6]); - Y = std::stoi(argv[7]); - X = std::stoi(argv[8]); - Hi = std::stoi(argv[9]); - Wi = std::stoi(argv[10]); - conv_stride_h = std::stoi(argv[11]); - conv_stride_w = std::stoi(argv[12]); - conv_dilation_h = std::stoi(argv[13]); - conv_dilation_w = std::stoi(argv[14]); - in_left_pad_h = std::stoi(argv[15]); - in_left_pad_w = std::stoi(argv[16]); - in_right_pad_h = std::stoi(argv[17]); - in_right_pad_w = std::stoi(argv[18]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); - printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " - "RightPx\n"); + PrintUseMsg(); exit(0); } - const std::vector conv_filter_strides{conv_stride_h, conv_stride_w}; - const std::vector conv_filter_dilations{conv_dilation_h, conv_dilation_w}; - const std::vector input_left_pads{in_left_pad_h, in_left_pad_w}; - const std::vector input_right_pads{in_right_pad_h, in_right_pad_w}; - const auto output_spatial_lengths = - ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi}, - {Y, X}, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads); + ck::utils::conv::ConvParams params; + int arg_idx = 4; - const ck::index_t Ho = output_spatial_lengths[0]; - const ck::index_t Wo = output_spatial_lengths[1]; + params.num_dim_spatial = 
num_dim_spatial; + params.N = std::stoi(argv[arg_idx++]); + params.K = std::stoi(argv[arg_idx++]); + params.C = std::stoi(argv[arg_idx++]); - // tensor layout - auto f_host_tensor_descriptor = [](std::size_t N_, - std::size_t C_, - std::size_t H, - std::size_t W, - auto layout) { - if constexpr(ck::is_same::value || - ck::is_same::value || - ck::is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, H * W, W, 1})); - } - else if constexpr(ck::is_same::value || - ck::is_same::value || - ck::is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, 1, W * C_, C_})); - } - }; + params.filter_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.input_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_strides.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_dilations.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + } + params.input_left_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + } + params.input_right_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + } - Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); - Tensor wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{})); - Tensor out_n_k_ho_wo_host_result( - f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); - Tensor out_n_k_ho_wo_device_result( - f_host_tensor_descriptor(N, K, Ho, Wo, 
OutLayout{})); + return params; +} +} // anonymous namespace + +int main(int argc, char* argv[]) +{ + using namespace ck::utils::conv; + + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + const int num_dim_spatial = 2; + + ck::utils::conv::ConvParams params; + + if(argc >= 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + } + + if(argc >= 5) + { + params = ParseConvParams(argc, argv); + } + + std::vector input_dims{static_cast(params.N), + static_cast(params.C)}; + input_dims.insert(std::end(input_dims), + std::begin(params.input_spatial_lengths), + std::end(params.input_spatial_lengths)); + + std::vector filter_dims{static_cast(params.K), + static_cast(params.C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(params.filter_spatial_lengths), + std::end(params.filter_spatial_lengths)); + + const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); + std::vector output_dims{static_cast(params.N), + static_cast(params.K)}; + output_dims.insert(std::end(output_dims), + std::begin(output_spatial_lengths), + std::end(output_spatial_lengths)); + + Tensor input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); + Tensor weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial)); + Tensor host_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + Tensor device_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); // bias: assume contiguous 1d vector - Tensor bias_k( - HostTensorDescriptor(std::vector({static_cast(K)}))); + Tensor bias( + HostTensorDescriptor(std::vector({static_cast(params.K)}))); - std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; - std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; - std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl; - std::cout << "bias_k: " << bias_k.mDesc << std::endl; + std::cout << 
"input: " << input.mDesc << std::endl; + std::cout << "weights: " << weights.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + std::cout << "bias: " << bias.mDesc << std::endl; switch(init_method) { case 0: break; case 1: - in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - bias_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weights.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); break; default: - in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - bias_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weights.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + bias.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); } - DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * - out_n_k_ho_wo_device_result.mDesc.GetElementSpace()); - DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace()); + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); + DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace()); + DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace()); - in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_device_buf.ToDevice(wei_k_c_y_x.mData.data()); - bias_device_buf.ToDevice(bias_k.mData.data()); + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weights.mData.data()); + bias_device_buf.ToDevice(bias.mData.data()); auto conv = 
DeviceConvFwdInstance{}; auto invoker = conv.MakeInvoker(); @@ -234,16 +248,16 @@ int main(int argc, char* argv[]) static_cast(wei_device_buf.GetDeviceBuffer()), static_cast(out_device_buf.GetDeviceBuffer()), static_cast(bias_device_buf.GetDeviceBuffer()), - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, InElementOp{}, WeiElementOp{}, OutElementOp{}); @@ -257,16 +271,19 @@ int main(int argc, char* argv[]) float ave_time = invoker.Run(argument, nrepeat); - std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; - - std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) + - sizeof(WeiDataType) * (K * C * Y * X) + - sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K); - - float tflops = static_cast(flop) / 1.E9 / ave_time; + std::size_t flop = get_flops( + params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + std::size_t num_btype = + get_btype(params.N, + params.C, + params.K, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths) + + sizeof(OutDataType) * (params.K); + float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time; - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl; @@ -275,21 +292,20 @@ int main(int argc, char* argv[]) auto ref_conv = ReferenceConvFwdInstance{}; auto ref_invoker = ref_conv.MakeInvoker(); - auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi, - wei_k_c_y_x, - out_n_k_ho_wo_host_result, - bias_k, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, 
+ auto ref_argument = ref_conv.MakeArgument(input, + weights, + host_output, + bias, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, InElementOp{}, WeiElementOp{}, OutElementOp{}); ref_invoker.Run(ref_argument); - - out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - - check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result); + out_device_buf.FromDevice(device_output.mData.data()); + ck::utils::check_err( + host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); } } diff --git a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp index bcfde547b2..e6339fcd23 100644 --- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp +++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp @@ -4,17 +4,20 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" -#include "print.hpp" +#include "conv_fwd_util.hpp" #include "device.hpp" +#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp" +#include "device_tensor.hpp" +#include "element_wise_operation.hpp" #include "host_tensor.hpp" #include "host_tensor_generator.hpp" -#include "device_tensor.hpp" -#include "tensor_layout.hpp" -#include "element_wise_operation.hpp" -#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp" #include "reference_conv_fwd_bias_activation_add.hpp" -#include "convolution_utility.hpp" +#include "tensor_layout.hpp" + +namespace { using InDataType = ck::half_t; using WeiDataType = ck::half_t; @@ -83,154 +86,166 @@ using ReferenceConvFwdInstance = WeiElementOp, OutElementOp>; -int main(int argc, char* argv[]) +void PrintUseMsg() { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no 
init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "Following arguments:\n" + << " N, K, C, \n" + << " , (ie Y, X for 2D)\n" + << " , (ie Hi, Wi for 2D)\n" + << " , (ie Sy, Sx for 2D)\n" + << " , (ie Dy, Dx for 2D)\n" + << " , (ie LeftPy, LeftPx for 2D)\n" + << " , (ie RightPy, RightPx for 2D)\n" + << std::endl; +} - // Conv shape - ck::index_t N = 128; - ck::index_t K = 256; - ck::index_t C = 192; - ck::index_t Y = 3; - ck::index_t X = 3; - ck::index_t Hi = 71; - ck::index_t Wi = 71; - ck::index_t conv_stride_h = 2; - ck::index_t conv_stride_w = 2; - ck::index_t conv_dilation_h = 1; - ck::index_t conv_dilation_w = 1; - ck::index_t in_left_pad_h = 1; - ck::index_t in_left_pad_w = 1; - ck::index_t in_right_pad_h = 1; - ck::index_t in_right_pad_w = 1; - - if(argc == 4) +ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[]) +{ + // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) + int num_dim_spatial = 2; + int conv_args = 3 + num_dim_spatial * 6; + int cmdline_nargs = conv_args + 4; + if(cmdline_nargs != argc) { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - } - else if(argc == 19) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - - N = std::stoi(argv[4]); - K = std::stoi(argv[5]); - C = std::stoi(argv[6]); - Y = std::stoi(argv[7]); - X = std::stoi(argv[8]); - Hi = std::stoi(argv[9]); - Wi = std::stoi(argv[10]); - conv_stride_h = std::stoi(argv[11]); - conv_stride_w = std::stoi(argv[12]); - conv_dilation_h = std::stoi(argv[13]); - conv_dilation_w = std::stoi(argv[14]); - in_left_pad_h = std::stoi(argv[15]); - in_left_pad_w = std::stoi(argv[16]); - in_right_pad_h = std::stoi(argv[17]); - in_right_pad_w = std::stoi(argv[18]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 
2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); - printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " - "RightPx\n"); + PrintUseMsg(); exit(0); } - const std::vector conv_filter_strides{conv_stride_h, conv_stride_w}; - const std::vector conv_filter_dilations{conv_dilation_h, conv_dilation_w}; - const std::vector input_left_pads{in_left_pad_h, in_left_pad_w}; - const std::vector input_right_pads{in_right_pad_h, in_right_pad_w}; - const auto output_spatial_lengths = - ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi}, - {Y, X}, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads); + ck::utils::conv::ConvParams params; + int arg_idx = 4; - const ck::index_t Ho = output_spatial_lengths[0]; - const ck::index_t Wo = output_spatial_lengths[1]; + params.num_dim_spatial = num_dim_spatial; + params.N = std::stoi(argv[arg_idx++]); + params.K = std::stoi(argv[arg_idx++]); + params.C = std::stoi(argv[arg_idx++]); - // tensor layout - auto f_host_tensor_descriptor = [](std::size_t N_, - std::size_t C_, - std::size_t H, - std::size_t W, - auto layout) { - if constexpr(ck::is_same::value || - ck::is_same::value || - ck::is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, H * W, W, 1})); - } - else if constexpr(ck::is_same::value || - ck::is_same::value || - ck::is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, 1, W * C_, C_})); - } - }; + params.filter_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.input_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_strides.resize(num_dim_spatial); + for(int i = 0; i < 
num_dim_spatial; ++i) + { + params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_dilations.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + } + params.input_left_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + } + params.input_right_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + } - Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); - Tensor wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{})); - Tensor out_n_k_ho_wo_host_result( - f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); - Tensor out_n_k_ho_wo_device_result( - f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); + return params; +} + +} // anonymous namespace + +int main(int argc, char* argv[]) +{ + using namespace ck::utils::conv; + + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + const int num_dim_spatial = 2; + + ck::utils::conv::ConvParams params; + + if(argc >= 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + } + + if(argc >= 5) + { + params = ParseConvParams(argc, argv); + } + + std::vector input_dims{static_cast(params.N), + static_cast(params.C)}; + input_dims.insert(std::end(input_dims), + std::begin(params.input_spatial_lengths), + std::end(params.input_spatial_lengths)); + + std::vector filter_dims{static_cast(params.K), + static_cast(params.C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(params.filter_spatial_lengths), + std::end(params.filter_spatial_lengths)); + + const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); + std::vector output_dims{static_cast(params.N), + static_cast(params.K)}; + output_dims.insert(std::end(output_dims), 
+ std::begin(output_spatial_lengths), + std::end(output_spatial_lengths)); + + Tensor input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); + Tensor weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial)); + Tensor host_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + Tensor device_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); // bias: assume contiguous 1d vector - Tensor bias_k( - HostTensorDescriptor(std::vector({static_cast(K)}))); + Tensor bias( + HostTensorDescriptor(std::vector({static_cast(params.K)}))); // residual: assume same layout as output tensor - Tensor resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); + Tensor residual(get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); - std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; - std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; - std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl; - std::cout << "bias_k: " << bias_k.mDesc << std::endl; - std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.mDesc << std::endl; + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weights: " << weights.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + std::cout << "bias: " << bias.mDesc << std::endl; + std::cout << "residual: " << residual.mDesc << std::endl; switch(init_method) { case 0: break; case 1: - in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - bias_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weights.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + residual.GenerateTensorValue(GeneratorTensor_2{-5, 5}); break; default: - 
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - bias_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weights.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + bias.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + residual.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); } - DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * - out_n_k_ho_wo_device_result.mDesc.GetElementSpace()); - DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace()); - DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_k_ho_wo.mDesc.GetElementSpace()); + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); + DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace()); + DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace()); + DeviceMem resi_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpace()); - in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_device_buf.ToDevice(wei_k_c_y_x.mData.data()); - bias_device_buf.ToDevice(bias_k.mData.data()); - resi_device_buf.ToDevice(resi_n_k_ho_wo.mData.data()); + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weights.mData.data()); + bias_device_buf.ToDevice(bias.mData.data()); + resi_device_buf.ToDevice(residual.mData.data()); const auto in_element_op = InElementOp{}; const auto wei_element_op = WeiElementOp{}; @@ -244,16 +259,16 @@ int main(int argc, char* argv[]) static_cast(out_device_buf.GetDeviceBuffer()), 
static_cast(bias_device_buf.GetDeviceBuffer()), static_cast(resi_device_buf.GetDeviceBuffer()), - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, in_element_op, wei_element_op, out_element_op); @@ -267,17 +282,21 @@ int main(int argc, char* argv[]) float ave_time = invoker.Run(argument, nrepeat); - std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; - - std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) + - sizeof(WeiDataType) * (K * C * Y * X) + - sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K) + - sizeof(OutDataType) * (N * K * Ho * Wo); - - float tflops = static_cast(flop) / 1.E9 / ave_time; + std::size_t flop = get_flops( + params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + std::size_t num_btype = + get_btype(params.N, + params.C, + params.K, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths) + + sizeof(OutDataType) * (params.K) + + sizeof(OutDataType) * + (params.N * params.K * output_spatial_lengths[0] * output_spatial_lengths[1]); + float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time; - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl; @@ -286,23 +305,22 @@ int main(int argc, char* argv[]) auto ref_conv = ReferenceConvFwdInstance{}; auto ref_invoker = ref_conv.MakeInvoker(); - auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi, - wei_k_c_y_x, - out_n_k_ho_wo_host_result, - bias_k, - resi_n_k_ho_wo, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - 
input_right_pads, + auto ref_argument = ref_conv.MakeArgument(input, + weights, + host_output, + bias, + residual, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, in_element_op, wei_element_op, out_element_op); ref_invoker.Run(ref_argument); - - out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - - check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result); + out_device_buf.FromDevice(device_output.mData.data()); + ck::utils::check_err( + host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); } } diff --git a/example/08_conv3d_fwd/CMakeLists.txt b/example/08_conv3d_fwd/CMakeLists.txt deleted file mode 100644 index 49fb1fe1ce..0000000000 --- a/example/08_conv3d_fwd/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_example_executable(example_conv3d_fwd_xdl conv3d_fwd_xdl.cpp) diff --git a/example/08_conv3d_fwd/README.md b/example/08_conv3d_fwd/README.md deleted file mode 100644 index 962c603871..0000000000 --- a/example/08_conv3d_fwd/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Instructions for ```example_conv3d_fwd_xdl``` - -## Run ```example_conv3d_fwd_xdl``` -```bash -#arg1: verification (0=no, 1=yes) -#arg2: initialization (0=no init, 1=integer value, 2=decimal value) -#arg3: run kernel # of times (>1) -#arg4 to 24: N, K, C, Z, Y, X, Di, Hi, Wi, Sz, Sy, Sx, Dz, Dy, Dx, leftPz, LeftPy, LeftPx, RightPz, RightPy, RightPx -./bin/example_conv3d_fwd_xdl 0 1 5 -``` - -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) -``` -wei: dim 5, lengths {256, 3, 3, 3, 192}, strides {5184, 1728, 576, 192, 1} -out: dim 5, lengths {4, 36, 36, 36, 256}, strides {11943936, 331776, 9216, 256, 1} -num_batches_of_GEMM = 1 -a_grid_desc_k0_m_k1{648, 186624, 8} -b_grid_desc_k0_n_k1{648, 256, 8} -c_grid_desc_m_n{ 186624, 256} -launch_and_time_kernel: grid_dim {1458, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 5 times... 
-Perf: 4.58795 ms, 107.965 TFlops, 141.23 GB/s -``` diff --git a/example/08_conv3d_fwd/conv3d_fwd_xdl.cpp b/example/08_conv3d_fwd/conv3d_fwd_xdl.cpp deleted file mode 100644 index 5f89ee3c19..0000000000 --- a/example/08_conv3d_fwd/conv3d_fwd_xdl.cpp +++ /dev/null @@ -1,281 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "config.hpp" -#include "print.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "host_gemm.hpp" -#include "device_tensor.hpp" -#include "device_base.hpp" -#include "device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp" -#include "device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp" -#include "convolution_utility.hpp" - -// convolution data type -using InDataType = ck::half_t; -using WeiDataType = ck::half_t; -using OutDataType = ck::half_t; -using AccDataType = float; - -using InElementOp = ck::tensor_operation::element_wise::PassThrough; -using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; -using OutElementOp = ck::tensor_operation::element_wise::PassThrough; - -using F16 = ck::half_t; -using F32 = float; - -template -using S = ck::Sequence; - -using InLayout = ck::tensor_layout::convolution::NDHWC; -using WeiLayout = ck::tensor_layout::convolution::KZYXC; -using OutLayout = ck::tensor_layout::convolution::NDHWK; - -static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - -using DeviceConv3dFwdInstance = ck::tensor_operation::device:: - DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K< - InDataType, // InData - WeiDataType, // WeiData - OutDataType, // OutData - AccDataType, // AccData - InElementOp, // InElementwise Operation - WeiElementOp, // WeiElementwise Operation - OutElementOp, // OutElementwise Operation - ConvFwdDefault, // ConvForwardSpecialization - 256, // BlockSize - 128, // MPerBlock - 256, // NPerBlock - 4, // K0PerBlock - 8, // K1. 
K0PerBlock * K1 = KPerBlock - 32, // MPerXDL - 32, // NPerXDL. Each XDL computes a matrix of size (MPerXDL, NPerBlock) - 2, // MXdlPerWave - 4, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 8, // ABlockTransferSrcScalarPerVector - 8, // ABlockTransferDstScalarPerVector_K1 - true, // ABlockLdsAddExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 8, // BBlockTransferSrcScalarPerVector - 8, // BBlockTransferDstScalarPerVector_K1 - true, // BBlockLdsAddExtraN - 7, // CThreadTransferSrcDstVectorDim - 1>; // CThreadTransferDstScalarPerVector - -int main(int argc, char* argv[]) -{ - bool do_verification = false; - int init_method = 0; - int nrepeat = 5; - - // convolution shape - ck::index_t N = 4; - ck::index_t K = 256; - ck::index_t C = 192; - std::vector in_spatial_lengths = {71, 71, 71}; - std::vector filter_spatial_lengths = {3, 3, 3}; - std::vector conv_filter_strides = {2, 2, 2}; - std::vector conv_filter_dilations = {1, 1, 1}; - std::vector in_left_pads = {1, 1, 1}; - std::vector in_right_pads = {1, 1, 1}; - - if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - } - else if(argc == 25) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - - N = std::stoi(argv[4]); - K = std::stoi(argv[5]); - C = std::stoi(argv[6]); - filter_spatial_lengths[0] = std::stoi(argv[7]); - filter_spatial_lengths[1] = std::stoi(argv[8]); - filter_spatial_lengths[2] = std::stoi(argv[9]); - in_spatial_lengths[0] = std::stoi(argv[10]); - in_spatial_lengths[1] = std::stoi(argv[11]); - in_spatial_lengths[2] = std::stoi(argv[12]); - 
conv_filter_strides[0] = std::stoi(argv[13]); - conv_filter_strides[1] = std::stoi(argv[14]); - conv_filter_strides[2] = std::stoi(argv[15]); - conv_filter_dilations[0] = std::stoi(argv[16]); - conv_filter_dilations[1] = std::stoi(argv[17]); - conv_filter_dilations[2] = std::stoi(argv[18]); - in_left_pads[0] = std::stoi(argv[19]); - in_left_pads[1] = std::stoi(argv[20]); - in_left_pads[2] = std::stoi(argv[21]); - in_right_pads[0] = std::stoi(argv[22]); - in_right_pads[1] = std::stoi(argv[23]); - in_right_pads[2] = std::stoi(argv[24]); - } - else - { - printf("Usage: 3 or 24 input arguments\n"); - printf(" arg1: verification (0=no, 1=yes)\n"); - printf(" arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf(" arg3: run kernel # of times (>1)\n"); - printf(" arg4 to 24: N, K, C, Z, Y, X, Di, Hi, Wi, Sz, Sy, Sz, Dz, Dy, Dx, LeftPz, LeftPy, " - "LeftPz, RightPz, RightPy, RightPx\n"); - exit(0); - } - - auto conv3d = DeviceConv3dFwdInstance{}; - - const auto out_spatial_lengths = - ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths( - in_spatial_lengths, - filter_spatial_lengths, - conv_filter_strides, - conv_filter_dilations, - in_left_pads, - in_right_pads); - Tensor in( - {N, in_spatial_lengths[0], in_spatial_lengths[1], in_spatial_lengths[2], C}); - Tensor wei( - {K, filter_spatial_lengths[0], filter_spatial_lengths[1], filter_spatial_lengths[2], C}); - Tensor out( - {N, out_spatial_lengths[0], out_spatial_lengths[1], out_spatial_lengths[2], K}); - - std::cout << "in: " << in.mDesc << std::endl; - std::cout << "wei: " << wei.mDesc << std::endl; - std::cout << "out: " << out.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - break; - default: - in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } - - DeviceMem 
in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpace()); - - in_device_buf.ToDevice(in.mData.data()); - wei_device_buf.ToDevice(wei.mData.data()); - - // do Convolution - auto invoker = conv3d.MakeInvoker(); - auto argument = conv3d.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - N, - K, - C, - in_spatial_lengths, - filter_spatial_lengths, - out_spatial_lengths, - conv_filter_strides, - conv_filter_dilations, - in_left_pads, - in_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); - - if(!conv3d.IsSupportedArgument(argument)) - { - throw std::runtime_error( - "wrong! device_conv3d with the specified compilation parameters does " - "not support this GEMM problem"); - } - - float ave_time = invoker.Run(argument, nrepeat); - - const auto Di = in_spatial_lengths[0]; - const auto Hi = in_spatial_lengths[1]; - const auto Wi = in_spatial_lengths[2]; - const auto Do = out_spatial_lengths[0]; - const auto Ho = out_spatial_lengths[1]; - const auto Wo = out_spatial_lengths[2]; - const auto Z = filter_spatial_lengths[0]; - const auto Y = filter_spatial_lengths[1]; - const auto X = filter_spatial_lengths[2]; - - std::size_t flop = std::size_t(2) * N * K * Do * Ho * Wo * C * Z * Y * X; - std::size_t num_btype = sizeof(InDataType) * N * Di * Hi * Wi * C + - sizeof(WeiDataType) * K * Z * Y * X * C + - sizeof(OutDataType) * N * Do * Ho * Wo * K; - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; - - out_device_buf.FromDevice(out.mData.data()); - - if(do_verification) - { - DeviceMem out_ref_device_buf(sizeof(OutDataType) * N * 
Do * Ho * Wo * K); - - using DeviceConv3dFwdNaive = ck::tensor_operation::device:: - DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K< - InDataType, - WeiDataType, - OutDataType, - AccDataType, - InElementOp, - WeiElementOp, - OutElementOp>; - auto conv3d_naive = DeviceConv3dFwdNaive{}; - auto invoker_naive = conv3d_naive.MakeInvoker(); - auto argument_naive = conv3d_naive.MakeArgument( - static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_ref_device_buf.GetDeviceBuffer()), - N, - K, - C, - in_spatial_lengths, - filter_spatial_lengths, - out_spatial_lengths, - conv_filter_strides, - conv_filter_dilations, - in_left_pads, - in_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); - - if(!conv3d_naive.IsSupportedArgument(argument_naive)) - { - throw std::runtime_error( - "wrong! device_conv3d_naive does NOT support the specified compilation parameters"); - } - invoker_naive.Run(argument_naive); - - Tensor out_ref( - {N, out_spatial_lengths[0], out_spatial_lengths[1], out_spatial_lengths[2], K}); - - out_ref_device_buf.FromDevice(out_ref.mData.data()); - - check_error(out_ref, out); - } - - return 0; -} diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt index 61299b521e..fd6d11d9ff 100644 --- a/example/09_convnd_fwd/CMakeLists.txt +++ b/example/09_convnd_fwd/CMakeLists.txt @@ -1 +1,3 @@ add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp) +add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp) +add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp) diff --git a/example/09_convnd_fwd/convnd_fwd_xdl.cpp b/example/09_convnd_fwd/convnd_fwd_xdl.cpp index 3caaf6720c..e8895b8639 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl.cpp @@ -2,8 +2,10 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" -#include 
"conv_utils.hpp" +#include "conv_fwd_util.hpp" #include "device.hpp" #include "device_tensor.hpp" #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" @@ -13,6 +15,8 @@ #include "reference_conv_fwd.hpp" #include "tensor_layout.hpp" +namespace { + using InDataType = float; using WeiDataType = float; using OutDataType = float; @@ -80,7 +84,7 @@ using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd< OutElementOp, NumDimSpatial>; -DeviceConvFwdBasePtr GetConvInstance(int num_dim_spatial) +DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial) { switch(num_dim_spatial) { @@ -99,7 +103,7 @@ DeviceConvFwdBasePtr GetConvInstance(int num_dim_spatial) } } -void PrintUseMsg() +void print_use_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" @@ -116,18 +120,18 @@ void PrintUseMsg() << std::endl; } -ck::conv_util::ConvParams ParseConvParams(int num_dim_spatial, int argc, char* argv[]) +ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[]) { // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) int conv_args = 3 + num_dim_spatial * 6; int cmdline_nargs = conv_args + 5; if(cmdline_nargs != argc) { - PrintUseMsg(); + print_use_msg(); exit(0); } - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; int arg_idx = 5; params.num_dim_spatial = num_dim_spatial; @@ -169,80 +173,18 @@ ck::conv_util::ConvParams ParseConvParams(int num_dim_spatial, int argc, char* a return params; } -HostTensorDescriptor GetOutputHostTensorDescriptor(const std::vector& dims, - int num_dim_spatial = 2) -{ - namespace tl = ck::tensor_layout::convolution; - - switch(num_dim_spatial) - { - case 3: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWK{}); - } - case 2: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWK{}); - } - case 1: { - return 
ck::conv_util::GetHostTensorDescriptor(dims, tl::NWK{}); - } - default: { - throw std::runtime_error("Unsupported number of spatial dimensions provided!"); - } - } -} - -HostTensorDescriptor GetFiltersHostTensorDescriptor(const std::vector& dims, - int num_dim_spatial = 2) -{ - namespace tl = ck::tensor_layout::convolution; - - switch(num_dim_spatial) - { - case 3: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::KZYXC{}); - } - case 2: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::KYXC{}); - } - case 1: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::KXC{}); - } - default: { - throw std::runtime_error("Unsupported number of spatial dimensions provided!"); - } - } -} - -HostTensorDescriptor GetInputHostTensorDescriptor(const std::vector& dims, - int num_dim_spatial = 2) -{ - namespace tl = ck::tensor_layout::convolution; - - switch(num_dim_spatial) - { - case 3: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{}); - } - case 2: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{}); - } - case 1: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{}); - } - default: { - throw std::runtime_error("Unsupported number of spatial dimensions provided!"); - } - } -} +} // anonymous namespace int main(int argc, char* argv[]) { + using namespace ck::utils::conv; + bool do_verification = 0; int init_method = 0; int nrepeat = 5; int num_dim_spatial = 2; - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; if(argc >= 5) { @@ -254,7 +196,7 @@ int main(int argc, char* argv[]) if(argc >= 6) { - params = ParseConvParams(num_dim_spatial, argc, argv); + params = parse_conv_params(num_dim_spatial, argc, argv); } std::vector input_dims{static_cast(params.N), @@ -276,10 +218,12 @@ int main(int argc, char* argv[]) std::begin(output_spatial_lengths), std::end(output_spatial_lengths)); - Tensor input(GetInputHostTensorDescriptor(input_dims, num_dim_spatial)); - Tensor 
weights(GetFiltersHostTensorDescriptor(filter_dims, num_dim_spatial)); - Tensor host_output(GetOutputHostTensorDescriptor(output_dims, num_dim_spatial)); - Tensor device_output(GetOutputHostTensorDescriptor(output_dims, num_dim_spatial)); + Tensor input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); + Tensor weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial)); + Tensor host_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + Tensor device_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); std::cout << "input: " << input.mDesc << std::endl; std::cout << "weights: " << weights.mDesc << std::endl; @@ -305,7 +249,7 @@ int main(int argc, char* argv[]) wei_device_buf.ToDevice(weights.mData.data()); // do GEMM - auto conv = GetConvInstance(num_dim_spatial); + auto conv = get_conv_instance(num_dim_spatial); auto invoker = conv->MakeInvokerPointer(); auto argument = conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), @@ -334,15 +278,15 @@ int main(int argc, char* argv[]) float ave_time = invoker->Run(argument.get(), nrepeat); - std::size_t flop = ck::conv_util::GetFlops( + std::size_t flop = get_flops( params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); std::size_t num_btype = - ck::conv_util::GetBtype(params.N, - params.C, - params.K, - params.input_spatial_lengths, - params.filter_spatial_lengths, - output_spatial_lengths); + get_btype(params.N, + params.C, + params.K, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths); float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time; @@ -367,7 +311,8 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); out_device_buf.FromDevice(device_output.mData.data()); - check_error(host_output, device_output); + ck::utils::check_err( + host_output.mData, device_output.mData, "Error: incorrect results!", 
1e-5f, 1e-4f); }; switch(num_dim_spatial) diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp new file mode 100644 index 0000000000..eaa5683978 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp @@ -0,0 +1,341 @@ +#include +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "conv_fwd_util.hpp" +#include "device.hpp" +#include "device_tensor.hpp" +#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" +#include "element_wise_operation.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "reference_conv_fwd.hpp" +#include "tensor_layout.hpp" + +namespace { + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; + +template +using S = ck::Sequence; + +using InLayout = ck::tensor_layout::convolution::NHWC; +using WeiLayout = ck::tensor_layout::convolution::KYXC; +using OutLayout = ck::tensor_layout::convolution::NHWK; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +using DeviceConvFwdBasePtr = + ck::tensor_operation::device::DeviceConvFwdPtr; + +template +using DeviceConvNDFwdInstance = ck::tensor_operation::device:: + DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< + // clang-format off + InDataType, // + WeiDataType, // + OutDataType, // + AccDataType, // + InElementOp, // Input Elementwise Operation + WeiElementOp, // Weights Elementwise Operation + OutElementOp, // Output Elementwise Operation + ConvFwdDefault, // ConvForwardSpecialization + NumDimSpatial, // NumDimSpatial + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 4, // K0PerBlock + 8, // K1 + 
32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockLdsAddExtraN + 7, // CThreadTransferSrcDstVectorDim + 1>; // CThreadTransferDstScalarPerVector + +template +using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; + +DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 3: { + return std::make_unique>(); + } + case 2: { + return std::make_unique>(); + } + case 1: { + return std::make_unique>(); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +void print_use_msg() +{ + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4: N spatial dimensions (default 2)\n" + << "Following arguments (depending on number of spatial dims):\n" + << " N, K, C, \n" + << " , (ie Y, X for 2D)\n" + << " , (ie Hi, Wi for 2D)\n" + << " , (ie Sy, Sx for 2D)\n" + << " , (ie Dy, Dx for 2D)\n" + << " , (ie LeftPy, LeftPx for 2D)\n" + << " , (ie RightPy, RightPx for 2D)\n" + << std::endl; +} + +ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[]) +{ + // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) + int conv_args = 3 + num_dim_spatial 
* 6; + int cmdline_nargs = conv_args + 5; + if(cmdline_nargs != argc) + { + print_use_msg(); + exit(0); + } + + ck::utils::conv::ConvParams params; + int arg_idx = 5; + + params.num_dim_spatial = num_dim_spatial; + params.N = std::stoi(argv[arg_idx++]); + params.K = std::stoi(argv[arg_idx++]); + params.C = std::stoi(argv[arg_idx++]); + + params.filter_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.input_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_strides.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_dilations.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + } + params.input_left_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + } + params.input_right_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + } + + return params; +} + +} // anonymous namespace + +int main(int argc, char* argv[]) +{ + using namespace ck::utils::conv; + + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + int num_dim_spatial = 2; + + ck::utils::conv::ConvParams params; + + if(argc >= 5) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + num_dim_spatial = std::stoi(argv[4]); + } + + if(argc >= 6) + { + params = parse_conv_params(num_dim_spatial, argc, argv); + } + + std::vector input_dims{static_cast(params.N), + static_cast(params.C)}; + input_dims.insert(std::end(input_dims), + 
std::begin(params.input_spatial_lengths), + std::end(params.input_spatial_lengths)); + + std::vector filter_dims{static_cast(params.K), + static_cast(params.C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(params.filter_spatial_lengths), + std::end(params.filter_spatial_lengths)); + + const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); + std::vector output_dims{static_cast(params.N), + static_cast(params.K)}; + output_dims.insert(std::end(output_dims), + std::begin(output_spatial_lengths), + std::end(output_spatial_lengths)); + + Tensor input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); + Tensor weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial)); + Tensor host_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + Tensor device_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weights: " << weights.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weights.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weights.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); + DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace()); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weights.mData.data()); + + // do GEMM + auto conv = get_conv_instance(num_dim_spatial); + auto invoker = conv->MakeInvokerPointer(); + auto argument = + conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + 
static_cast(out_device_buf.GetDeviceBuffer()), + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv->IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float ave_time = invoker->Run(argument.get(), nrepeat); + + std::size_t flop = get_flops( + params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + std::size_t num_btype = get_btype( + params.N, + params.C, + params.K, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + if(do_verification) + { + auto verify_f = [&input, &weights, &host_output, ¶ms, &out_device_buf, &device_output]( + const auto& ref_conv) { + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weights, + host_output, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + ref_invoker.Run(ref_argument); + out_device_buf.FromDevice(device_output.mData.data()); + ck::utils::check_err( + host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + }; + + switch(num_dim_spatial) + { + case 3: { + auto ref_conv = ReferenceConvNDFwdInstance<3>(); + verify_f(ref_conv); + break; + } + case 2: { + auto ref_conv = ReferenceConvNDFwdInstance<2>(); + verify_f(ref_conv); + break; + } + case 1: { + auto 
ref_conv = ReferenceConvNDFwdInstance<1>(); + verify_f(ref_conv); + break; + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } + } +} diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp new file mode 100644 index 0000000000..34b4645770 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp @@ -0,0 +1,343 @@ +#include +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "conv_fwd_util.hpp" +#include "device.hpp" +#include "device_tensor.hpp" +#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" +#include "element_wise_operation.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "reference_conv_fwd.hpp" +#include "tensor_layout.hpp" + +namespace { + +using InDataType = int8_t; +using WeiDataType = int8_t; +using OutDataType = int8_t; +using AccDataType = int32_t; + +template +using S = ck::Sequence; + +using InLayout = ck::tensor_layout::convolution::NHWC; +using WeiLayout = ck::tensor_layout::convolution::KYXC; +using OutLayout = ck::tensor_layout::convolution::NHWK; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +using DeviceConvFwdBasePtr = + ck::tensor_operation::device::DeviceConvFwdPtr; + +template +using DeviceConvNDFwdInstance = ck::tensor_operation::device:: + DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< + // clang-format off + InDataType, // + WeiDataType, // + OutDataType, // + AccDataType, // + InElementOp, // Input Elementwise Operation + WeiElementOp, // Weights Elementwise Operation + 
OutElementOp, // Output Elementwise Operation + ConvFwdDefault, // ConvForwardSpecialization + NumDimSpatial, // NumDimSpatial + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 4, // K0PerBlock + 16, // K1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 16, // ABlockTransferSrcScalarPerVector + 16, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 16, // BBlockTransferSrcScalarPerVector + 16, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockLdsAddExtraN + 7, // CThreadTransferSrcDstVectorDim + 1>; // CThreadTransferDstScalarPerVector + +template +using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; + +DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 3: { + return std::make_unique>(); + } + case 2: { + return std::make_unique>(); + } + case 1: { + return std::make_unique>(); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +void print_use_msg() +{ + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4: N spatial dimensions (default 2)\n" + << "Following arguments (depending on number of spatial dims):\n" + << " N, K, C, \n" + << " , (ie Y, X for 2D)\n" + << " , (ie Hi, Wi for 2D)\n" + << " , (ie Sy, Sx for 2D)\n" + << " , (ie Dy, Dx for 2D)\n" + << " , (ie LeftPy, LeftPx for 2D)\n" + << " , (ie RightPy, RightPx for 2D)\n" + << std::endl; +} + 
+ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[]) +{ + // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) + int conv_args = 3 + num_dim_spatial * 6; + int cmdline_nargs = conv_args + 5; + if(cmdline_nargs != argc) + { + print_use_msg(); + exit(0); + } + + ck::utils::conv::ConvParams params; + int arg_idx = 5; + + params.num_dim_spatial = num_dim_spatial; + params.N = std::stoi(argv[arg_idx++]); + params.K = std::stoi(argv[arg_idx++]); + params.C = std::stoi(argv[arg_idx++]); + + params.filter_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.input_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_strides.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_dilations.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + } + params.input_left_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + } + params.input_right_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + } + + return params; +} + +} // anonymous namespace + +int main(int argc, char* argv[]) +{ + using namespace ck::utils::conv; + + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + int num_dim_spatial = 2; + + ck::utils::conv::ConvParams params; + + if(argc >= 5) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + num_dim_spatial = std::stoi(argv[4]); 
+ } + + if(argc >= 6) + { + params = parse_conv_params(num_dim_spatial, argc, argv); + } + + std::vector input_dims{static_cast(params.N), + static_cast(params.C)}; + input_dims.insert(std::end(input_dims), + std::begin(params.input_spatial_lengths), + std::end(params.input_spatial_lengths)); + + std::vector filter_dims{static_cast(params.K), + static_cast(params.C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(params.filter_spatial_lengths), + std::end(params.filter_spatial_lengths)); + + const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); + std::vector output_dims{static_cast(params.N), + static_cast(params.K)}; + output_dims.insert(std::end(output_dims), + std::begin(output_spatial_lengths), + std::end(output_spatial_lengths)); + + Tensor input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); + Tensor weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial)); + Tensor host_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + Tensor device_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weights: " << weights.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weights.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weights.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); + DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace()); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weights.mData.data()); + + // do GEMM + auto conv = 
get_conv_instance(num_dim_spatial); + auto invoker = conv->MakeInvokerPointer(); + auto argument = + conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv->IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float ave_time = invoker->Run(argument.get(), nrepeat); + + std::size_t flop = get_flops( + params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + std::size_t num_btype = get_btype( + params.N, + params.C, + params.K, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + if(do_verification) + { + auto verify_f = [&input, &weights, &host_output, ¶ms, &out_device_buf, &device_output]( + const auto& ref_conv) { + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weights, + host_output, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + ref_invoker.Run(ref_argument); + out_device_buf.FromDevice(device_output.mData.data()); + ck::utils::check_err( + host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + }; + + switch(num_dim_spatial) + 
{ + case 3: { + auto ref_conv = ReferenceConvNDFwdInstance<3>(); + verify_f(ref_conv); + break; + } + case 2: { + auto ref_conv = ReferenceConvNDFwdInstance<2>(); + verify_f(ref_conv); + break; + } + case 1: { + auto ref_conv = ReferenceConvNDFwdInstance<1>(); + verify_f(ref_conv); + break; + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } + } +} diff --git a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp index 8307157cec..f3f9b497f5 100644 --- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp +++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -247,6 +249,6 @@ int main(int argc, char* argv[]) in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data()); - check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result); + ck::utils::check_err(in_n_c_hi_wi_device_result.mData, in_n_c_hi_wi_host_result.mData); } } diff --git a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp index ff41b8d021..7b74b40d32 100644 --- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp +++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -284,6 +286,6 @@ int main(int argc, char* argv[]) LogRangeAsType(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",") << std::endl; } - check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result); + ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData); } } diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index 41962ac43d..b8fc980e10 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ 
b/example/12_reduce/reduce_blockwise.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -371,12 +373,13 @@ int main(int argc, char* argv[]) if(args.do_verification) { out_dev.FromDevice(out.mData.data()); - check_error(out_ref, out); + ck::utils::check_err(out.mData, out_ref.mData); if(NeedIndices) { out_indices_dev.FromDevice(out_indices.mData.data()); - check_indices(out_indices_ref, out_indices); + ck::utils::check_err(out_indices.mData, out_indices_ref.mData); + ; }; }; } diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp index 6c16ed57d0..9def6c24fe 100644 --- a/example/13_pool2d_fwd/pool2d_fwd.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd.cpp @@ -3,6 +3,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -300,13 +302,14 @@ int main(int argc, char* argv[]) out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data()); - check_error(out_n_c_ho_wo_host, out_n_c_ho_wo_device); + ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData); if constexpr(NeedIndices) { out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data()); - // check_indices(out_indices_n_c_ho_wo_host, out_indices_n_c_ho_wo_device); + // ck::utils::check_err(out_indices_n_c_ho_wo_device.mData, + // out_indices_n_c_ho_wo_host.mData);; }; } } diff --git a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp index 5ad2e815e5..ca3b58bd00 100644 --- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp +++ b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -225,7 
+227,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); } return 0; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index bfad477163..4e9bdbb2f5 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -225,8 +227,7 @@ int main(int argc, char* argv[]) c_element_op); ref_invoker.Run(ref_argument); - - check_error(c_host_tensors[i], c_device_tensors[i]); + ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData); } } diff --git a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp index 9bc9c88995..962627ce90 100644 --- a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp +++ b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp @@ -6,7 +6,7 @@ #include #include "config.hpp" -#include "conv_utils.hpp" +#include "conv_fwd_util.hpp" #include "print.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -99,10 +99,10 @@ void print_use_msg() << " , (ie RightPy, RightPx for 2D)\n" << std::endl; } -ck::conv_util::ConvParams parse_conv_params(int num_dim_spatial, char* argv[]) +ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[]) { // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; int arg_idx = 5; params.num_dim_spatial = num_dim_spatial; @@ -144,72 +144,6 @@ ck::conv_util::ConvParams parse_conv_params(int num_dim_spatial, char* argv[]) return params; } -HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector& dims, - int 
num_dim_spatial = 2) -{ - namespace tl = ck::tensor_layout::convolution; - - switch(num_dim_spatial) - { - case 3: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{}); - } - case 2: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{}); - } - case 1: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{}); - } - default: { - throw std::runtime_error("Unsupported number of spatial dimensions provided!"); - } - } -} -HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector& dims, - int num_dim_spatial = 2) -{ - namespace tl = ck::tensor_layout::convolution; - - switch(num_dim_spatial) - { - case 3: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::KZYXC{}); - } - case 2: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::KYXC{}); - } - case 1: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::KXC{}); - } - default: { - throw std::runtime_error("Unsupported number of spatial dimensions provided!"); - } - } -} - -HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector& dims, - int num_dim_spatial = 2) -{ - namespace tl = ck::tensor_layout::convolution; - - switch(num_dim_spatial) - { - case 3: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWK{}); - } - case 2: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWK{}); - } - case 1: { - return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWK{}); - } - - default: { - throw std::runtime_error("Unsupported number of spatial dimensions provided!"); - } - } -} - DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial) { switch(num_dim_spatial) @@ -236,7 +170,7 @@ int main(int argc, char* argv[]) int nrepeat = 5; int num_dim_spatial = 2; - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.C = 128; if(argc == 4) @@ -288,13 +222,13 @@ int main(int argc, char* argv[]) std::end(output_spatial_lengths)); Tensor in_n_c_hi_wi_host_result( - 
get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); + ck::utils::conv::get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); Tensor in_n_c_hi_wi_device_result( - get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); + ck::utils::conv::get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); Tensor wei_k_c_y_x( - get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial)); + ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial)); Tensor out_n_k_ho_wo( - get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + ck::utils::conv::get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi_host_result.mDesc << std::endl; std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; @@ -352,15 +286,15 @@ int main(int argc, char* argv[]) float ave_time = invoker->Run(argument.get(), nrepeat); - std::size_t flop = ck::conv_util::GetFlops( + std::size_t flop = ck::utils::conv::get_flops( params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); - std::size_t num_btype = - ck::conv_util::GetBtype(params.N, - params.C, - params.K, - params.input_spatial_lengths, - params.filter_spatial_lengths, - output_spatial_lengths); + std::size_t num_btype = ck::utils::conv::get_btype( + params.N, + params.C, + params.K, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths); float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time; diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 967ed8a2f3..5f04125305 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -13,6 +13,7 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu + 
${PROJECT_SOURCE_DIR}/library/include/ck/library/utility ${PROJECT_SOURCE_DIR}/external/include/half ) @@ -29,10 +30,8 @@ add_subdirectory(01_gemm) add_subdirectory(02_gemm_alpha_beta) add_subdirectory(03_gemm_bias_relu) add_subdirectory(04_gemm_bias_relu_add) -add_subdirectory(05_conv2d_fwd) add_subdirectory(06_conv2d_fwd_bias_relu) add_subdirectory(07_conv2d_fwd_bias_relu_add) -add_subdirectory(08_conv3d_fwd) add_subdirectory(09_convnd_fwd) add_subdirectory(10_conv2d_bwd_data) add_subdirectory(11_conv2d_bwd_weight) diff --git a/include/ck/tensor_operation/gpu/device/conv_utils.hpp b/include/ck/tensor_operation/gpu/device/conv_utils.hpp deleted file mode 100644 index 44a6ee1c9b..0000000000 --- a/include/ck/tensor_operation/gpu/device/conv_utils.hpp +++ /dev/null @@ -1,242 +0,0 @@ -#ifndef CONV_UTILS_HPP -#define CONV_UTILS_HPP - -#include -#include -#include -#include -#include -#include -#include - -#include "config.hpp" -#include "host_tensor.hpp" -#include "tensor_layout.hpp" - -namespace ck { -namespace conv_util { - -/** - * @brief Calculate number of FLOPs for Convolution - * - * @param[in] N Batch size. - * @param[in] C Number of input channels. - * @param[in] K Number of output channels. - * @param[in] filter_spatial_lengths Filter spatial dimensions lengths. - * @param[in] output_spatial_lengths Convolution output spatial dimensions - * lengths. - * - * @return The number of flops. - */ -std::size_t GetFlops(ck::index_t N, - ck::index_t C, - ck::index_t K, - const std::vector& filter_spatial_lengths, - const std::vector& output_spatial_lengths) -{ - // 2 * N * K * * C * - return static_cast(2) * N * K * - std::accumulate(std::begin(output_spatial_lengths), - std::end(output_spatial_lengths), - static_cast(1), - std::multiplies()) * - C * - std::accumulate(std::begin(filter_spatial_lengths), - std::end(filter_spatial_lengths), - static_cast(1), - std::multiplies()); -} - -/** - * @brief Calculate number of bytes read/write by convolution algorithm. 
- * - * @param[in] N Batch size. - * @param[in] C Number of input channels. - * @param[in] K Number of output channels. - * @param[in] input_spatial_lengths Input spatial dimensions lengths. - * @param[in] filter_spatial_lengths Filter spatial dimensions lengths. - * @param[in] output_spatial_lengths Output spatial dimensions lengths - * - * @tparam InDataType Input tensor data type. - * @tparam WeiDataType Weights tensor data type. - * @tparam OutDataType Output tensor data type. - * - * @return The number of used bytes. - */ -template -std::size_t GetBtype(ck::index_t N, - ck::index_t C, - ck::index_t K, - const std::vector& input_spatial_lengths, - const std::vector& filter_spatial_lengths, - const std::vector& output_spatial_lengths) -{ - // sizeof(InDataType) * (N * C * ) + - // sizeof(WeiDataType) * (K * C * ) + - // sizeof(OutDataType) * (N * K * ); - return sizeof(InDataType) * (N * C * - std::accumulate(std::begin(input_spatial_lengths), - std::end(input_spatial_lengths), - static_cast(1), - std::multiplies())) + - sizeof(WeiDataType) * (K * C * - std::accumulate(std::begin(filter_spatial_lengths), - std::end(filter_spatial_lengths), - static_cast(1), - std::multiplies())) + - sizeof(OutDataType) * (N * K * - std::accumulate(std::begin(output_spatial_lengths), - std::end(output_spatial_lengths), - static_cast(1), - std::multiplies())); -} - -struct ConvParams -{ - ConvParams() - : num_dim_spatial(2), - N(128), - K(256), - C(192), - filter_spatial_lengths(2, 3), - input_spatial_lengths(2, 71), - conv_filter_strides(2, 2), - conv_filter_dilations(2, 1), - input_left_pads(2, 1), - input_right_pads(2, 1) - { - } - ConvParams(ck::index_t n_dim_spatial, - ck::index_t n, - ck::index_t k, - ck::index_t c, - std::vector filter_lengths, - std::vector input_lengths, - std::vector conv_strides, - std::vector conv_dilations, - std::vector left_pads, - std::vector right_pads) - : num_dim_spatial(n_dim_spatial), - N(n), - K(k), - C(c), - 
filter_spatial_lengths(filter_lengths), - input_spatial_lengths(input_lengths), - conv_filter_strides(conv_strides), - conv_filter_dilations(conv_dilations), - input_left_pads(left_pads), - input_right_pads(right_pads) - { - } - - ck::index_t num_dim_spatial; - ck::index_t N; - ck::index_t K; - ck::index_t C; - - std::vector filter_spatial_lengths; - std::vector input_spatial_lengths; - - std::vector conv_filter_strides; - std::vector conv_filter_dilations; - - std::vector input_left_pads; - std::vector input_right_pads; - - std::vector GetOutputSpatialLengths() const - { - std::vector out_spatial_len(num_dim_spatial, 0); - for(ck::index_t i = 0; i < num_dim_spatial; ++i) - { - // XEff = (X - 1) * conv_dilation_w + 1; - // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; - const ck::index_t idx_eff = - (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1; - out_spatial_len[i] = - (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) / - conv_filter_strides[i] + - 1; - } - return out_spatial_len; - } -}; - -/** - * @brief Gets the host tensor descriptor. - * - * @param[in] dims The tensor dimensions lengths. Always in NCHW format. - * @param[in] layout The tensor data layout. - * - * @tparam TensorLayout Layout type. - * - * @return The host tensor descriptor object. 
- */ -template -HostTensorDescriptor GetHostTensorDescriptor(const std::vector& dims, - const TensorLayout& layout) -{ - std::size_t C = dims[1]; - // 1D - if constexpr(std::is_same::value || - std::is_same::value || - std::is_same::value) - { - - return HostTensorDescriptor(dims, std::vector({C * dims[2], dims[2], 1})); - } - else if constexpr(std::is_same::value || - std::is_same::value || - std::is_same::value) - { - return HostTensorDescriptor(dims, std::vector({C * dims[2], 1, C})); - } - // 2D - else if constexpr(std::is_same::value || - std::is_same::value || - std::is_same::value) - { - - return HostTensorDescriptor( - dims, std::vector{C * dims[2] * dims[3], dims[2] * dims[3], dims[3], 1}); - } - else if constexpr(std::is_same::value || - std::is_same::value || - std::is_same::value) - { - return HostTensorDescriptor( - dims, std::vector{C * dims[2] * dims[3], 1, dims[3] * C, C}); - } - // 3D - else if constexpr(std::is_same::value || - std::is_same::value || - std::is_same::value) - { - - return HostTensorDescriptor(dims, - std::vector{C * dims[2] * dims[3] * dims[4], - dims[2] * dims[3] * dims[4], - dims[3] * dims[4], - dims[4], - 1}); - } - else if constexpr(std::is_same::value || - std::is_same::value || - std::is_same::value) - { - return HostTensorDescriptor( - dims, - std::vector{ - C * dims[2] * dims[3] * dims[4], 1, dims[3] * dims[4] * C, dims[4] * C, C}); - } - - std::stringstream err_msg; - err_msg << "Unsupported data layout provided: " << layout << "!"; - throw std::runtime_error(err_msg.str()); -} - -} // namespace conv_util -} // namespace ck - -#endif diff --git a/include/ck/tensor_operation/gpu/device/convolution_utility.hpp b/include/ck/tensor_operation/gpu/device/convolution_utility.hpp deleted file mode 100644 index a6b891dab2..0000000000 --- a/include/ck/tensor_operation/gpu/device/convolution_utility.hpp +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef CONVOLUTION_UTILITY_HPP -#define CONVOLUTION_UTILITY_HPP - -#include - -namespace ck { 
-namespace tensor_operation { - -struct ConvolutionUtility -{ - static std::vector - ComputeOutputSpatialLengths(std::vector input_spatial_lengths, - std::vector filter_spatial_lengths, - std::vector conv_strides, - std::vector conv_dilations, - std::vector in_left_pads, - std::vector in_right_pads) - { - if(input_spatial_lengths.size() == 2) - { - assert(filter_spatial_lengths.size() == 2); - assert(conv_strides.size() == 2); - assert(conv_dilations.size() == 2); - assert(in_left_pads.size() == 2); - assert(in_right_pads.size() == 2); - - const index_t YEff = (filter_spatial_lengths[0] - 1) * conv_dilations[0] + 1; - const index_t XEff = (filter_spatial_lengths[1] - 1) * conv_dilations[1] + 1; - - const index_t Hi = input_spatial_lengths[0]; - const index_t Wi = input_spatial_lengths[1]; - - const index_t Ho = - (Hi + in_left_pads[0] + in_right_pads[0] - YEff) / conv_strides[0] + 1; - const index_t Wo = - (Wi + in_left_pads[1] + in_right_pads[1] - XEff) / conv_strides[1] + 1; - - return {Ho, Wo}; - } - else if(input_spatial_lengths.size() == 3) - { - assert(filter_spatial_lengths.size() == 3); - assert(conv_strides.size() == 3); - assert(conv_dilations.size() == 3); - assert(in_left_pads.size() == 3); - assert(in_right_pads.size() == 3); - - const index_t ZEff = (filter_spatial_lengths[0] - 1) * conv_dilations[0] + 1; - const index_t YEff = (filter_spatial_lengths[1] - 1) * conv_dilations[1] + 1; - const index_t XEff = (filter_spatial_lengths[2] - 1) * conv_dilations[2] + 1; - - const index_t Di = input_spatial_lengths[0]; - const index_t Hi = input_spatial_lengths[1]; - const index_t Wi = input_spatial_lengths[2]; - - const index_t Do = - (Di + in_left_pads[0] + in_right_pads[0] - ZEff) / conv_strides[0] + 1; - const index_t Ho = - (Hi + in_left_pads[1] + in_right_pads[1] - YEff) / conv_strides[1] + 1; - const index_t Wo = - (Wi + in_left_pads[2] + in_right_pads[2] - XEff) / conv_strides[2] + 1; - return {Do, Ho, Wo}; - } - else - { - return {}; - } - } -}; - -} 
// namespace tensor_operation -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp index 0371c4ab0d..c3ebe58865 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp @@ -4,7 +4,7 @@ #include #include #include -#include "convolution_utility.hpp" +#include "conv_fwd_util.hpp" #include "device.hpp" #include "device_conv_fwd.hpp" #include "common_header.hpp" @@ -53,36 +53,30 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W InElementwiseOperation in_element_op, WeiElementwiseOperation wei_element_op, OutElementwiseOperation out_element_op) - : N_{N}, - K_{K}, - C_{C}, - in_spatial_lengths_{input_spatial_lengths}, - filter_spatial_lengths_{filter_spatial_lengths}, + : params_{3, + N, + K, + C, + filter_spatial_lengths, + input_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads}, out_spatial_lengths_{output_spatial_lengths}, - conv_filter_strides_{conv_filter_strides}, - conv_filter_dilations_{conv_filter_dilations}, - in_left_pads_{input_left_pads}, - in_right_pads_{input_right_pads}, p_in_{p_in}, p_wei_{p_wei}, p_out_{p_out}, in_element_op_{in_element_op}, wei_element_op_{wei_element_op}, out_element_op_{out_element_op} + { } // private: - index_t N_; - index_t K_; - index_t C_; - std::vector in_spatial_lengths_; - std::vector filter_spatial_lengths_; + utils::conv::ConvParams params_; std::vector out_spatial_lengths_; - std::vector conv_filter_strides_; - std::vector conv_filter_dilations_; - std::vector in_left_pads_; - std::vector in_right_pads_; const InDataType* p_in_; const WeiDataType* p_wei_; @@ -157,13 +151,7 @@ struct 
DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W static bool IsSupportedArgument(const Argument& arg) { - std::vector out_spatial_lengths = - ConvolutionUtility::ComputeOutputSpatialLengths(arg.in_spatial_lengths_, - arg.filter_spatial_lengths_, - arg.conv_filter_strides_, - arg.conv_filter_dilations_, - arg.in_left_pads_, - arg.in_right_pads_); + std::vector out_spatial_lengths = arg.params_.GetOutputSpatialLengths(); bool out_lengths_are_consistent = out_spatial_lengths[0] == arg.out_spatial_lengths_[0] && out_spatial_lengths[1] == arg.out_spatial_lengths_[1] && diff --git a/library/include/ck/library/host_tensor/host_tensor.hpp b/library/include/ck/library/host_tensor/host_tensor.hpp index 17ecd4a9fb..0d4c9f73d4 100644 --- a/library/include/ck/library/host_tensor/host_tensor.hpp +++ b/library/include/ck/library/host_tensor/host_tensor.hpp @@ -300,9 +300,6 @@ HostTensorDescriptor::HostTensorDescriptor(const std::vector& lens, void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout); #if 1 -// FIXME: remove -float bf16_to_f32_(ck::bhalf_t src_val); - // FIXME: remove void bf16_to_f32_(const Tensor& src, Tensor& dst); #endif @@ -353,28 +350,4 @@ float check_error(const Tensor& ref, const Tensor& result) return linf_error; } -template -void check_indices(const Tensor& ref, const Tensor& result) -{ - bool has_error = false; - int error_count = 0; - - for(int i = 0; i < ref.mData.size(); ++i) - { - if(ref.mData[i] != result.mData[i]) - { - std::cerr << std::endl - << "Indices different at position " << i << " (ref: " << ref.mData[i] - << ", result: " << result.mData[i] << ")" << std::endl; - has_error = true; - error_count++; - if(error_count == 20) - break; - }; - } - - if(!has_error) - std::cout << std::endl << "Indices result is completely acccurate!" 
<< std::endl; -} - #endif diff --git a/test/include/test_util.hpp b/library/include/ck/library/utility/check_err.hpp similarity index 69% rename from test/include/test_util.hpp rename to library/include/ck/library/utility/check_err.hpp index 07fe67ba46..280ac83883 100644 --- a/test/include/test_util.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -1,9 +1,10 @@ -#ifndef TEST_UTIL_HPP -#define TEST_UTIL_HPP +#ifndef CHECK_ERR_HPP +#define CHECK_ERR_HPP #include #include #include +#include #include #include #include @@ -13,16 +14,17 @@ #include "data_type.hpp" -namespace test { +namespace ck { +namespace utils { template -typename std::enable_if::value && !std::is_same::value, +typename std::enable_if::value && !std::is_same::value, bool>::type check_err(const std::vector& out, const std::vector& ref, - const std::string& msg, - double rtol = 1e-5, - double atol = 1e-8) + const std::string& msg = "Error: Incorrect results!", + double rtol = 1e-5, + double atol = 1e-8) { if(out.size() != ref.size()) { @@ -60,13 +62,12 @@ check_err(const std::vector& out, } template -typename std::enable_if::value || std::is_same::value, - bool>::type +typename std::enable_if::value, bool>::type check_err(const std::vector& out, const std::vector& ref, - const std::string& msg, - double rtol = 1e-5, - double atol = 1e-8) + const std::string& msg = "Error: Incorrect results!", + double rtol = 1e-3, + double atol = 1e-3) { if(out.size() != ref.size()) { @@ -77,14 +78,15 @@ check_err(const std::vector& out, } bool res{true}; - int err_count = 0; - double err = 0; - double max_err = ck::type_convert(ck::NumericLimits::Min()); + int err_count = 0; + double err = 0; + // TODO: This is a hack. We should have proper specialization for bhalf_t data type. 
+ double max_err = std::numeric_limits::min(); for(std::size_t i = 0; i < ref.size(); ++i) { - float o = ck::type_convert(out[i]); - float r = ck::type_convert(ref[i]); - err = std::abs(o - r); + double o = type_convert(out[i]); + double r = type_convert(ref[i]); + err = std::abs(o - r); if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) { max_err = err > max_err ? err : max_err; @@ -105,11 +107,14 @@ check_err(const std::vector& out, return res; } -bool check_err(const std::vector& out, - const std::vector& ref, - const std::string& msg, - ck::half_t rtol = static_cast(1e-3f), - ck::half_t atol = static_cast(1e-3f)) +template +typename std::enable_if::value || std::is_same::value, + bool>::type +check_err(const std::vector& out, + const std::vector& ref, + const std::string& msg = "Error: Incorrect results!", + double rtol = 1e-3, + double atol = 1e-3) { if(out.size() != ref.size()) { @@ -122,20 +127,20 @@ bool check_err(const std::vector& out, bool res{true}; int err_count = 0; double err = 0; - double max_err = std::numeric_limits::min(); + double max_err = std::numeric_limits::min(); for(std::size_t i = 0; i < ref.size(); ++i) { - double out_ = double(out[i]); - double ref_ = double(ref[i]); - err = std::abs(out_ - ref_); - if(err > atol + rtol * std::abs(ref_) || !std::isfinite(out_) || !std::isfinite(ref_)) + double o = type_convert(out[i]); + double r = type_convert(ref[i]); + err = std::abs(o - r); + if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) { max_err = err > max_err ? 
err : max_err; err_count++; if(err_count < 5) { std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" - << i << "]: " << out_ << "!=" << ref_ << std::endl + << i << "]: " << o << " != " << r << std::endl << msg << std::endl; } res = false; @@ -149,13 +154,12 @@ bool check_err(const std::vector& out, } template -typename std::enable_if::value && !std::is_same::value, - bool>::type +typename std::enable_if::value && !std::is_same::value, bool>::type check_err(const std::vector& out, const std::vector& ref, - const std::string& msg, - double = 0, - double = 0) + const std::string& msg = "Error: Incorrect results!", + double = 0, + double = 0) { if(out.size() != ref.size()) { @@ -178,7 +182,8 @@ check_err(const std::vector& out, return true; } -} // namespace test +} // namespace utils +} // namespace ck template std::ostream& operator<<(std::ostream& os, const std::vector& v) diff --git a/library/include/ck/library/utility/conv_fwd_util.hpp b/library/include/ck/library/utility/conv_fwd_util.hpp new file mode 100644 index 0000000000..f758b808c3 --- /dev/null +++ b/library/include/ck/library/utility/conv_fwd_util.hpp @@ -0,0 +1,554 @@ +#ifndef CONV_FWD_UTIL_HPP +#define CONV_FWD_UTIL_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "device.hpp" +#include "device_conv_fwd.hpp" +#include "device_tensor.hpp" +#include "element_wise_operation.hpp" +#include "host_tensor.hpp" +#include "reference_conv_fwd.hpp" +#include "tensor_layout.hpp" + +namespace ck { +namespace utils { +namespace conv { + +using DeviceConvFwdNoOpPtr = + ck::tensor_operation::device::DeviceConvFwdPtr; + +/** + * @brief Calculate number of FLOPs for Convolution + * + * @param[in] N Batch size. + * @param[in] C Number of input channels. + * @param[in] K Number of output channels. + * @param[in] filter_spatial_lengths Filter spatial dimensions lengths. 
+ * @param[in] output_spatial_lengths Convolution output spatial dimensions + * lengths. + * + * @return The number of flops. + */ +std::size_t get_flops(ck::index_t N, + ck::index_t C, + ck::index_t K, + const std::vector& filter_spatial_lengths, + const std::vector& output_spatial_lengths) +{ + // 2 * N * K * * C * + return static_cast(2) * N * K * + std::accumulate(std::begin(output_spatial_lengths), + std::end(output_spatial_lengths), + static_cast(1), + std::multiplies()) * + C * + std::accumulate(std::begin(filter_spatial_lengths), + std::end(filter_spatial_lengths), + static_cast(1), + std::multiplies()); +} + +/** + * @brief Calculate number of bytes read/write by convolution algorithm. + * + * @param[in] N Batch size. + * @param[in] C Number of input channels. + * @param[in] K Number of output channels. + * @param[in] input_spatial_lengths Input spatial dimensions lengths. + * @param[in] filter_spatial_lengths Filter spatial dimensions lengths. + * @param[in] output_spatial_lengths Output spatial dimensions lengths + * + * @tparam InDataType Input tensor data type. + * @tparam WeiDataType Weights tensor data type. + * @tparam OutDataType Output tensor data type. + * + * @return The number of used bytes. 
+ */ +template +std::size_t get_btype(ck::index_t N, + ck::index_t C, + ck::index_t K, + const std::vector& input_spatial_lengths, + const std::vector& filter_spatial_lengths, + const std::vector& output_spatial_lengths) +{ + // sizeof(InDataType) * (N * C * ) + + // sizeof(WeiDataType) * (K * C * ) + + // sizeof(OutDataType) * (N * K * ); + return sizeof(InDataType) * (N * C * + std::accumulate(std::begin(input_spatial_lengths), + std::end(input_spatial_lengths), + static_cast(1), + std::multiplies())) + + sizeof(WeiDataType) * (K * C * + std::accumulate(std::begin(filter_spatial_lengths), + std::end(filter_spatial_lengths), + static_cast(1), + std::multiplies())) + + sizeof(OutDataType) * (N * K * + std::accumulate(std::begin(output_spatial_lengths), + std::end(output_spatial_lengths), + static_cast(1), + std::multiplies())); +} + +struct ConvParams +{ + ConvParams() + : num_dim_spatial(2), + N(128), + K(256), + C(192), + filter_spatial_lengths(2, 3), + input_spatial_lengths(2, 71), + conv_filter_strides(2, 2), + conv_filter_dilations(2, 1), + input_left_pads(2, 1), + input_right_pads(2, 1) + { + } + + ConvParams(ck::index_t n_dim, + ck::index_t n_batch, + ck::index_t n_out_channels, + ck::index_t n_in_channels, + const std::vector& filters_len, + const std::vector& input_len, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads, + const std::vector& right_pads) + : num_dim_spatial(n_dim), + N(n_batch), + K(n_out_channels), + C(n_in_channels), + filter_spatial_lengths(filters_len), + input_spatial_lengths(input_len), + conv_filter_strides(strides), + conv_filter_dilations(dilations), + input_left_pads(left_pads), + input_right_pads(right_pads) + { + if(filter_spatial_lengths.size() != num_dim_spatial || + input_spatial_lengths.size() != num_dim_spatial || + conv_filter_strides.size() != num_dim_spatial || + conv_filter_dilations.size() != num_dim_spatial || + input_left_pads.size() != num_dim_spatial || 
input_right_pads.size() != num_dim_spatial) + { + throw(std::runtime_error( + "ConvParams::GetOutputSpatialLengths: " + "parameter size is different from number of declared dimensions!")); + } + } + + ck::index_t num_dim_spatial; + ck::index_t N; + ck::index_t K; + ck::index_t C; + + std::vector filter_spatial_lengths; + std::vector input_spatial_lengths; + + std::vector conv_filter_strides; + std::vector conv_filter_dilations; + + std::vector input_left_pads; + std::vector input_right_pads; + + std::vector GetOutputSpatialLengths() const + { + if(filter_spatial_lengths.size() != num_dim_spatial || + input_spatial_lengths.size() != num_dim_spatial || + conv_filter_strides.size() != num_dim_spatial || + conv_filter_dilations.size() != num_dim_spatial || + input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial) + { + throw(std::runtime_error( + "ConvParams::GetOutputSpatialLengths: " + "parameter size is different from number of declared dimensions!")); + } + + std::vector out_spatial_len(num_dim_spatial, 0); + for(ck::index_t i = 0; i < num_dim_spatial; ++i) + { + // XEff = (X - 1) * conv_dilation_w + 1; + // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + const ck::index_t idx_eff = + (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1; + out_spatial_len[i] = + (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) / + conv_filter_strides[i] + + 1; + } + return out_spatial_len; + } +}; + +/** + * @brief Gets the host tensor descriptor. + * + * @param[in] dims The tensor dimensions lengths. Always in NCHW format. + * @param[in] layout The tensor data layout. + * + * @tparam TensorLayout Layout type. + * + * @return The host tensor descriptor object. 
+ */ +template +HostTensorDescriptor get_host_tensor_descriptor(const std::vector& dims, + const TensorLayout& layout) +{ + std::size_t C = dims[1]; + // 1D + if constexpr(std::is_same::value || + std::is_same::value || + std::is_same::value) + { + + return HostTensorDescriptor(dims, std::vector({C * dims[2], dims[2], 1})); + } + else if constexpr(std::is_same::value || + std::is_same::value || + std::is_same::value) + { + return HostTensorDescriptor(dims, std::vector({C * dims[2], 1, C})); + } + // 2D + else if constexpr(std::is_same::value || + std::is_same::value || + std::is_same::value) + { + + return HostTensorDescriptor( + dims, std::vector{C * dims[2] * dims[3], dims[2] * dims[3], dims[3], 1}); + } + else if constexpr(std::is_same::value || + std::is_same::value || + std::is_same::value) + { + return HostTensorDescriptor( + dims, std::vector{C * dims[2] * dims[3], 1, dims[3] * C, C}); + } + // 3D + else if constexpr(std::is_same::value || + std::is_same::value || + std::is_same::value) + { + + return HostTensorDescriptor(dims, + std::vector{C * dims[2] * dims[3] * dims[4], + dims[2] * dims[3] * dims[4], + dims[3] * dims[4], + dims[4], + 1}); + } + else if constexpr(std::is_same::value || + std::is_same::value || + std::is_same::value) + { + return HostTensorDescriptor( + dims, + std::vector{ + C * dims[2] * dims[3] * dims[4], 1, C * dims[3] * dims[4], C * dims[4], C}); + } + + std::stringstream err_msg; + err_msg << "Unsupported data layout provided: " << layout << "!"; + throw std::runtime_error(err_msg.str()); +} + +template +auto get_host_tensors(const ConvParams& params, bool init = true) +{ + std::vector input_dims{static_cast(params.N), + static_cast(params.C)}; + input_dims.insert(std::end(input_dims), + std::begin(params.input_spatial_lengths), + std::end(params.input_spatial_lengths)); + + std::vector filter_dims{static_cast(params.K), + static_cast(params.C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(params.filter_spatial_lengths), 
+ std::end(params.filter_spatial_lengths)); + + const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); + std::vector output_dims{static_cast(params.N), + static_cast(params.K)}; + output_dims.insert(std::end(output_dims), + std::begin(output_spatial_lengths), + std::end(output_spatial_lengths)); + + Tensor input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{})); + Tensor weights( + ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{})); + Tensor host_output( + ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{})); + Tensor device_output( + ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{})); + + if(init) + { + std::mt19937 gen(11939); + if constexpr(std::is_same::value) + { + std::uniform_int_distribution<> dis(-5, 5); + std::generate( + input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); }); + std::generate( + weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); }); + } + else + { + std::uniform_real_distribution<> dis(0.f, 1.f); + std::generate( + input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); }); + std::generate( + weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); }); + } + std::fill(host_output.begin(), host_output.end(), OutDataType(0.f)); + std::fill(device_output.begin(), device_output.end(), OutDataType(0.f)); + } + + return std::make_tuple(input, weights, host_output, device_output); +} + +HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial = 2) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{}); + } + default: { + throw 
std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial = 2) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial = 2) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +template +void run_reference_convolution_forward(const ConvParams& params, + const Tensor& input, + const Tensor& weights, + Tensor& output) +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weights, + output, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + ref_invoker.Run(ref_argument); +} + +template + class DeviceConvNDFwdInstance> +void run_convolution_forward(const ConvParams& params, + const Tensor& input, + const Tensor& weights, + Tensor& output) +{ + using 
PassThrough = ck::tensor_operation::element_wise::PassThrough; + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); + DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace()); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weights.mData.data()); + const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); + + auto conv = DeviceConvNDFwdInstance(); + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "Error! 
device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + invoker.Run(argument); + out_device_buf.FromDevice(output.mData.data()); +} + +template +bool run_convolution_forward_instances(const ConvParams& params, + const std::vector& conv_ptrs, + const Tensor& input, + const Tensor& weights, + Tensor& output, + const Tensor& host_output) +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); + DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace()); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weights.mData.data()); + const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); + + bool res{true}; + for(auto& conv_ptr : conv_ptrs) + { + auto invoker = conv_ptr->MakeInvokerPointer(); + auto argument = conv_ptr->MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + if(conv_ptr->IsSupportedArgument(argument.get())) + { + float atol{1e-5f}; + float rtol{1e-4f}; + if constexpr(std::is_same_v) + { + atol = 1e-4f; + rtol = 2.5e-3f; + } + invoker->Run(argument.get()); + out_device_buf.FromDevice(output.mData.data()); + res = res && + ck::utils::check_err( + output.mData, host_output.mData, "Error: incorrect results!", atol, rtol); + hipGetErrorString( + hipMemset(out_device_buf.GetDeviceBuffer(), 0, out_device_buf.mMemSize)); + } + } + return res; +} + +} // namespace conv +} 
// namespace utils +} // namespace ck + +#endif diff --git a/library/src/host_tensor/host_tensor.cpp b/library/src/host_tensor/host_tensor.cpp index 76d420e00b..38b0796635 100644 --- a/library/src/host_tensor/host_tensor.cpp +++ b/library/src/host_tensor/host_tensor.cpp @@ -65,21 +65,10 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream } #if 1 -// FIXME: remove -float bf16_to_f32_(ck::bhalf_t src_val) -{ - union - { - uint32_t int32; - float fp32; - } u = {uint32_t(src_val) << 16}; - return u.fp32; -} - // FIXME: remove void bf16_to_f32_(const Tensor& src, Tensor& dst) { for(int i = 0; i < src.mData.size(); ++i) - dst.mData[i] = bf16_to_f32_(src.mData[i]); + dst.mData[i] = ck::type_convert(src.mData[i]); } #endif diff --git a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp index 40337d674a..a7541f03de 100644 --- a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp +++ b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "debug.hpp" #include "print.hpp" @@ -401,7 +403,7 @@ int main(int argc, char* argv[]) make_tuple(in_right_pad_h, in_right_pad_w), activ_type); - check_error(add_host, add_device); + ck::utils::check_err(add_device.mData, add_host.mData); if(do_log) { diff --git a/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp b/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp index f350f7f071..c4dcb7c085 100644 --- a/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp +++ b/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "debug.hpp" #include "print.hpp" @@ -473,7 +475,7 @@ int main(int argc, char* argv[]) 
make_tuple(in_right_pad_h, in_right_pad_w), layout); - check_error(in_host, in_device); + ck::utils::check_err(in_device.mData, in_host.mData); if(do_log) { diff --git a/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp b/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp index 9bdca437c9..ab8beec87b 100644 --- a/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp +++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "debug.hpp" #include "print.hpp" @@ -534,7 +536,7 @@ int main(int argc, char* argv[]) make_tuple(in_right_pad_h, in_right_pad_w), layout); - check_error(out_host, out_device); + ck::utils::check_err(out_device.mData, out_host.mData); if(do_log) { diff --git a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp index 4b3e037fc0..6fb8b4c2aa 100644 --- a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp +++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "debug.hpp" #include "print.hpp" @@ -377,7 +379,7 @@ int main(int argc, char* argv[]) make_tuple(in_right_pad_h, in_right_pad_w), activ_type); - check_error(out_host, out_device); + ck::utils::check_err(out_device.mData, out_host.mData); if(do_log) { diff --git a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp index c3e6027925..fb7e8e975b 100644 --- a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp +++ b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include 
"debug.hpp" #include "print.hpp" @@ -397,8 +399,8 @@ int main(int argc, char* argv[]) make_tuple(in_right_pad_h, in_right_pad_w), activ_type); - check_error(out_host, out_device); - check_error(max_host, max_device); + ck::utils::check_err(out_device.mData, out_host.mData); + ck::utils::check_err(max_device.mData, max_host.mData); if(do_log) { diff --git a/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp b/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp index 253b5c2377..1ac974202c 100644 --- a/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp +++ b/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "debug.hpp" #include "print.hpp" @@ -517,7 +519,7 @@ int main(int argc, char* argv[]) make_tuple(in_right_pad_h, in_right_pad_w), layout); - check_error(wei_host, wei_device); + ck::utils::check_err(wei_device.mData, wei_host.mData); if(do_log) { diff --git a/library/src/obselete_driver_offline/gemm_driver_offline.cpp b/library/src/obselete_driver_offline/gemm_driver_offline.cpp index 8e281f71b1..a09cb932d6 100644 --- a/library/src/obselete_driver_offline/gemm_driver_offline.cpp +++ b/library/src/obselete_driver_offline/gemm_driver_offline.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "debug.hpp" #include "print.hpp" @@ -441,7 +443,7 @@ int main(int argc, char* argv[]) { host_gemm(a, b, c_host, layout); - check_error(c_host, c_device); + ck::utils::check_err(c_device.mData, c_host.mData); if(do_log) { diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index aca34ccf77..a2cf6eeb62 100644 --- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -15,6 +15,7 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce 
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu + ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility ${PROJECT_SOURCE_DIR}/profiler/include ${PROJECT_SOURCE_DIR}/external/include/half ) diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp index 7c39ce685c..51fcba910f 100644 --- a/profiler/include/profile_batched_gemm_impl.hpp +++ b/profiler/include/profile_batched_gemm_impl.hpp @@ -2,6 +2,7 @@ #include +#include "check_err.hpp" #include "config.hpp" #include "element_wise_operation.hpp" #include "tensor_layout.hpp" @@ -393,7 +394,6 @@ bool profile_batched_gemm_impl(int do_verification, } else { - float err = check_error(c_g_m_n_host_result, c_g_m_n_device_result); pass = pass && (err < 1E-6); } diff --git a/profiler/include/profile_conv_bwd_data_impl.hpp b/profiler/include/profile_conv_bwd_data_impl.hpp index 587142499c..bec97e40f5 100644 --- a/profiler/include/profile_conv_bwd_data_impl.hpp +++ b/profiler/include/profile_conv_bwd_data_impl.hpp @@ -1,4 +1,6 @@ #pragma once + +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -253,7 +255,8 @@ void profile_conv_bwd_data_impl(int do_verification, { in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data()); - check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result); + ck::utils::check_err(in_n_c_hi_wi_device_result.mData, + in_n_c_hi_wi_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp index 286323c629..d0de7307d2 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp @@ -1,4 +1,6 @@ #pragma once + +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ 
-245,7 +247,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, { out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result); + ck::utils::check_err(out_n_k_ho_wo_device_result.mData, + out_n_k_ho_wo_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp index c17d184e84..9bdfa61283 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp @@ -1,4 +1,5 @@ #pragma once +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -301,7 +302,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification, { out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result); + ck::utils::check_err(out_n_k_ho_wo_device_result.mData, + out_n_k_ho_wo_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp index cd68f992e9..f34e52048e 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp @@ -1,4 +1,5 @@ #pragma once +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -233,7 +234,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, { out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result); + ck::utils::check_err(out_n_k_ho_wo_device_result.mData, + out_n_k_ho_wo_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_conv_fwd_impl.hpp b/profiler/include/profile_conv_fwd_impl.hpp index 95d6535485..6038cd4612 100644 --- 
a/profiler/include/profile_conv_fwd_impl.hpp +++ b/profiler/include/profile_conv_fwd_impl.hpp @@ -1,4 +1,6 @@ #pragma once + +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -253,7 +255,8 @@ void profile_conv_fwd_impl(int do_verification, { out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result); + ck::utils::check_err(out_n_k_ho_wo_device_result.mData, + out_n_k_ho_wo_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp index 87254e7a0c..4f9038a72b 100644 --- a/profiler/include/profile_convnd_bwd_data_impl.hpp +++ b/profiler/include/profile_convnd_bwd_data_impl.hpp @@ -1,7 +1,7 @@ #pragma once #include "config.hpp" #include "device.hpp" -#include "conv_utils.hpp" +#include "conv_fwd_util.hpp" #include "host_tensor.hpp" #include "host_tensor_generator.hpp" #include "tensor_layout.hpp" @@ -68,13 +68,13 @@ HostTensorDescriptor get_input_host_tensor_descriptor(const std::vectorRun(argument_ptr.get(), nrepeat); std::size_t flop = - ck::conv_util::GetFlops(N, C, K, filter_spatial_lengths, output_spatial_lengths); - std::size_t num_btype = ck::conv_util::GetBtype( - N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths); + ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths); + std::size_t num_btype = + ck::utils::conv::get_btype( + N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths); float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time; diff --git a/profiler/include/profile_gemm_bias_2d_impl.hpp b/profiler/include/profile_gemm_bias_2d_impl.hpp index 4980726d96..98e4ad76c9 100644 --- a/profiler/include/profile_gemm_bias_2d_impl.hpp +++ b/profiler/include/profile_gemm_bias_2d_impl.hpp @@ -1,4 +1,6 @@ #pragma once + 
+#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -283,7 +285,7 @@ void profile_gemm_bias_2d_impl(int do_verification, { c_device_buf.FromDevice(c_m_n_device_result.mData.data()); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_gemm_bias_relu_add_impl.hpp b/profiler/include/profile_gemm_bias_relu_add_impl.hpp index f6625a8b22..75ed78075b 100644 --- a/profiler/include/profile_gemm_bias_relu_add_impl.hpp +++ b/profiler/include/profile_gemm_bias_relu_add_impl.hpp @@ -1,4 +1,6 @@ #pragma once + +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -257,7 +259,7 @@ void profile_gemm_bias_relu_add_impl(int do_verification, { c_device_buf.FromDevice(c_m_n_device_result.mData.data()); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_gemm_bias_relu_impl.hpp b/profiler/include/profile_gemm_bias_relu_impl.hpp index 55b6e39064..0735f3c31b 100644 --- a/profiler/include/profile_gemm_bias_relu_impl.hpp +++ b/profiler/include/profile_gemm_bias_relu_impl.hpp @@ -1,4 +1,6 @@ #pragma once + +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -236,7 +238,7 @@ void profile_gemm_bias_relu_impl(int do_verification, { c_device_buf.FromDevice(c_m_n_device_result.mData.data()); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp index 409c1fd43c..f266188844 100644 --- a/profiler/include/profile_gemm_impl.hpp +++ b/profiler/include/profile_gemm_impl.hpp @@ -1,5 +1,7 @@ #pragma once #include + +#include 
"check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -470,7 +472,7 @@ void profile_gemm_impl(int do_verification, ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_f32_result); + ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData); if(do_log) { @@ -499,7 +501,7 @@ void profile_gemm_impl(int do_verification, a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_result); + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp index 4bdff7cbfc..cced480c36 100644 --- a/profiler/include/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profile_grouped_gemm_impl.hpp @@ -1,5 +1,7 @@ #pragma once #include + +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -283,7 +285,7 @@ void profile_grouped_gemm_impl(int do_verification, c_element_op); ref_invoker.Run(ref_argument); - check_error(c_m_n_host_result, c_m_n_device_results[i]); + ck::utils::check_err(c_m_n_device_results[i].mData, c_m_n_host_result.mData); if(do_log) { diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index e5c7b5e656..db7886e4b0 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -1,4 +1,6 @@ #pragma once + +#include "check_err.hpp" #include "device_reduce.hpp" #include "device_reduce_instance.hpp" #include "reduction_enums.hpp" @@ -455,12 +457,13 @@ void profile_reduce_impl_impl(bool do_verification, if(do_verification) { out_dev.FromDevice(out.mData.data()); - check_error(out_ref, out); + ck::utils::check_err(out.mData, out_ref.mData); if(NeedIndices) { out_indices_dev.FromDevice(out_indices.mData.data()); - 
check_indices(out_indices_ref, out_indices); + ck::utils::check_err(out_indices.mData, out_indices_ref.mData); + ; }; if(do_log) @@ -577,12 +580,13 @@ void profile_reduce_impl_impl(bool do_verification, if(do_verification) { out_dev.FromDevice(out.mData.data()); - check_error(out_ref, out); + ck::utils::check_err(out.mData, out_ref.mData); if(NeedIndices) { out_indices_dev.FromDevice(out_indices.mData.data()); - check_indices(out_indices_ref, out_indices); + ck::utils::check_err(out_indices.mData, out_indices_ref.mData); + ; }; if(do_log) diff --git a/profiler/src/profile_convnd_bwd_data.cpp b/profiler/src/profile_convnd_bwd_data.cpp index 655417434b..9de9170b57 100644 --- a/profiler/src/profile_convnd_bwd_data.cpp +++ b/profiler/src/profile_convnd_bwd_data.cpp @@ -32,10 +32,10 @@ enum struct ConvOutputLayout NKHW, // 0 NHWK, // 1 }; -ck::conv_util::ConvParams parse_conv_params(int num_dim_spatial, char* argv[], int arg_idx) +ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[], int arg_idx) { // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.num_dim_spatial = num_dim_spatial; params.N = std::stoi(argv[arg_idx++]); @@ -106,7 +106,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) const bool do_log = std::stoi(argv[8]); const int nrepeat = std::stoi(argv[9]); - ck::conv_util::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams); + ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams); auto Run = [&](auto input_type, auto wei_type, auto out_type, auto acc_type) { using InDataType = decltype(input_type); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 23e73bd5a7..ae9949b8ce 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,6 +15,7 @@ include_directories(BEFORE 
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu + ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility ${PROJECT_SOURCE_DIR}/test/include ${PROJECT_SOURCE_DIR}/profiler/include ${PROJECT_SOURCE_DIR}/external/include/half diff --git a/test/batched_gemm/batched_gemm_fp16.cpp b/test/batched_gemm/batched_gemm_fp16.cpp index 24ba347206..c039e344d2 100644 --- a/test/batched_gemm/batched_gemm_fp16.cpp +++ b/test/batched_gemm/batched_gemm_fp16.cpp @@ -1,7 +1,7 @@ -#include "profile_batched_gemm_impl.hpp" - #include +#include "profile_batched_gemm_impl.hpp" + namespace { using ADataType = ck::half_t; using BDataType = ck::half_t; diff --git a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp index 561e35e377..bb3ed985e3 100644 --- a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp +++ b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp @@ -6,13 +6,13 @@ #include #include -#include "conv_utils.hpp" +#include "conv_fwd_util.hpp" #include "profile_conv_bwd_weight_impl.hpp" int test_self() { bool pass = true; - std::vector params; + std::vector params; params.push_back({2, 128, 256, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); params.push_back({2, 128, 256, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); @@ -136,16 +136,16 @@ int main(int argc, char* argv[]) exit(1); } - ck::conv_util::ConvParams param{2, - N, - K, - C, - {Y, X}, - {Hi, Wi}, - {conv_stride_h, conv_stride_w}, - {conv_dilation_h, conv_dilation_w}, - {in_left_pad_h, in_left_pad_w}, - {in_right_pad_h, in_right_pad_w}}; + ck::utils::conv::ConvParams param{2, + N, + K, + C, + {Y, X}, + {Hi, Wi}, + {conv_stride_h, conv_stride_w}, + {conv_dilation_h, conv_dilation_w}, + {in_left_pad_h, in_left_pad_w}, + {in_right_pad_h, in_right_pad_w}}; if(data_type == 0) { pass = 
ck::profiler::profile_conv_bwd_weight_impl<2, diff --git a/test/conv_util/conv_util.cpp b/test/conv_util/conv_util.cpp index 9f95cc8eba..cc487c39e3 100644 --- a/test/conv_util/conv_util.cpp +++ b/test/conv_util/conv_util.cpp @@ -3,13 +3,13 @@ #include #include "config.hpp" -#include "conv_utils.hpp" +#include "conv_fwd_util.hpp" #include "tensor_layout.hpp" -#include "test_util.hpp" +#include "check_err.hpp" namespace { -bool TestConvParams_GetOutputSpatialLengths() +bool test_conv_params_get_output_spatial_lengths() { bool res{true}; // -------------------------- default 2D ------------------------------------ @@ -18,28 +18,28 @@ bool TestConvParams_GetOutputSpatialLengths() // stride {2,2}, // dilations {1,1}, // padding {{1,1}, {1,1}} - ck::conv_util::ConvParams conv_params; + ck::utils::conv::ConvParams conv_params; std::vector out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err(out_spatial_len, - std::vector{36, 36}, - "Error: ConvParams 2D default constructor."); + res = ck::utils::check_err(out_spatial_len, + std::vector{36, 36}, + "Error: ConvParams 2D default constructor."); conv_params.conv_filter_strides = std::vector{1, 1}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err( + res = ck::utils::check_err( out_spatial_len, std::vector{71, 71}, "Error: ConvParams 2D stride {1,1}."); conv_params.conv_filter_strides = std::vector{2, 2}; conv_params.input_left_pads = std::vector{2, 2}; conv_params.input_right_pads = std::vector{2, 2}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err(out_spatial_len, - std::vector{37, 37}, - "Error: ConvParams 2D padding left/right {2,2}."); + res = ck::utils::check_err(out_spatial_len, + std::vector{37, 37}, + "Error: ConvParams 2D padding left/right {2,2}."); conv_params.conv_filter_dilations = std::vector{2, 2}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err( + res = ck::utils::check_err( 
out_spatial_len, std::vector{36, 36}, "Error: ConvParams 2D dilation {2,2}."); conv_params.conv_filter_strides = std::vector{3, 3}; @@ -47,9 +47,10 @@ bool TestConvParams_GetOutputSpatialLengths() conv_params.input_right_pads = std::vector{1, 1}; conv_params.conv_filter_dilations = std::vector{2, 2}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err(out_spatial_len, - std::vector{23, 23}, - "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}."); + res = + ck::utils::check_err(out_spatial_len, + std::vector{23, 23}, + "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}."); // -------------------------- 1D ------------------------------------ conv_params.num_dim_spatial = 1; @@ -61,24 +62,25 @@ bool TestConvParams_GetOutputSpatialLengths() conv_params.input_right_pads = std::vector{1}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err(out_spatial_len, std::vector{36}, "Error: ConvParams 1D."); + res = ck::utils::check_err( + out_spatial_len, std::vector{36}, "Error: ConvParams 1D."); - conv_params.conv_filter_strides = std::vector{1, 1}; + conv_params.conv_filter_strides = std::vector{1}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err( + res = ck::utils::check_err( out_spatial_len, std::vector{71}, "Error: ConvParams 1D stride {1}."); conv_params.conv_filter_strides = std::vector{2}; conv_params.input_left_pads = std::vector{2}; conv_params.input_right_pads = std::vector{2}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err(out_spatial_len, - std::vector{37}, - "Error: ConvParams 1D padding left/right {2}."); + res = ck::utils::check_err(out_spatial_len, + std::vector{37}, + "Error: ConvParams 1D padding left/right {2}."); conv_params.conv_filter_dilations = std::vector{2}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err( + res = ck::utils::check_err( out_spatial_len, 
std::vector{36}, "Error: ConvParams 1D dilation {2}."); conv_params.conv_filter_strides = std::vector{3}; @@ -86,9 +88,9 @@ bool TestConvParams_GetOutputSpatialLengths() conv_params.input_right_pads = std::vector{1}; conv_params.conv_filter_dilations = std::vector{2}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err(out_spatial_len, - std::vector{23}, - "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}."); + res = ck::utils::check_err(out_spatial_len, + std::vector{23}, + "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}."); // -------------------------- 3D ------------------------------------ conv_params.num_dim_spatial = 3; @@ -100,35 +102,35 @@ bool TestConvParams_GetOutputSpatialLengths() conv_params.input_right_pads = std::vector{1, 1, 1}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err( + res = ck::utils::check_err( out_spatial_len, std::vector{36, 36, 36}, "Error: ConvParams 3D."); conv_params.conv_filter_strides = std::vector{1, 1, 1}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err(out_spatial_len, - std::vector{71, 71, 71}, - "Error: ConvParams 3D stride {1, 1, 1}."); + res = ck::utils::check_err(out_spatial_len, + std::vector{71, 71, 71}, + "Error: ConvParams 3D stride {1, 1, 1}."); conv_params.conv_filter_strides = std::vector{2, 2, 2}; conv_params.input_left_pads = std::vector{2, 2, 2}; conv_params.input_right_pads = std::vector{2, 2, 2}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err(out_spatial_len, - std::vector{37, 37, 37}, - "Error: ConvParams 3D padding left/right {2, 2, 2}."); + res = ck::utils::check_err(out_spatial_len, + std::vector{37, 37, 37}, + "Error: ConvParams 3D padding left/right {2, 2, 2}."); conv_params.conv_filter_dilations = std::vector{2, 2, 2}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err(out_spatial_len, - std::vector{36, 36, 36}, - "Error: 
ConvParams 3D dilation {2, 2, 2}."); + res = ck::utils::check_err(out_spatial_len, + std::vector{36, 36, 36}, + "Error: ConvParams 3D dilation {2, 2, 2}."); conv_params.conv_filter_strides = std::vector{3, 3, 3}; conv_params.input_left_pads = std::vector{1, 1, 1}; conv_params.input_right_pads = std::vector{1, 1, 1}; conv_params.conv_filter_dilations = std::vector{2, 2, 2}; out_spatial_len = conv_params.GetOutputSpatialLengths(); - res = test::check_err( + res = ck::utils::check_err( out_spatial_len, std::vector{23, 23, 23}, "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}."); @@ -136,50 +138,54 @@ bool TestConvParams_GetOutputSpatialLengths() return res; } -bool TestGetHostTensorDescriptor() +bool test_get_host_tensor_descriptor() { bool res{true}; namespace tl = ck::tensor_layout::convolution; std::vector dims{2, 3, 4, 5}; - HostTensorDescriptor h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{}); - res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!"); - res = test::check_err( + HostTensorDescriptor h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{}); + res = + ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!"); + res = ck::utils::check_err( h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!"); - h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCHW{}); - res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!"); - res = test::check_err( + h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCHW{}); + res = + ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!"); + res = ck::utils::check_err( h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!"); dims = std::vector{2, 3, 4}; - h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{}); - res = test::check_err(h.GetLengths(), {2, 3, 4}, 
"Error: wrong NWC dimensions lengths!"); - res = test::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!"); + h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{}); + res = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!"); + res = + ck::utils::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!"); - h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCW{}); - res = test::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!"); - res = test::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!"); + h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCW{}); + res = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!"); + res = + ck::utils::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!"); dims = std::vector{2, 3, 4, 5, 6}; - h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{}); - res = test::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!"); - res = test::check_err(h.GetStrides(), - {3 * 4 * 5 * 6, // N - 1, // C - 3 * 5 * 6, // D - 3 * 6, // H - 3}, // W - "Error: wrong NDHWC dimensions strides!"); + h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{}); + res = ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!"); + res = ck::utils::check_err(h.GetStrides(), + {3 * 4 * 5 * 6, // N + 1, // C + 3 * 5 * 6, // D + 3 * 6, // H + 3}, // W + "Error: wrong NDHWC dimensions strides!"); - h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCDHW{}); - res = test::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!"); - res = test::check_err(h.GetStrides(), - {3 * 4 * 5 * 6, // N - 4 * 5 * 6, // C - 5 * 6, // D - 6, // H - 1}, // W - "Error: wrong NCDHW dimensions strides!"); + h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCDHW{}); + res = 
ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!"); + res = ck::utils::check_err(h.GetStrides(), + {3 * 4 * 5 * 6, // N + 4 * 5 * 6, // C + 5 * 6, // D + 6, // H + 1}, // W + "Error: wrong NCDHW dimensions strides!"); return res; } @@ -188,10 +194,11 @@ bool TestGetHostTensorDescriptor() int main(void) { - bool res = TestConvParams_GetOutputSpatialLengths(); - std::cout << "TestConvParams_GetOutputSpatialLengths ..... " << (res ? "SUCCESS" : "FAILURE") + bool res = test_conv_params_get_output_spatial_lengths(); + std::cout << "test_conv_params_get_output_spatial_lengths ..... " + << (res ? "SUCCESS" : "FAILURE") << std::endl; + res = test_get_host_tensor_descriptor(); + std::cout << "test_get_host_tensor_descriptor ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestGetHostTensorDescriptor(); - std::cout << "TestGetHostTensorDescriptor ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; return res ? 0 : 1; } diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/test/convnd_bwd_data/convnd_bwd_data.cpp index 53c339fa8c..cbc215033b 100644 --- a/test/convnd_bwd_data/convnd_bwd_data.cpp +++ b/test/convnd_bwd_data/convnd_bwd_data.cpp @@ -12,7 +12,7 @@ int main() { bool pass = true; // check 1d - std::vector params; + std::vector params; params.push_back({1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); params.push_back({1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); params.push_back({1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); diff --git a/test/convnd_fwd/conv1d_fwd.cpp b/test/convnd_fwd/conv1d_fwd.cpp index 039432acb3..e6df0e6f8c 100644 --- a/test/convnd_fwd/conv1d_fwd.cpp +++ b/test/convnd_fwd/conv1d_fwd.cpp @@ -5,10 +5,11 @@ #include "data_type.hpp" #include "element_wise_operation.hpp" -#include "conv_test_util.hpp" +#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "host_tensor.hpp" #include "tensor_layout.hpp" -#include "test_util.hpp" +#include "check_err.hpp" // Forward declarations 
for conv instances. @@ -34,10 +35,10 @@ void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector{1}; params.input_right_pads = std::vector{1}; - auto host_tensors = test::conv::GetHostTensors(params); + auto host_tensors = + ck::utils::conv::get_host_tensors(params); const Tensor& input = std::get<0>(host_tensors); const Tensor& weights = std::get<1>(host_tensors); Tensor& host_output = std::get<2>(host_tensors); Tensor& device_output = std::get<3>(host_tensors); - test::conv::RunReferenceConv<1>(params, input, weights, host_output); + ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output); test::conv::RunConv<1>(params, input, weights, device_output); res = res && - test::check_err( + ck::utils::check_err( device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); return res; } template -bool TestConv1DNWCInstances(const std::vector& conv_ptrs) +bool test_conv1d_nwc_instances(const std::vector& conv_ptrs) { - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.num_dim_spatial = 1; params.filter_spatial_lengths = std::vector{3}; params.input_spatial_lengths = std::vector{71}; @@ -81,51 +83,52 @@ bool TestConv1DNWCInstances(const std::vector& conv_ptrs) params.input_left_pads = std::vector{1}; params.input_right_pads = std::vector{1}; - auto host_tensors = test::conv::GetHostTensors(params); + auto host_tensors = + ck::utils::conv::get_host_tensors(params); const Tensor& input = std::get<0>(host_tensors); const Tensor& weights = std::get<1>(host_tensors); Tensor& host_output = std::get<2>(host_tensors); Tensor& device_output = std::get<3>(host_tensors); - test::conv::RunReferenceConv<1>(params, input, weights, host_output); - return test::conv::RunConvInstances<1>( + ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output); + return ck::utils::conv::run_convolution_forward_instances<1>( params, conv_ptrs, input, weights, 
device_output, host_output); } -bool TestConv1DNWCBF16Instances() +bool test_conv1d_nwc_bf16_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv1d_fwd_instance:: add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs); - return TestConv1DNWCInstances(conv_ptrs); + return test_conv1d_nwc_instances(conv_ptrs); } -bool TestConv1DNWCF16Instances() +bool test_conv1d_nwc_f16_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv1d_fwd_instance:: add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs); - return TestConv1DNWCInstances(conv_ptrs); + return test_conv1d_nwc_instances(conv_ptrs); } -bool TestConv1DNWCF32Instances() +bool test_conv1d_nwc_f32_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv1d_fwd_instance:: add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs); - return TestConv1DNWCInstances(conv_ptrs); + return test_conv1d_nwc_instances(conv_ptrs); } -bool TestConv1DNWCInt8Instances() +bool test_conv1d_nwc_int8_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv1d_fwd_instance:: add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs); - return TestConv1DNWCInstances(conv_ptrs); + return test_conv1d_nwc_instances(conv_ptrs); } } // anonymous namespace @@ -133,18 +136,20 @@ bool TestConv1DNWCInt8Instances() int main() { bool res{true}; - res = TestConv1DNWC(); - std::cout << "TestConv1DNWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; + res = test_conv1D_nwc(); + std::cout << "test_conv1D_nwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv1DNWCBF16Instances(); + res = test_conv1d_nwc_bf16_instances(); std::cout << "\nTestConv1DNWCBF16Instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv1DNWCF16Instances(); - std::cout << "\nTestConv1DNWCF16Instances ..... " << (res ? 
"SUCCESS" : "FAILURE") << std::endl; - res = TestConv1DNWCF32Instances(); - std::cout << "\nTestConv1DNWCF32Instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv1DNWCInt8Instances(); - std::cout << "\nTestConv1DNWCInt8Instances ..... " << (res ? "SUCCESS" : "FAILURE") + res = test_conv1d_nwc_f16_instances(); + std::cout << "\ntest_conv1d_nwc_f16_instances ..... " << (res ? "SUCCESS" : "FAILURE") + << std::endl; + res = test_conv1d_nwc_f32_instances(); + std::cout << "\ntest_conv1d_nwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE") + << std::endl; + res = test_conv1d_nwc_int8_instances(); + std::cout << "\ntest_conv1d_nwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; return res ? 0 : 1; diff --git a/test/convnd_fwd/conv2d_fwd.cpp b/test/convnd_fwd/conv2d_fwd.cpp index 834b3c637f..2a46d74495 100644 --- a/test/convnd_fwd/conv2d_fwd.cpp +++ b/test/convnd_fwd/conv2d_fwd.cpp @@ -6,10 +6,11 @@ #include "data_type.hpp" #include "element_wise_operation.hpp" -#include "conv_test_util.hpp" +#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "host_tensor.hpp" #include "tensor_layout.hpp" -#include "test_util.hpp" +#include "check_err.hpp" // Forward declarations for conv instances. 
using DeviceConvFwdNoOpPtr = @@ -36,35 +37,35 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector{16, 16}; params.conv_filter_strides = std::vector{1, 1}; - auto host_tensors = test::conv::GetHostTensors(params); + auto host_tensors = ck::utils::conv::get_host_tensors(params); const Tensor& input = std::get<0>(host_tensors); const Tensor& weights = std::get<1>(host_tensors); Tensor& host_output = std::get<2>(host_tensors); Tensor& device_output = std::get<3>(host_tensors); - test::conv::RunReferenceConv<2>(params, input, weights, host_output); + ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output); test::conv::RunConv<2>(params, input, weights, device_output); res = res && - test::check_err( + ck::utils::check_err( device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); return res; } template -bool TestConv2DNHWCInstances(const std::vector& conv_ptrs) +bool test_conv2d_nhwc_instances(const std::vector& conv_ptrs) { - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.num_dim_spatial = 2; params.filter_spatial_lengths = std::vector{3, 3}; params.input_spatial_lengths = std::vector{71, 71}; @@ -73,54 +74,55 @@ bool TestConv2DNHWCInstances(const std::vector& conv_ptrs) params.input_left_pads = std::vector{1, 1}; params.input_right_pads = std::vector{1, 1}; - auto host_tensors = test::conv::GetHostTensors(params); + auto host_tensors = + ck::utils::conv::get_host_tensors(params); const Tensor& input = std::get<0>(host_tensors); const Tensor& weights = std::get<1>(host_tensors); Tensor& host_output = std::get<2>(host_tensors); Tensor& device_output = std::get<3>(host_tensors); - test::conv::RunReferenceConv<2>(params, input, weights, host_output); - return test::conv::RunConvInstances<2>( + ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output); + return ck::utils::conv::run_convolution_forward_instances<2>( params, 
conv_ptrs, input, weights, device_output, host_output); } -bool TestConv2DNHWCBF16Instances() +bool test_conv2d_nhwc_bf16_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv2d_fwd_instance:: add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); - return TestConv2DNHWCInstances(conv_ptrs); + return test_conv2d_nhwc_instances(conv_ptrs); } -bool TestConv2DNHWCF16Instances() +bool test_conv2d_nhwc_f16_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv2d_fwd_instance:: add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); ck::tensor_operation::device::device_conv2d_fwd_instance:: add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); - return TestConv2DNHWCInstances(conv_ptrs); + return test_conv2d_nhwc_instances(conv_ptrs); } -bool TestConv2DNHWCF32Instances() +bool test_conv2d_nhwc_f32_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv2d_fwd_instance:: add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); - return TestConv2DNHWCInstances(conv_ptrs); + return test_conv2d_nhwc_instances(conv_ptrs); } -bool TestConv2DNHWCInt8Instances() +bool test_conv2d_nhwc_int8_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv2d_fwd_instance:: add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); - return TestConv2DNHWCInstances(conv_ptrs); + return test_conv2d_nhwc_instances(conv_ptrs); } } // anonymous namespace @@ -128,19 +130,20 @@ bool TestConv2DNHWCInt8Instances() int main() { bool res{true}; - res = TestConv2DNHWC(); - std::cout << "TestConv2DNHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; + res = test_conv2d_nhwc(); + std::cout << "test_conv2d_nhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv2DNHWCBF16Instances(); - std::cout << "\nTestConv2DNHWCBF16Instances ..... " << (res ? 
"SUCCESS" : "FAILURE") + res = test_conv2d_nhwc_bf16_instances(); + std::cout << "\ntest_conv2d_nhwc_bf16_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv2DNHWCF16Instances(); - std::cout << "\nTestConv2DNHWCF16Instances ....." << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv2DNHWCF32Instances(); - std::cout << "\nTestConv2DNHWCF32Instances ..... " << (res ? "SUCCESS" : "FAILURE") + res = test_conv2d_nhwc_f16_instances(); + std::cout << "\ntest_conv2d_nhwc_f16_instances ....." << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv2DNHWCInt8Instances(); - std::cout << "\nTestConv2DNHWCInt8Instances ..... " << (res ? "SUCCESS" : "FAILURE") + res = test_conv2d_nhwc_f32_instances(); + std::cout << "\ntest_conv2d_nhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE") + << std::endl; + res = test_conv2d_nhwc_int8_instances(); + std::cout << "\ntest_conv2d_nhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; return res ? 0 : 1; diff --git a/test/convnd_fwd/conv3d_fwd.cpp b/test/convnd_fwd/conv3d_fwd.cpp index 2d6244d57c..3dc1a6b160 100644 --- a/test/convnd_fwd/conv3d_fwd.cpp +++ b/test/convnd_fwd/conv3d_fwd.cpp @@ -6,10 +6,11 @@ #include "data_type.hpp" #include "element_wise_operation.hpp" -#include "conv_test_util.hpp" +#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "host_tensor.hpp" #include "tensor_layout.hpp" -#include "test_util.hpp" +#include "check_err.hpp" // Forward declarations for conv instances. 
using DeviceConvFwdNoOpPtr = @@ -34,10 +35,10 @@ void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector{1, 1, 1}; params.input_right_pads = std::vector{1, 1, 1}; - auto host_tensors = test::conv::GetHostTensors(params); + auto host_tensors = + ck::utils::conv::get_host_tensors(params); const Tensor& input = std::get<0>(host_tensors); const Tensor& weights = std::get<1>(host_tensors); Tensor& host_output = std::get<2>(host_tensors); Tensor& device_output = std::get<3>(host_tensors); - test::conv::RunReferenceConv<3>(params, input, weights, host_output); + ck::utils::conv::run_reference_convolution_forward<3>(params, input, weights, host_output); test::conv::RunConv<3>(params, input, weights, device_output); res = res && - test::check_err( + ck::utils::check_err( device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); return res; } -bool TestConv3DNDHWC2GBInput() +bool test_conv3d_ndhwc_2gb_input() { // >2GB Input - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.num_dim_spatial = 3; params.N = 2; params.K = 16; @@ -85,12 +87,12 @@ bool TestConv3DNDHWC2GBInput() params.input_right_pads = std::vector{1, 1, 1}; auto host_tensors = - test::conv::GetHostTensors(params, false); + ck::utils::conv::get_host_tensors(params, false); const Tensor& input = std::get<0>(host_tensors); const Tensor& weights = std::get<1>(host_tensors); Tensor& device_output = std::get<3>(host_tensors); @@ -113,10 +115,10 @@ bool TestConv3DNDHWC2GBInput() return false; } -bool TestConv3DNDHWC2GBFilters() +bool test_conv3d_ndhwc_2gb_filters() { // >2GB Filters - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.num_dim_spatial = 3; params.N = 2; params.K = 16; @@ -129,12 +131,12 @@ bool TestConv3DNDHWC2GBFilters() params.input_right_pads = std::vector{1, 1, 1}; auto host_tensors = - test::conv::GetHostTensors(params, false); + ck::utils::conv::get_host_tensors(params, false); const Tensor& 
input = std::get<0>(host_tensors); const Tensor& weights = std::get<1>(host_tensors); Tensor& device_output = std::get<3>(host_tensors); @@ -157,10 +159,10 @@ bool TestConv3DNDHWC2GBFilters() return false; } -bool TestConv3DNDHWC2GBOutput() +bool test_conv3d_ndhwc_2gb_output() { // >2GB Output - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.num_dim_spatial = 3; params.N = 2; params.K = 16; @@ -173,12 +175,12 @@ bool TestConv3DNDHWC2GBOutput() params.input_right_pads = std::vector{2, 2, 2}; auto host_tensors = - test::conv::GetHostTensors(params, false); + ck::utils::conv::get_host_tensors(params, false); const Tensor& input = std::get<0>(host_tensors); const Tensor& weights = std::get<1>(host_tensors); Tensor& device_output = std::get<3>(host_tensors); @@ -202,9 +204,9 @@ bool TestConv3DNDHWC2GBOutput() } template -bool TestConv3DNDHWCInstances(const std::vector& conv_ptrs) +bool test_conv3d_ndhwc_instances(const std::vector& conv_ptrs) { - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.N = 64; params.num_dim_spatial = 3; params.filter_spatial_lengths = std::vector{3, 3, 2}; @@ -214,52 +216,53 @@ bool TestConv3DNDHWCInstances(const std::vector& conv_ptrs params.input_left_pads = std::vector{1, 1, 1}; params.input_right_pads = std::vector{1, 1, 1}; - auto host_tensors = test::conv::GetHostTensors(params); + auto host_tensors = + ck::utils::conv::get_host_tensors(params); const Tensor& input = std::get<0>(host_tensors); const Tensor& weights = std::get<1>(host_tensors); Tensor& host_output = std::get<2>(host_tensors); Tensor& device_output = std::get<3>(host_tensors); - test::conv::RunReferenceConv<3>(params, input, weights, host_output); - return test::conv::RunConvInstances<3>( + ck::utils::conv::run_reference_convolution_forward<3>(params, input, weights, host_output); + return ck::utils::conv::run_convolution_forward_instances<3>( params, conv_ptrs, input, weights, device_output, host_output); } 
-bool TestConv3DNDHWCBF16Instances() +bool test_conv3d_ndhwc_bf16_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv3d_fwd_instance:: add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs); - return TestConv3DNDHWCInstances(conv_ptrs); + return test_conv3d_ndhwc_instances(conv_ptrs); } -bool TestConv3DNDHWCF16Instances() +bool test_conv3d_ndhwc_f16_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv3d_fwd_instance:: add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs); - return TestConv3DNDHWCInstances(conv_ptrs); + return test_conv3d_ndhwc_instances(conv_ptrs); } -bool TestConv3DNDHWCF32Instances() +bool test_conv3d_ndhwc_f32_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv3d_fwd_instance:: add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs); - return TestConv3DNDHWCInstances(conv_ptrs); + return test_conv3d_ndhwc_instances(conv_ptrs); } -bool TestConv3DNDHWCInt8Instances() +bool test_conv3d_ndhwc_int8_instances() { std::vector conv_ptrs; ck::tensor_operation::device::device_conv3d_fwd_instance:: add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs); - return TestConv3DNDHWCInstances(conv_ptrs); + return test_conv3d_ndhwc_instances(conv_ptrs); } } // anonymous namespace @@ -267,27 +270,30 @@ bool TestConv3DNDHWCInt8Instances() int main() { bool res{true}; - res = TestConv3DNDHWC(); - std::cout << "TestConv3DNDHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; + res = test_conv3d_ndhwc(); + std::cout << "test_conv3d_ndhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv3DNDHWC2GBInput(); - std::cout << "\nTestConv3DNDHWC2GBInput ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv3DNDHWC2GBFilters(); - std::cout << "\nTestConv3DNDHWC2GBFilters ..... " << (res ? 
"SUCCESS" : "FAILURE") << std::endl; - res = TestConv3DNDHWC2GBOutput(); - std::cout << "\nTestConv3DNDHWC2GBOutput ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; + res = test_conv3d_ndhwc_2gb_input(); + std::cout << "\ntest_conv3d_ndhwc_2gb_input ..... " << (res ? "SUCCESS" : "FAILURE") + << std::endl; + res = test_conv3d_ndhwc_2gb_filters(); + std::cout << "\ntest_conv3d_ndhwc_2gb_filters ..... " << (res ? "SUCCESS" : "FAILURE") + << std::endl; + res = test_conv3d_ndhwc_2gb_output(); + std::cout << "\ntest_conv3d_ndhwc_2gb_output ..... " << (res ? "SUCCESS" : "FAILURE") + << std::endl; - res = TestConv3DNDHWCBF16Instances(); - std::cout << "\nTestConv3DNDHWCBF16Instances ..... " << (res ? "SUCCESS" : "FAILURE") + res = test_conv3d_ndhwc_bf16_instances(); + std::cout << "\ntest_conv3d_ndhwc_bf16_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv3DNDHWCF16Instances(); - std::cout << "\nTestConv3DNDHWCF16Instances ..... " << (res ? "SUCCESS" : "FAILURE") + res = test_conv3d_ndhwc_f16_instances(); + std::cout << "\ntest_conv3d_ndhwc_f16_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv3DNDHWCF32Instances(); - std::cout << "\nTestConv3DNDHWCF32Instances ..... " << (res ? "SUCCESS" : "FAILURE") + res = test_conv3d_ndhwc_f32_instances(); + std::cout << "\ntest_conv3d_ndhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv3DNDHWCInt8Instances(); - std::cout << "\nTestConv3DNDHWCInt8Instances ..... " << (res ? "SUCCESS" : "FAILURE") + res = test_conv3d_ndhwc_int8_instances(); + std::cout << "\ntest_conv3d_ndhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; return res ? 
0 : 1; diff --git a/test/convnd_fwd/conv_util.hpp b/test/convnd_fwd/conv_util.hpp new file mode 100644 index 0000000000..d62dab7366 --- /dev/null +++ b/test/convnd_fwd/conv_util.hpp @@ -0,0 +1,90 @@ +#ifndef TEST_CONV_UTIL_HPP +#define TEST_CONV_UTIL_HPP + +#include + +#include "config.hpp" +#include "conv_fwd_util.hpp" +#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" +#include "element_wise_operation.hpp" +#include "host_tensor.hpp" +#include "sequence.hpp" + +namespace { + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +template +using DeviceConvNDFwdInstance = ck::tensor_operation::device:: + DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< + // clang-format off + InDataType, // + WeiDataType, // + OutDataType, // + InDataType, // + InElementOp, // Input Elementwise Operation + WeiElementOp, // Weights Elementwise Operation + OutElementOp, // Output Elementwise Operation + ConvFwdDefault, // ConvForwardSpecialization + SpatialDims, // SpatialDims + 64, // BlockSize + 16, // MPerBlock + 16, // NPerBlock + 4, // K0PerBlock + 1, // K1 + 16, // MPerXDL + 16, // NPerXDL + 1, // MXdlPerWave + 1, // NXdlPerWave + S<1, 16, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 1, // ABlockTransferSrcScalarPerVector + 1, // ABlockTransferDstScalarPerVector_K1 + true, // 
BBlockTransferSrcScalarPerVector + 1, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockTransferAddExtraN + 7, // CThreadTransferSrcDstVectorDim + 1>; // CThreadTransferDstScalarPerVector +// clang-format on + +} // namespace + +namespace test { +namespace conv { + +template +void RunConv(const ck::utils::conv::ConvParams& params, + const Tensor& input, + const Tensor& weights, + Tensor& output) +{ + ck::utils::conv::run_convolution_forward( + params, input, weights, output); +} + +} // namespace conv +} // namespace test + +#endif diff --git a/test/gemm/gemm_bf16.cpp b/test/gemm/gemm_bf16.cpp index 98c96b8b58..3f08acb1e6 100644 --- a/test/gemm/gemm_bf16.cpp +++ b/test/gemm/gemm_bf16.cpp @@ -19,7 +19,6 @@ #include "element_wise_operation.hpp" #include "reference_gemm.hpp" #include "gemm_specialization.hpp" -#include "test_util.hpp" using PassThrough = ck::tensor_operation::element_wise::PassThrough; diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp index cd68158402..6c86085f3b 100644 --- a/test/gemm/gemm_fp32.cpp +++ b/test/gemm/gemm_fp32.cpp @@ -19,7 +19,6 @@ #include "element_wise_operation.hpp" #include "reference_gemm.hpp" #include "gemm_specialization.hpp" -#include "test_util.hpp" using PassThrough = ck::tensor_operation::element_wise::PassThrough; diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp index bb3dbdf43b..864fca8df4 100644 --- a/test/gemm/gemm_int8.cpp +++ b/test/gemm/gemm_int8.cpp @@ -19,7 +19,6 @@ #include "element_wise_operation.hpp" #include "reference_gemm.hpp" #include "gemm_specialization.hpp" -#include "test_util.hpp" using PassThrough = ck::tensor_operation::element_wise::PassThrough; diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index a2502c04ef..08c8edfb94 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -1,13 +1,13 @@ #ifndef GEMM_UTILS_HPP #define GEMM_UTILS_HPP +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" #include 
"host_tensor_generator.hpp" #include "reference_gemm.hpp" #include "tensor_layout.hpp" -#include "test_util.hpp" namespace ck { namespace gemm_util { @@ -202,20 +202,17 @@ struct TestGemm bool res = false; if(std::is_same::value) { - res = test::check_err(c_device.mData, c_host.mData, "Error: incorrect results!"); - + res = ck::utils::check_err(c_device.mData, c_host.mData); std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; } else if(std::is_same::value) { - res = test::check_err(c_device.mData, c_host.mData, "Error: incorrect results!"); - + res = ck::utils::check_err(c_device.mData, c_host.mData); std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; } else if(std::is_same::value) { - res = test::check_err(c_device.mData, c_host.mData, "Error: incorrect results!"); - + res = ck::utils::check_err(c_device.mData, c_host.mData); std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; } @@ -330,9 +327,8 @@ struct TestGemmBF16 bf16_to_f32_(c_device_bf16, c_device_fp32); // Assert - bool res = test::check_err( + bool res = ck::utils::check_err( c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f); - std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; return res; diff --git a/test/grouped_gemm/grouped_gemm_fp16.cpp b/test/grouped_gemm/grouped_gemm_fp16.cpp index 1568f4935f..2260b01462 100644 --- a/test/grouped_gemm/grouped_gemm_fp16.cpp +++ b/test/grouped_gemm/grouped_gemm_fp16.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "print.hpp" #include "device.hpp" @@ -15,7 +17,6 @@ #include "element_wise_operation.hpp" #include "reference_gemm.hpp" #include "gemm_specialization.hpp" -#include "test_util.hpp" using PassThrough = ck::tensor_operation::element_wise::PassThrough; @@ -46,24 +47,6 @@ using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; using CLayout = ck::tensor_layout::gemm::RowMajor; -template -static bool check_err(const Tensor& ref, const Tensor& result) -{ - float max_diff = 1e-2; - - for(int i = 0; i < ref.mData.size(); ++i) - { - float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); - if(max_diff < diff) - { - std::cout << double(ref.mData[i]) << "," << double(result.mData[i]) << std::endl; - return false; - } - } - - return true; -} - bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr) { int group_count = rand() % 10 + 1; @@ -188,7 +171,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr) ref_invoker.Run(ref_argument); - bool res = check_err(c_device_tensors[i], c_host_tensors[i]); + bool res = ck::utils::check_err(c_host_tensors[i].mData, c_device_tensors[i].mData); std::cout << "group_id: " << i << (res ? 
" SUCCESS" : " FAILURE") << std::endl; diff --git a/test/include/conv_test_util.hpp b/test/include/conv_test_util.hpp deleted file mode 100644 index 31bde8e99d..0000000000 --- a/test/include/conv_test_util.hpp +++ /dev/null @@ -1,289 +0,0 @@ -#ifndef TEST_CONV_UTIL_HPP -#define TEST_CONV_UTIL_HPP - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "config.hpp" -#include "conv_utils.hpp" -#include "device.hpp" -#include "device_tensor.hpp" -#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" -#include "element_wise_operation.hpp" -#include "host_tensor.hpp" -#include "reference_conv_fwd.hpp" -#include "tensor_layout.hpp" -#include "test_util.hpp" - -namespace { - -template -using S = ck::Sequence; - -using InElementOp = ck::tensor_operation::element_wise::PassThrough; -using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; -using OutElementOp = ck::tensor_operation::element_wise::PassThrough; - -static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - -template -using DeviceConvNDFwdInstance = ck::tensor_operation::device:: - DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< - // clang-format off - InDataType, // - WeiDataType, // - OutDataType, // - InDataType, // - InElementOp, // Input Elementwise Operation - WeiElementOp, // Weights Elementwise Operation - OutElementOp, // Output Elementwise Operation - ConvFwdDefault, // ConvForwardSpecialization - SpatialDims, // SptialDims - 64, // BlockSize - 16, // MPerBlock - 16, // NPerBlock - 4, // K0PerBlock - 1, // K1 - 16, // MPerXDL - 16, // NPerXDL - 1, // MXdlPerWave - 1, // NXdlPerWave - S<1, 16, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 1, // ABlockTransferSrcScalarPerVector - 1, // ABlockTransferDstScalarPerVector_K1 - true, // 
ABlockLdsAddExtraM - S<1, 16, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 1, // BBlockTransferSrcScalarPerVector - 1, // BBlockTransferDstScalarPerVector_K1 - true, // BBlockTransferAddExtraN - 7, // CThreadTransferSrcDstVectorDim - 1>; // CThreadTransferDstScalarPerVector -// clang-format on - -} // namespace - -namespace test { -namespace conv { - -using DeviceConvFwdNoOpPtr = - ck::tensor_operation::device::DeviceConvFwdPtr; - -template -auto GetHostTensors(const ck::conv_util::ConvParams& params, bool init = true) -{ - std::vector input_dims{static_cast(params.N), - static_cast(params.C)}; - input_dims.insert(std::end(input_dims), - std::begin(params.input_spatial_lengths), - std::end(params.input_spatial_lengths)); - - std::vector filter_dims{static_cast(params.K), - static_cast(params.C)}; - filter_dims.insert(std::end(filter_dims), - std::begin(params.filter_spatial_lengths), - std::end(params.filter_spatial_lengths)); - - const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - std::vector output_dims{static_cast(params.N), - static_cast(params.K)}; - output_dims.insert(std::end(output_dims), - std::begin(output_spatial_lengths), - std::end(output_spatial_lengths)); - - Tensor input(ck::conv_util::GetHostTensorDescriptor(input_dims, InLayout{})); - Tensor weights(ck::conv_util::GetHostTensorDescriptor(filter_dims, WeiLayout{})); - Tensor host_output( - ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{})); - Tensor device_output( - ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{})); - - if(init) - { - std::mt19937 gen(11939); - if constexpr(std::is_same::value) - { - std::uniform_int_distribution<> dis(-5, 5); - std::generate( - input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); }); - std::generate( - weights.begin(), weights.end(), [&dis, 
&gen]() { return WeiDataType(dis(gen)); }); - } - else - { - std::uniform_real_distribution<> dis(0.f, 1.f); - std::generate( - input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); }); - std::generate( - weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); }); - } - std::fill(host_output.begin(), host_output.end(), OutDataType(0.f)); - std::fill(device_output.begin(), device_output.end(), OutDataType(0.f)); - } - - return std::make_tuple(input, weights, host_output, device_output); -} - -template -void RunReferenceConv(const ck::conv_util::ConvParams& params, - const Tensor& input, - const Tensor& weights, - Tensor& output) -{ - auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); - auto ref_invoker = ref_conv.MakeInvoker(); - auto ref_argument = ref_conv.MakeArgument(input, - weights, - output, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); - - ref_invoker.Run(ref_argument); -} - -template -void RunConv(const ck::conv_util::ConvParams& params, - const Tensor& input, - const Tensor& weights, - Tensor& output) -{ - DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace()); - - in_device_buf.ToDevice(input.mData.data()); - wei_device_buf.ToDevice(weights.mData.data()); - const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - - auto conv = DeviceConvNDFwdInstance(); - auto invoker = conv.MakeInvoker(); - auto argument = conv.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, - 
output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); - - if(!conv.IsSupportedArgument(argument)) - { - throw std::runtime_error( - "Error! device_conv with the specified compilation parameters does " - "not support this Conv problem"); - } - - invoker.Run(argument); - out_device_buf.FromDevice(output.mData.data()); -} - -template -bool RunConvInstances(const ck::conv_util::ConvParams& params, - const std::vector& conv_ptrs, - const Tensor& input, - const Tensor& weights, - Tensor& output, - const Tensor& host_output) -{ - DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace()); - - in_device_buf.ToDevice(input.mData.data()); - wei_device_buf.ToDevice(weights.mData.data()); - const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - - bool res{true}; - for(auto& conv_ptr : conv_ptrs) - { - auto invoker = conv_ptr->MakeInvokerPointer(); - auto argument = conv_ptr->MakeArgumentPointer( - static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, - output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); - - if(conv_ptr->IsSupportedArgument(argument.get())) - { - float atol{1e-5f}; - float rtol{1e-4f}; - if constexpr(std::is_same_v) - { - atol = 1e-4f; - rtol = 2.5e-3f; - } - invoker->Run(argument.get()); - out_device_buf.FromDevice(output.mData.data()); - res = res && - test::check_err( - output.mData, 
host_output.mData, "Error: incorrect results!", atol, rtol); - hipGetErrorString( - hipMemset(out_device_buf.GetDeviceBuffer(), 0, out_device_buf.mMemSize)); - } - } - return res; -} - -} // namespace conv -} // namespace test - -#endif diff --git a/test/magic_number_division/magic_number_division.cpp b/test/magic_number_division/magic_number_division.cpp index 267882e0cb..751a62be19 100644 --- a/test/magic_number_division/magic_number_division.cpp +++ b/test/magic_number_division/magic_number_division.cpp @@ -4,6 +4,8 @@ #include #include #include + +#include "check_err.hpp" #include "config.hpp" #include "magic_division.hpp" #include "device.hpp" @@ -54,29 +56,6 @@ __host__ void cpu_magic_number_division(uint32_t magic_multiplier, } } -template -T check_error(const std::vector& ref, const std::vector& result) -{ - T error = 0; - T max_diff = 0; - T ref_value = 0, result_value = 0; - - for(std::size_t i = 0; i < ref.size(); ++i) - { - T diff = std::abs(ref[i] - result[i]); - error += diff; - - if(max_diff < diff) - { - max_diff = diff; - ref_value = ref[i]; - result_value = result[i]; - } - } - - return max_diff; -} - int main(int, char*[]) { uint64_t num_divisor = 4096; @@ -135,9 +114,9 @@ int main(int, char*[]) naive_result_dev_buf.FromDevice(naive_result_host.data()); magic_result_dev_buf.FromDevice(magic_result_host.data()); - int32_t max_diff = check_error(naive_result_host, magic_result_host); + bool res = ck::utils::check_err(magic_result_host, naive_result_host); - if(max_diff != 0) + if(!res) { pass = false; continue; @@ -149,9 +128,9 @@ int main(int, char*[]) magic_result_host2.data(), num_dividend); - max_diff = check_error(naive_result_host, magic_result_host2); + res = ck::utils::check_err(magic_result_host2, naive_result_host); - if(max_diff != 0) + if(!res) { pass = false; continue; diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp index f031648881..6bb35f3fa6 100644 --- a/test/reduce/reduce_no_index.cpp +++ 
b/test/reduce/reduce_no_index.cpp @@ -1,10 +1,11 @@ #include "getopt.h" + +#include "check_err.hpp" #include "device_reduce_instance.hpp" #include "reduction_enums.hpp" #include "host_tensor.hpp" #include "host_tensor_generator.hpp" #include "host_reduction.hpp" -#include "test_util.hpp" #include "reduce_util.hpp" using namespace ck; @@ -289,13 +290,13 @@ bool test_reduce_no_index_impl(int init_method, { reduce_util::to_f32_vector(out, out_fp32); reduce_util::to_f32_vector(out_ref, out_ref_fp32); - single_result = test::check_err( + single_result = ck::utils::check_err( out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); } else { single_result = - test::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); + ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); }; if(!single_result) @@ -376,13 +377,13 @@ bool test_reduce_no_index_impl(int init_method, { reduce_util::to_f32_vector(out, out_fp32); reduce_util::to_f32_vector(out_ref, out_ref_fp32); - single_result = test::check_err( + single_result = ck::utils::check_err( out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); } else { single_result = - test::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); + ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); }; if(!single_result) diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp index 0a3692696d..de67da9352 100644 --- a/test/reduce/reduce_with_index.cpp +++ b/test/reduce/reduce_with_index.cpp @@ -4,7 +4,7 @@ #include "host_tensor.hpp" #include "host_tensor_generator.hpp" #include "host_reduction.hpp" -#include "test_util.hpp" +#include "check_err.hpp" #include "reduce_util.hpp" using namespace ck; @@ -273,21 +273,21 @@ bool test_reduce_with_index_impl(int init_method, { reduce_util::to_f32_vector(out, out_fp32); reduce_util::to_f32_vector(out_ref, out_ref_fp32); - single_result = test::check_err( + 
single_result = ck::utils::check_err( out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); } else { single_result = - test::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); + ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); }; if(NeedIndices) { out_indices_dev.FromDevice(out_indices.mData.data()); - single_result = single_result && test::check_err(out_indices_ref.mData, - out_indices.mData, - "Error: incorrect index result!"); + single_result = single_result && ck::utils::check_err(out_indices_ref.mData, + out_indices.mData, + "Error: incorrect index result!"); }; if(!single_result) @@ -370,21 +370,22 @@ bool test_reduce_with_index_impl(int init_method, { reduce_util::to_f32_vector(out, out_fp32); reduce_util::to_f32_vector(out_ref, out_ref_fp32); - single_result = test::check_err( + single_result = ck::utils::check_err( out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); } else { single_result = - test::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); + ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); }; if(NeedIndices) { out_indices_dev.FromDevice(out_indices.mData.data()); - single_result = single_result && test::check_err(out_indices_ref.mData, - out_indices.mData, - "Error: incorrect index result!"); + single_result = + single_result && ck::utils::check_err(out_indices_ref.mData, + out_indices.mData, + "Error: incorrect index result!"); }; if(!single_result) diff --git a/test/reference_conv_fwd/reference_conv_fwd.cpp b/test/reference_conv_fwd/reference_conv_fwd.cpp index 5e3b6f7458..d852e8f5eb 100644 --- a/test/reference_conv_fwd/reference_conv_fwd.cpp +++ b/test/reference_conv_fwd/reference_conv_fwd.cpp @@ -6,13 +6,13 @@ #include #include +#include "check_err.hpp" #include "config.hpp" -#include "conv_utils.hpp" +#include "conv_fwd_util.hpp" #include "element_wise_operation.hpp" #include "host_tensor.hpp" #include 
"reference_conv_fwd.hpp" #include "tensor_layout.hpp" -#include "test_util.hpp" namespace { using InElementOp = ck::tensor_operation::element_wise::PassThrough; @@ -57,9 +57,10 @@ template , typename FillWeightsOp = FillConstant> -Tensor RunReferenceConv(const ck::conv_util::ConvParams& params, - const FillInputOp& fill_input_op = FillInputOp{}, - const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f}) +Tensor +run_reference_convolution_forward(const ck::utils::conv::ConvParams& params, + const FillInputOp& fill_input_op = FillInputOp{}, + const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f}) { std::vector input_dims{static_cast(params.N), static_cast(params.C)}; @@ -80,18 +81,16 @@ Tensor RunReferenceConv(const ck::conv_util::ConvParams& params, std::begin(output_spatial_lengths), std::end(output_spatial_lengths)); - Tensor input(ck::conv_util::GetHostTensorDescriptor(input_dims, InLayout{})); - Tensor weights(ck::conv_util::GetHostTensorDescriptor(filter_dims, WeiLayout{})); + Tensor input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{})); + Tensor weights( + ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{})); Tensor host_output( - ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{})); + ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{})); fill_input_op(input.begin(), input.end()); fill_weights_op(weights.begin(), weights.end()); std::fill(host_output.begin(), host_output.end(), OutDataType(0.f)); - // std::cout <<"input: " << input.mDesc << std::endl << input.mData << std::endl; - // std::cout <<"weight: " << weights.mDesc << std::endl << weights.mData << std::endl; - auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd RunReferenceConv(const ck::conv_util::ConvParams& params, return host_output; } -bool TestConv2DNHWC() +bool test_conv2d_nhwc() { bool res{true}; - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.N = 1; params.K = 1; params.C = 
2; @@ -130,7 +129,7 @@ bool TestConv2DNHWC() params.input_left_pads = std::vector{0, 0}; params.input_right_pads = std::vector{0, 0}; - auto out_tensor = RunReferenceConv<2>(params); + auto out_tensor = run_reference_convolution_forward<2>(params); std::vector ref_dims{1, 1, 4, 4}; std::vector ref_data{130.5, 148.5, @@ -148,10 +147,10 @@ bool TestConv2DNHWC() 472.5, 490.5, 508.5}; - res = res && test::check_err(out_tensor.mDesc.GetLengths(), - ref_dims, - "Error: wrong output tensor dimensions!"); - res = res && test::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"); + res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(), + ref_dims, + "Error: wrong output tensor dimensions!"); + res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"); params.N = 1; params.K = 2; @@ -163,7 +162,7 @@ bool TestConv2DNHWC() params.input_left_pads = std::vector{1, 1}; params.input_right_pads = std::vector{1, 1}; - out_tensor = RunReferenceConv<2>(params); + out_tensor = run_reference_convolution_forward<2>(params); ref_dims = std::vector{1, 2, 5, 5}; ref_data = std::vector{ 210., 210., 327., 327., 351., 351., 375., 375., 399., 399., @@ -171,18 +170,18 @@ bool TestConv2DNHWC() 747., 747., 1138.5, 1138.5, 1174.5, 1174.5, 1210.5, 1210.5, 1246.5, 1246.5, 1035., 1035., 1570.5, 1570.5, 1606.5, 1606.5, 1642.5, 1642.5, 1678.5, 1678.5, 1323., 1323., 2002.5, 2002.5, 2038.5, 2038.5, 2074.5, 2074.5, 2110.5, 2110.5}; - res = res && test::check_err(out_tensor.mDesc.GetLengths(), - ref_dims, - "Error: wrong output tensor dimensions!"); - res = res && test::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"); + res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(), + ref_dims, + "Error: wrong output tensor dimensions!"); + res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"); return res; } -bool TestConv1DNWC() +bool test_conv1d_nwc() { bool res{true}; - 
ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.num_dim_spatial = 1; params.N = 1; params.K = 1; @@ -194,19 +193,20 @@ bool TestConv1DNWC() params.input_left_pads = std::vector{0}; params.input_right_pads = std::vector{0}; - auto out_tensor = RunReferenceConv<1, - float, - float, - float, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK>(params); + auto out_tensor = + run_reference_convolution_forward<1, + float, + float, + float, + ck::tensor_layout::convolution::NWC, + ck::tensor_layout::convolution::KXC, + ck::tensor_layout::convolution::NWK>(params); std::vector ref_dims{1, 1, 4}; std::vector ref_data{7.5, 13.5, 19.5, 25.5}; - res = res && test::check_err(out_tensor.mDesc.GetLengths(), - ref_dims, - "Error: wrong output tensor dimensions!"); - res = res && test::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"); + res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(), + ref_dims, + "Error: wrong output tensor dimensions!"); + res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"); params.num_dim_spatial = 1; params.N = 1; @@ -219,19 +219,19 @@ bool TestConv1DNWC() params.input_left_pads = std::vector{1}; params.input_right_pads = std::vector{1}; - out_tensor = RunReferenceConv<1, - float, - float, - float, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK>(params); + out_tensor = run_reference_convolution_forward<1, + float, + float, + float, + ck::tensor_layout::convolution::NWC, + ck::tensor_layout::convolution::KXC, + ck::tensor_layout::convolution::NWK>(params); ref_dims = std::vector{1, 2, 5}; ref_data = std::vector{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5}; - res = res && test::check_err(out_tensor.mDesc.GetLengths(), - ref_dims, - "Error: wrong output tensor dimensions!"); - res = res && 
test::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"); + res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(), + ref_dims, + "Error: wrong output tensor dimensions!"); + res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"); params.num_dim_spatial = 1; params.N = 2; @@ -244,13 +244,13 @@ bool TestConv1DNWC() params.input_left_pads = std::vector{1}; params.input_right_pads = std::vector{1}; - auto out_tensor2 = RunReferenceConv<1, - float, - float, - float, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK>( + auto out_tensor2 = run_reference_convolution_forward<1, + float, + float, + float, + ck::tensor_layout::convolution::NWC, + ck::tensor_layout::convolution::KXC, + ck::tensor_layout::convolution::NWK>( params, FillMonotonicSeq{0.f, 0.1f}); ref_dims = std::vector{2, 16, 16}; @@ -319,18 +319,18 @@ bool TestConv1DNWC() 72.9, 72.9, 72.9, 72.9, 72.9, 72.9, 72.9, 72.9, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4}; - res = res && test::check_err(out_tensor2.mDesc.GetLengths(), - ref_dims, - "Error: wrong output tensor dimensions!"); - res = res && test::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!"); + res = res && ck::utils::check_err(out_tensor2.mDesc.GetLengths(), + ref_dims, + "Error: wrong output tensor dimensions!"); + res = res && ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!"); return res; } -bool TestConv3DNCDHW() +bool test_conv3d_ncdhw() { bool res{true}; - ck::conv_util::ConvParams params; + ck::utils::conv::ConvParams params; params.num_dim_spatial = 3; params.N = 1; params.K = 1; @@ -342,13 +342,13 @@ bool TestConv3DNCDHW() params.input_left_pads = std::vector{0, 0, 0}; params.input_right_pads = std::vector{0, 0, 0}; - auto out_tensor = RunReferenceConv<3, - float, - float, - float, - ck::tensor_layout::convolution::NCDHW, 
- ck::tensor_layout::convolution::KCZYX, - ck::tensor_layout::convolution::NKDHW>( + auto out_tensor = run_reference_convolution_forward<3, + float, + float, + float, + ck::tensor_layout::convolution::NCDHW, + ck::tensor_layout::convolution::KCZYX, + ck::tensor_layout::convolution::NKDHW>( params, FillMonotonicSeq{0.f, 0.1f}); std::vector ref_dims{1, 1, 4, 4, 4}; std::vector ref_data{ @@ -360,10 +360,11 @@ bool TestConv3DNCDHW() 634.5, 637.2, 639.9, 642.60004, 650.7, 653.4, 656.10004, 658.8, 699.3, 702., 704.7, 707.4, 715.5, 718.2, 720.9, 723.60004, 731.7, 734.4001, 737.10004, 739.8, 747.9001, 750.60004, 753.3, 756.}; - res = res && test::check_err(out_tensor.mDesc.GetLengths(), - ref_dims, - "Error [case 1]: wrong output tensor dimensions!"); - res = res && test::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!"); + res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(), + ref_dims, + "Error [case 1]: wrong output tensor dimensions!"); + res = res && + ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!"); params.N = 1; params.K = 2; @@ -375,13 +376,13 @@ bool TestConv3DNCDHW() params.input_left_pads = std::vector{0, 0, 0}; params.input_right_pads = std::vector{0, 0, 0}; - out_tensor = RunReferenceConv<3, - float, - float, - float, - ck::tensor_layout::convolution::NCDHW, - ck::tensor_layout::convolution::KCZYX, - ck::tensor_layout::convolution::NKDHW>( + out_tensor = run_reference_convolution_forward<3, + float, + float, + float, + ck::tensor_layout::convolution::NCDHW, + ck::tensor_layout::convolution::KCZYX, + ck::tensor_layout::convolution::NKDHW>( params, FillMonotonicSeq{0.f, 0.1f}); ref_dims = std::vector{1, 2, 4, 4, 4}; ref_data = std::vector{ @@ -401,11 +402,11 @@ bool TestConv3DNCDHW() 5283.9004, 5292., 5300.0996, 5308.2, 5381.0996, 5389.2, 5397.3, 5405.4004, 6255.9004, 6264.0005, 6272.1, 6280.2, 6353.1, 6361.2, 6369.301, 6377.4, 6450.301, 6458.4, 6466.5, 6474.6, 6547.5, 6555.6, 6563.699, 
6571.801}; - res = res && test::check_err(out_tensor.mDesc.GetLengths(), - ref_dims, - "Error [case 2]: wrong output tensor dimensions!"); + res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(), + ref_dims, + "Error [case 2]: wrong output tensor dimensions!"); res = - res && test::check_err( + res && ck::utils::check_err( out_tensor.mData, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f); return res; @@ -416,11 +417,11 @@ bool TestConv3DNCDHW() int main(void) { bool res{true}; - res = TestConv2DNHWC(); - std::cout << "TestConv2DNHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv1DNWC(); + res = test_conv2d_nhwc(); + std::cout << "test_conv2d_nhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; + res = test_conv1d_nwc(); std::cout << "TestConv1DNHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - res = TestConv3DNCDHW(); - std::cout << "TestConv3DNCDHW ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; + res = test_conv3d_ncdhw(); + std::cout << "test_conv3d_ncdhw ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; return res ? 0 : 1; }