From 973edb014d72c115d94a128633a2915eaecb018d Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Thu, 26 May 2022 23:01:12 +0800 Subject: [PATCH] Add pooling example (#257) * Add example for computing LayerNorm mean and meansquare * Refactor the pool2d_fwd example and add example for float type testing * Revert "Add example for computing LayerNorm mean and meansquare" This reverts commit df52e6f9d897b00c981baa48f291450bcd60925d. * Tiny fix in pool2d_fwd_common.hpp [ROCm/composable_kernel commit: 97c4d486f46f26bc241be5565f373ca28221e454] --- example/13_pool2d_fwd/CMakeLists.txt | 4 +- example/13_pool2d_fwd/README.md | 27 +++- .../{pool2d_fwd.cpp => pool2d_fwd_common.hpp} | 142 ++++++------------ example/13_pool2d_fwd/pool2d_fwd_fp16.cpp | 116 ++++++++++++++ example/13_pool2d_fwd/pool2d_fwd_fp32.cpp | 116 ++++++++++++++ 5 files changed, 303 insertions(+), 102 deletions(-) rename example/13_pool2d_fwd/{pool2d_fwd.cpp => pool2d_fwd_common.hpp} (76%) create mode 100644 example/13_pool2d_fwd/pool2d_fwd_fp16.cpp create mode 100644 example/13_pool2d_fwd/pool2d_fwd_fp32.cpp diff --git a/example/13_pool2d_fwd/CMakeLists.txt b/example/13_pool2d_fwd/CMakeLists.txt index 1fdeb4c585..db09c03321 100644 --- a/example/13_pool2d_fwd/CMakeLists.txt +++ b/example/13_pool2d_fwd/CMakeLists.txt @@ -1 +1,3 @@ -add_example_executable(example_pool2d_fwd pool2d_fwd.cpp) +add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp) +add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp) + diff --git a/example/13_pool2d_fwd/README.md b/example/13_pool2d_fwd/README.md index 2314cfd670..9b017734e9 100644 --- a/example/13_pool2d_fwd/README.md +++ b/example/13_pool2d_fwd/README.md @@ -1,12 +1,12 @@ -# Instructions for ```example_pool2d_fwd``` Example +# Instructions for ```example_pool2d_fwd``` Examples -## Run ```example_pool2d_fwd``` +## Run ```example_pool2d_fwd_fp16``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) #arg3: time kernel (0=no, 1=yes) #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx -./bin/example_pool2d_fwd 1 1 1 +./bin/example_pool2d_fwd_fp16 1 1 1 ``` Result @@ -18,3 +18,24 @@ Warm up 1 time Start running 10 times... Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s ``` + +## Run ```example_pool2d_fwd_fp32``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx +./bin/example_pool2d_fwd_fp32 1 1 1 +``` + + +Result +``` +./bin/example_pool2d_fwd_fp32 1 1 1 +in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} +out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192} +launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s +``` diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp similarity index 76% rename from example/13_pool2d_fwd/pool2d_fwd.cpp rename to example/13_pool2d_fwd/pool2d_fwd_common.hpp index 662a48500f..632112a77a 100644 --- a/example/13_pool2d_fwd/pool2d_fwd.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp @@ -1,8 +1,6 @@ +#pragma once + #include -#include -#include -#include -#include #include "check_err.hpp" #include "config.hpp" @@ -13,44 +11,13 @@ #include "host_reduce_util.hpp" #include "device_tensor.hpp" #include "tensor_layout.hpp" -#include "reduction_operator.hpp" +#include "reduction_enums.hpp" #include "device_pool2d_fwd_nhwc_nhwc.hpp" -using InDataType = ck::half_t; -using OutDataType = ck::half_t; -using AccDataType = float; - -using IndexDataType = int32_t; - -using InLayout = ck::tensor_layout::convolution::NHWC; -using OutLayout = ck::tensor_layout::convolution::NHWC; - -#if 1 -static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; -#else -static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; -#endif - -static constexpr bool OutputIndex = false; -static constexpr bool PropagateNan = false; - -using DevicePoolFwdInstance = - ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C< - InDataType, // InDataType - OutDataType, // OutDataType - AccDataType, // AccDataType - ReduceOpId, - OutputIndex, - 64, // BlockSize - 64, // ReduceMThreadClusterSize - 1, // ReduceKThreadClusterSize - 4, // ReduceMThreadSliceSize - 1, // ReduceKThreadSliceSize - 4>; // InSrcOutDstVectorSize - template @@ -147,68 +114,46 @@ static void pool_host_verify(const Tensor& in, }; } -int main(int argc, char* argv[]) +template +bool pool_test(bool do_verification, + int init_method, + bool time_kernel, + ck::index_t N, + ck::index_t C, + ck::index_t Y, + ck::index_t X, + ck::index_t Hi, + ck::index_t Wi, + ck::index_t window_stride_h, + ck::index_t window_stride_w, + ck::index_t in_left_pad_h, + ck::index_t in_left_pad_w, + ck::index_t in_right_pad_h, + ck::index_t in_right_pad_w) { using namespace ck::host_reduce; - bool do_verification; - int init_method; - bool time_kernel; - - // Pool shape - ck::index_t N = 128; - ck::index_t C = 192; - ck::index_t Y = 3; - ck::index_t X = 3; - ck::index_t Hi = 71; - ck::index_t Wi = 71; - ck::index_t window_stride_h = 2; - ck::index_t window_stride_w = 2; - ck::index_t in_left_pad_h = 1; - ck::index_t in_left_pad_w = 1; - ck::index_t in_right_pad_h = 1; - ck::index_t in_right_pad_w = 1; - - if(argc == 1) - { - do_verification = true; - init_method = 1; - time_kernel = true; - } - else if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = static_cast(std::stoi(argv[3])); - } - else if(argc == 16) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = static_cast(std::stoi(argv[3])); - - N = std::stoi(argv[4]); - C = std::stoi(argv[5]); - Y = std::stoi(argv[6]); - X = std::stoi(argv[7]); - Hi = std::stoi(argv[8]); - Wi = std::stoi(argv[9]); - window_stride_h = std::stoi(argv[10]); - window_stride_w = std::stoi(argv[11]); - in_left_pad_h = std::stoi(argv[12]); - in_left_pad_w = std::stoi(argv[13]); - in_right_pad_h = std::stoi(argv[14]); - in_right_pad_w = std::stoi(argv[15]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: time kernel (0=no, 1=yes)\n"); - printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " - "RightPx\n"); - exit(0); - } + using DevicePoolFwdInstance = + ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C< + InDataType, // InDataType + OutDataType, // OutDataType + AccDataType, // AccDataType + ReduceOpId, + OutputIndex, + 64, // BlockSize + 64, // ReduceMThreadClusterSize + 1, // ReduceKThreadClusterSize + 4, // ReduceMThreadSliceSize + 1, // ReduceKThreadSliceSize + 4>; // InSrcOutDstVectorSize const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1; const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1; @@ -302,6 +247,7 @@ int main(int argc, char* argv[]) pool_host_verify(in_n_c_hi_wi, @@ -325,5 +271,5 @@ int main(int argc, char* argv[]) }; } - return (pass ? 0 : 1); -} + return (pass); +}; diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp new file mode 100644 index 0000000000..624c8ad6cd --- /dev/null +++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp @@ -0,0 +1,116 @@ +#include +#include + +#include "config.hpp" +#include "tensor_layout.hpp" +#include "reduction_enums.hpp" + +#include "pool2d_fwd_common.hpp" + +using InDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; + +using IndexDataType = int32_t; + +using InLayout = ck::tensor_layout::convolution::NHWC; +using OutLayout = ck::tensor_layout::convolution::NHWC; + +#if 1 +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; +#else +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; +#endif + +static constexpr bool OutputIndex = false; +static constexpr bool PropagateNan = false; + +int main(int argc, char* argv[]) +{ + using namespace ck::host_reduce; + + bool do_verification; + int init_method; + bool time_kernel; + + // Pool shape + ck::index_t N = 128; + ck::index_t C = 192; + ck::index_t Y = 3; + ck::index_t X = 3; + ck::index_t Hi = 71; + ck::index_t Wi = 71; + ck::index_t window_stride_h = 2; + ck::index_t window_stride_w = 2; + ck::index_t in_left_pad_h = 1; + ck::index_t in_left_pad_w = 1; + ck::index_t in_right_pad_h = 1; + ck::index_t in_right_pad_w = 1; + + if(argc == 1) + { + do_verification = true; + init_method = 1; + time_kernel = true; + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + } + else if(argc == 16) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + + N = std::stoi(argv[4]); + C = std::stoi(argv[5]); + Y = std::stoi(argv[6]); + X = std::stoi(argv[7]); + Hi = std::stoi(argv[8]); + Wi = std::stoi(argv[9]); + window_stride_h = std::stoi(argv[10]); + window_stride_w = std::stoi(argv[11]); + in_left_pad_h = std::stoi(argv[12]); + in_left_pad_w = std::stoi(argv[13]); + in_right_pad_h = std::stoi(argv[14]); + in_right_pad_w = std::stoi(argv[15]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(0); + } + + bool pass = pool_test(do_verification, + init_method, + time_kernel, + N, + C, + Y, + X, + Hi, + Wi, + window_stride_h, + window_stride_w, + in_left_pad_h, + in_left_pad_w, + in_right_pad_h, + in_right_pad_w); + + return (pass ? 0 : 1); +} diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp new file mode 100644 index 0000000000..d2d2ae05d1 --- /dev/null +++ b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp @@ -0,0 +1,116 @@ +#include +#include + +#include "config.hpp" +#include "tensor_layout.hpp" +#include "reduction_enums.hpp" + +#include "pool2d_fwd_common.hpp" + +using InDataType = float; +using OutDataType = float; +using AccDataType = float; + +using IndexDataType = int32_t; + +using InLayout = ck::tensor_layout::convolution::NHWC; +using OutLayout = ck::tensor_layout::convolution::NHWC; + +#if 1 +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; +#else +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; +#endif + +static constexpr bool OutputIndex = false; +static constexpr bool PropagateNan = false; + +int main(int argc, char* argv[]) +{ + using namespace ck::host_reduce; + + bool do_verification; + int init_method; + bool time_kernel; + + // Pool shape + ck::index_t N = 128; + ck::index_t C = 192; + ck::index_t Y = 3; + ck::index_t X = 3; + ck::index_t Hi = 71; + ck::index_t Wi = 71; + ck::index_t window_stride_h = 2; + ck::index_t window_stride_w = 2; + ck::index_t in_left_pad_h = 1; + ck::index_t in_left_pad_w = 1; + ck::index_t in_right_pad_h = 1; + ck::index_t in_right_pad_w = 1; + + if(argc == 1) + { + do_verification = true; + init_method = 1; + time_kernel = true; + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + } + else if(argc == 16) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + + N = std::stoi(argv[4]); + C = std::stoi(argv[5]); + Y = std::stoi(argv[6]); + X = std::stoi(argv[7]); + Hi = std::stoi(argv[8]); + Wi = std::stoi(argv[9]); + window_stride_h = std::stoi(argv[10]); + window_stride_w = std::stoi(argv[11]); + in_left_pad_h = std::stoi(argv[12]); + in_left_pad_w = std::stoi(argv[13]); + in_right_pad_h = std::stoi(argv[14]); + in_right_pad_w = std::stoi(argv[15]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(0); + } + + bool pass = pool_test(do_verification, + init_method, + time_kernel, + N, + C, + Y, + X, + Hi, + Wi, + window_stride_h, + window_stride_w, + in_left_pad_h, + in_left_pad_w, + in_right_pad_h, + in_right_pad_w); + + return (pass ? 0 : 1); +}