From 8aeb2afbe24e235af4633f5a507ce89d33afc948 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Wed, 11 Sep 2024 15:21:00 +0200 Subject: [PATCH] Rewrite pool2d fwd (#1462) * added pool2d fwd * add tests * add reviewers changes * Revert "Merge remote-tracking branch 'origin/develop' into jakpiase/pool2d_fwd_new" This reverts commit 6b2ba7ff8960b0a6ddbe30d8dac53eeb55a8597e, reversing changes made to 22c82bea0caf3e0f29399100c1bb67b8003fc042. * Revert "add reviewers changes" This reverts commit 22c82bea0caf3e0f29399100c1bb67b8003fc042. * added reviewers comments * revert some old files * add reviewers requests --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> [ROCm/composable_kernel commit: e8d2887cb263644b2e778419ad1f892cea0b0466] --- .../impl/device_pool2d_fwd_nhwc_nhwc.hpp | 417 +++++++++++++++--- .../gpu/pool2d_fwd.hpp | 153 +++++++ .../gpu/pool2d_fwd/CMakeLists.txt | 8 + ...vice_avg_pool2d_fwd_nhwc_bf16_instance.cpp | 25 ++ ...evice_avg_pool2d_fwd_nhwc_f16_instance.cpp | 24 + ...evice_avg_pool2d_fwd_nhwc_f32_instance.cpp | 24 + ...vice_max_pool2d_fwd_nhwc_bf16_instance.cpp | 34 ++ ...evice_max_pool2d_fwd_nhwc_f16_instance.cpp | 32 ++ ...evice_max_pool2d_fwd_nhwc_f32_instance.cpp | 32 ++ .../pool2d_fwd/pool2d_fwd_instance_common.hpp | 41 ++ .../profiler/profile_pool2d_fwd_impl.hpp | 274 ++++++++++++ profiler/src/CMakeLists.txt | 2 + profiler/src/profile_max_pool2d_fwd.cpp | 233 ++++++++++ test/pool/CMakeLists.txt | 6 + test/pool/test_avg_pool2d_fwd.cpp | 68 +++ test/pool/test_max_pool2d_fwd.cpp | 72 +++ 16 files changed, 1374 insertions(+), 71 deletions(-) create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp create mode 100644 library/src/tensor_operation_instance/gpu/pool2d_fwd/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/pool2d_fwd/pool2d_fwd_instance_common.hpp create mode 100644 profiler/include/profiler/profile_pool2d_fwd_impl.hpp create mode 100644 profiler/src/profile_max_pool2d_fwd.cpp create mode 100644 test/pool/test_avg_pool2d_fwd.cpp create mode 100644 test/pool/test_max_pool2d_fwd.cpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp index c94c568c49..fcb9b8a903 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp @@ -1,9 +1,20 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include "ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp" +#include +#include + +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" namespace ck { namespace tensor_operation { @@ -16,95 +27,359 @@ template -struct DevicePool2dFwd_NHWC_NHWC : public DevicePool3dFwd_NDHWC_NDHWC +struct DevicePool2dFwd_NHWC_NHWC : public DevicePoolFwd<4, + 2, + InDataType, + OutDataType, + IndexDataType, + tensor_layout::convolution::NHWC, + tensor_layout::convolution::NHWC, + ReduceOpId, + OutputIndex> { - using DevicePool3D = DevicePool3dFwd_NDHWC_NDHWC{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t InOutRank = 4; + static constexpr index_t WindowRank = 2; + + using ReduceOperation = typename reduce_binary_operator::opType; + + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + + using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + + static constexpr ck::index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr ck::index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeABGridDescriptor_A_M_K_B_M(std::vector input_nchw_lengths, + std::vector output_nchw_lengths, + std::vector input_nchw_stride, + std::vector output_nchw_stride, + std::vector window_spatial_yx_lengths, + std::vector window_yx_strides, + std::vector window_yx_dilations, + std::vector input_left_hw_pads, + std::vector input_right_hw_pads) + { + const index_t N = input_nchw_lengths[0]; + const index_t C = input_nchw_lengths[1]; + const index_t Hi = input_nchw_lengths[2]; + const index_t Wi = input_nchw_lengths[3]; + + const index_t Ho = output_nchw_lengths[2]; + const index_t Wo = output_nchw_lengths[3]; + const index_t Y = window_spatial_yx_lengths[0]; + const index_t X = window_spatial_yx_lengths[1]; + + const index_t WindowStrideH = window_yx_strides[0]; + const index_t WindowStrideW = window_yx_strides[1]; + + const index_t WindowDilationH = window_yx_dilations[0]; + const index_t WindowDilationW = window_yx_dilations[1]; + + const index_t InLeftPadH = input_left_hw_pads[0]; + const index_t InLeftPadW = input_left_hw_pads[1]; + + const index_t InRightPadH = input_right_hw_pads[0]; + const index_t InRightPadW = input_right_hw_pads[1]; + + const index_t MRaw = N * Ho * Wo * C; + const index_t MPad = math::integer_least_multiple(MRaw, M_BlockTileSize) - MRaw; + + const index_t KRaw = Y * X; + const index_t KPad = math::integer_least_multiple(KRaw, K_BlockTileSize) - KRaw; + + // A[ReduceM, ReduceK] + const index_t Ni_stride = input_nchw_stride[0]; + const index_t Ci_stride = input_nchw_stride[1]; + const index_t Hi_stride = input_nchw_stride[2]; + const index_t Wi_stride = input_nchw_stride[3]; + + const auto in_grid_desc_n_hi_wi_c = make_naive_tensor_descriptor( + make_tuple(N, Hi, Wi, C), make_tuple(Ni_stride, Hi_stride, Wi_stride, Ci_stride)); + + const auto in_grid_desc_n_hip_wip_c = transform_tensor_descriptor( + in_grid_desc_n_hi_wi_c, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_grid_desc_n_y_ho_x_wo_c = transform_tensor_descriptor( + in_grid_desc_n_hip_wip_c, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(WindowDilationH, WindowStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(WindowDilationW, WindowStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_grid_desc_reducemraw_reducekraw = + transform_tensor_descriptor(in_grid_desc_n_y_ho_x_wo_c, + make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, C)), + make_merge_transform(make_tuple(Y, X))), + make_tuple(Sequence<0, 2, 4, 5>{}, Sequence<1, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_grid_desc_reducem_reducek = transform_tensor_descriptor( + in_grid_desc_reducemraw_reducekraw, + make_tuple(make_right_pad_transform(MRaw, MPad), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // B[ReduceM] + const index_t No_stride = output_nchw_stride[0]; + const index_t Co_stride = output_nchw_stride[1]; + const index_t Ho_stride = output_nchw_stride[2]; + const index_t Wo_stride = output_nchw_stride[3]; + + const auto out_grid_desc_n_ho_wo_c = make_naive_tensor_descriptor( + make_tuple(N, Hi, Wi, C), make_tuple(No_stride, Ho_stride, Wo_stride, Co_stride)); + + const auto out_grid_desc_reducemraw = + transform_tensor_descriptor(out_grid_desc_n_ho_wo_c, + make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, C))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto out_grid_desc_reducem = + transform_tensor_descriptor(out_grid_desc_reducemraw, + make_tuple(make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + + return make_tuple(in_grid_desc_reducem_reducek, out_grid_desc_reducem); + } + + using ABGridDescs = + decltype(MakeABGridDescriptor_A_M_K_B_M({}, {}, {}, {}, {}, {}, {}, {}, {})); + + using AGridDesc_M_K = remove_cvref_t; + using BGridDesc_M = remove_cvref_t; + + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in_dev, + OutDataType* p_out_dev, + IndexDataType* p_out_indices_dev, + std::vector& input_nchw_lengths, + std::vector& output_nchw_lengths, + std::vector& input_nchw_stride, + std::vector& output_nchw_stride, + std::vector&, // indices_nchw_stride + std::vector& window_spatial_yx_lengths, + std::vector& window_yx_strides, + std::vector& window_yx_dilations, + std::vector& input_left_hw_pads, + std::vector& input_right_hw_pads) + : p_in_dev_{p_in_dev}, + p_out_dev_{p_out_dev}, + p_out_indices_dev_{p_out_indices_dev}, + a_grid_desc_m_k_{}, + b_grid_desc_m_{}, + input_nchw_lengths_{input_nchw_lengths}, + output_nchw_lengths_{output_nchw_lengths}, + input_nchw_stride_{input_nchw_stride}, + output_nchw_stride_{output_nchw_stride} + { + const auto descs = MakeABGridDescriptor_A_M_K_B_M(input_nchw_lengths, + output_nchw_lengths, + input_nchw_stride, + output_nchw_stride, + window_spatial_yx_lengths, + window_yx_strides, + window_yx_dilations, + input_left_hw_pads, + input_right_hw_pads); + + a_grid_desc_m_k_ = descs[I0]; + b_grid_desc_m_ = descs[I1]; + + int32_t reduceLength = window_spatial_yx_lengths[0] * window_spatial_yx_lengths[1]; + + std::tie(in_element_op_, acc_element_op_) = + reduce_unary_operator::GetElementwiseOperator(reduceLength); + } + + const InDataType* p_in_dev_; + OutDataType* p_out_dev_; + IndexDataType* p_out_indices_dev_; + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_M b_grid_desc_m_; + + InElementwiseOperation in_element_op_; + AccElementwiseOperation acc_element_op_; + + // for checking vector load/store + std::vector input_nchw_lengths_; + std::vector output_nchw_lengths_; + std::vector input_nchw_stride_; + std::vector output_nchw_stride_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + // for NHWC, the dim C is the fastest dimension, and is not reduced. + // Hence, it is in M dimension for reduction kernel. + static constexpr index_t InSrcOutDstVectorDim = 0; // 0: M, 1: K + + using gridwise_reduce = + GridwiseReduction_mk_to_m_threadwise; - std::unique_ptr + const auto kernel = + kernel_reduce_threadwise; + + ck::index_t M = arg.a_grid_desc_m_k_.GetLength(I0); + + const index_t grid_size = (M / M_BlockTileSize); + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.a_grid_desc_m_k_, + arg.b_grid_desc_m_, + arg.in_element_op_, + arg.acc_element_op_, + float(1), + arg.p_in_dev_, + nullptr, + float(0), + arg.p_out_dev_, + arg.p_out_indices_dev_); + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + // C should be fastest dimension + if(pArg->input_nchw_stride_[1] != 1) + return false; + + for(int i = 0; i < InOutRank; ++i) + { + if(pArg->input_nchw_stride_[i] == 1 && + pArg->input_nchw_lengths_[i] % InSrcOutDstVectorSize != 0) + return false; + + if(pArg->output_nchw_stride_[i] == 1 && + pArg->output_nchw_lengths_[i] % InSrcOutDstVectorSize != 0) + return false; + } + + return true; + } + + virtual std::unique_ptr MakeArgumentPointer(const void* p_in_dev, void* p_out_dev, void* p_out_indices_dev, - std::vector input_lengths, - std::vector window_lengths, - std::vector output_lengths, - std::vector input_stride, - std::vector output_stride, - std::vector indices_stride, - std::vector window_strides, - std::vector window_dilations, - std::vector input_left_pads, - std::vector input_right_pads, + std::vector input_nchw_lengths, + std::vector window_yx_lengths, + std::vector output_nchw_lengths, + std::vector input_nchw_stride, + std::vector output_nchw_stride, + std::vector indices_nchw_stride, + std::vector window_yx_strides, + std::vector window_yx_dilations, + std::vector input_left_hw_pads, + std::vector input_right_hw_pads, std::vector pooling_dims) override { - static constexpr index_t InOutRank = 4; - static constexpr index_t WindowRank = 2; - - if(input_lengths.size() != InOutRank || window_lengths.size() != WindowRank || - input_lengths.size() != InOutRank || window_strides.size() != WindowRank || - window_dilations.size() != WindowRank || input_left_pads.size() != WindowRank || - input_right_pads.size() != WindowRank) + if(input_nchw_lengths.size() != InOutRank || window_yx_lengths.size() != WindowRank || + input_nchw_lengths.size() != InOutRank || window_yx_strides.size() != WindowRank || + window_yx_dilations.size() != WindowRank || input_left_hw_pads.size() != WindowRank || + input_right_hw_pads.size() != WindowRank) throw std::runtime_error("dimension is incorrect"); if(pooling_dims != std::vector{2, 3}) throw std::runtime_error("pooling_dims only support {2, 3} in pool2d so far"); - // NCHW to NCDHW - input_lengths.insert(input_lengths.begin() + 2, 1); - output_lengths.insert(output_lengths.begin() + 2, 1); - input_stride.insert(input_stride.begin() + 2, 0); - output_stride.insert(output_stride.begin() + 2, 0); - indices_stride.insert(indices_stride.begin() + 2, 0); + if(output_nchw_stride != indices_nchw_stride) + throw std::runtime_error( + "output_nchw_stride need to be equal to indices_nchw_stride for now"); - // YX to ZYX - window_lengths.insert(window_lengths.begin(), 1); - window_strides.insert(window_strides.begin(), 0); - window_dilations.insert(window_dilations.begin(), 0); - input_left_pads.insert(input_left_pads.begin(), 0); - input_right_pads.insert(input_right_pads.begin(), 0); + return std::make_unique(static_cast(p_in_dev), + static_cast(p_out_dev), + static_cast(p_out_indices_dev), + input_nchw_lengths, + output_nchw_lengths, + input_nchw_stride, + output_nchw_stride, + indices_nchw_stride, + window_yx_lengths, + window_yx_strides, + window_yx_dilations, + input_left_hw_pads, + input_right_hw_pads); + } - pooling_dims = {2, 3, 4}; + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } - return DevicePool3D::MakeArgumentPointer(p_in_dev, - p_out_dev, - p_out_indices_dev, - input_lengths, - window_lengths, - output_lengths, - input_stride, - output_stride, - indices_stride, - window_strides, - window_dilations, - input_left_pads, - input_right_pads, - pooling_dims); + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DevicePool2dFwd_NHWC_NHWC<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str <<"InSrcOutDstVectorSize_" << InSrcOutDstVectorSize << ">"; + // clang-format on + + return str.str(); } }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp new file mode 100644 index 0000000000..f8ef7cdc0d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto InOutRank = 4; +static constexpr auto WindowRank = 2; + +static constexpr auto MaxOp = ck::ReduceTensorOp::MAX; +static constexpr auto AvgOp = ck::ReduceTensorOp::AVG; +#ifdef CK_ENABLE_FP16 +// FP16 +void add_device_pool2d_fwd_nhwc_f16_instances( + std::vector>>&); + +void add_device_pool2d_fwd_nhwc_f16_instances( + std::vector>>&); + +// FP16 - return index +void add_device_pool2d_fwd_nhwc_index_f16_instances( + std::vector>>&); +#endif +#ifdef CK_ENABLE_BF16 +// BF16 +void add_device_pool2d_fwd_nhwc_bf16_instances( + std::vector>>&); + +void add_device_pool2d_fwd_nhwc_bf16_instances( + std::vector>>&); + +// BF16 - return index +void add_device_pool2d_fwd_nhwc_index_bf16_instances( + std::vector>>&); +#endif +#ifdef CK_ENABLE_FP32 +// FP32 +void add_device_pool2d_fwd_nhwc_f32_instances( + std::vector>>&); + +void add_device_pool2d_fwd_nhwc_f32_instances( + std::vector>>&); + +// FP32 - return index +void add_device_pool2d_fwd_nhwc_index_f32_instances( + std::vector>>&); +#endif +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DevicePoolFwd; + + static auto GetInstances() + { + std::vector> op_ptrs; + if constexpr(is_same_v && is_same_v) + { +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(OutputIndex && ReduceOpId == MaxOp) + { + add_device_pool2d_fwd_nhwc_index_f16_instances(op_ptrs); + } + else + { + add_device_pool2d_fwd_nhwc_f16_instances(op_ptrs); + } + } +#endif +#ifdef CK_ENABLE_BF16 + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(OutputIndex && ReduceOpId == MaxOp) + { + add_device_pool2d_fwd_nhwc_index_bf16_instances(op_ptrs); + } + else + { + add_device_pool2d_fwd_nhwc_bf16_instances(op_ptrs); + } + } +#endif +#ifdef CK_ENABLE_FP32 + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(OutputIndex && ReduceOpId == MaxOp) + { + add_device_pool2d_fwd_nhwc_index_f32_instances(op_ptrs); + } + else + { + add_device_pool2d_fwd_nhwc_f32_instances(op_ptrs); + } + } +#endif + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/pool2d_fwd/CMakeLists.txt new file mode 100644 index 0000000000..28ec0553d6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/CMakeLists.txt @@ -0,0 +1,8 @@ +set(DEVICE_POOL2D_FWD_INSTANCES) +list(APPEND DEVICE_POOL2D_FWD_INSTANCES device_avg_pool2d_fwd_nhwc_f16_instance.cpp + device_max_pool2d_fwd_nhwc_f16_instance.cpp + device_avg_pool2d_fwd_nhwc_f32_instance.cpp + device_max_pool2d_fwd_nhwc_f32_instance.cpp + device_avg_pool2d_fwd_nhwc_bf16_instance.cpp + device_max_pool2d_fwd_nhwc_bf16_instance.cpp) +add_instance_library(device_pool2d_fwd_instance ${DEVICE_POOL2D_FWD_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_bf16_instance.cpp new file mode 100644 index 0000000000..4ead89488d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_bf16_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool2d_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; + +void add_device_pool2d_fwd_nhwc_bf16_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_pool2d_fwd_nhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp new file mode 100644 index 0000000000..5c64f913f5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool2d_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; + +void add_device_pool2d_fwd_nhwc_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool2d_fwd_nhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp new file mode 100644 index 0000000000..4a2d3c5c9b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool2d_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; + +void add_device_pool2d_fwd_nhwc_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool2d_fwd_nhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_bf16_instance.cpp new file mode 100644 index 0000000000..4f02d84305 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_bf16_instance.cpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool2d_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; + +void add_device_pool2d_fwd_nhwc_bf16_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_pool2d_fwd_nhwc_instances{}); +} + +void add_device_pool2d_fwd_nhwc_index_bf16_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_pool2d_fwd_nhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp new file mode 100644 index 0000000000..e6a580d07f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool2d_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; + +void add_device_pool2d_fwd_nhwc_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool2d_fwd_nhwc_instances{}); +} + +void add_device_pool2d_fwd_nhwc_index_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool2d_fwd_nhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp new file mode 100644 index 0000000000..1f104eab7f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool2d_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; + +void add_device_pool2d_fwd_nhwc_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool2d_fwd_nhwc_instances{}); +} + +void add_device_pool2d_fwd_nhwc_index_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool2d_fwd_nhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool2d_fwd/pool2d_fwd_instance_common.hpp b/library/src/tensor_operation_instance/gpu/pool2d_fwd/pool2d_fwd_instance_common.hpp new file mode 100644 index 0000000000..0d1a64ab59 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/pool2d_fwd_instance_common.hpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I32 = int32_t; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F32 = float; +using NHWC = ck::tensor_layout::convolution::NHWC; + +template +using device_pool2d_fwd_nhwc_instances = + // clang-format off + std::tuple < + DevicePool2dFwd_NHWC_NHWC, + DevicePool2dFwd_NHWC_NHWC, + DevicePool2dFwd_NHWC_NHWC + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp new file mode 100644 index 0000000000..23226a4881 --- /dev/null +++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_pool2d_fwd_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector in_length, // NCHW + std::vector window_spatial_lengths, + std::vector window_strides, + std::vector window_dilations, + std::vector input_left_pads, + std::vector input_right_pads) +{ + constexpr index_t InOutRank = 4; + constexpr index_t WindowRank = 2; + + if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank || + window_strides.size() != WindowRank || window_dilations.size() != WindowRank || + input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank) + return false; + + std::vector out_length(InOutRank); + + int N = in_length[0]; + int C = in_length[1]; + + out_length[0] = N; + out_length[1] = C; + + // Calculate Ho, Wo + for(int i = 2; i < InOutRank; ++i) + { + auto pad1 = input_left_pads[i - 2]; + auto pad2 = input_right_pads[i - 2]; + auto windows_size = window_spatial_lengths[i - 2]; + auto windows_stride = window_strides[i - 2]; + auto windows_dilation = window_dilations[i - 2]; + auto eff = (windows_size - 1) * windows_dilation + 1; + out_length[i] = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1; + } + + int Hi = in_length[2]; + int Wi = in_length[3]; + int Ho = out_length[2]; + int Wo = out_length[3]; + + auto f_host_tensor_descriptor = + [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) { + using namespace ck::literals; + + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}); + }; + + Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi)); + Tensor out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo)); + Tensor out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo)); + + Tensor out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo)); + Tensor out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo)); + + switch(init_method) + { + case 0: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1{}); break; + case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); break; + default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * + out_n_c_ho_wo_device.mDesc.GetElementSpaceSize()); + DeviceMem out_indices_device_buf(sizeof(IndexDataType) * + out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); + + // add device normalization instances + using DeviceOp = ck::tensor_operation::device::DevicePoolFwd; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd; + + ReferenceInstance ref; + auto ref_argument = ref.MakeArgument(in_n_c_hi_wi, + out_n_c_ho_wo_host, + out_indices_n_c_ho_wo_host, + window_spatial_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + } + + int num_kernel = 0; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + static_cast(out_indices_device_buf.GetDeviceBuffer()), + in_length, + window_spatial_lengths, + out_length, + {C * Hi * Wi, 1, Wi * C, C}, + {C * Ho * Wo, 1, Wo * C, C}, + {C * Ho * Wo, 1, Wo * C, C}, + window_strides, + window_dilations, + input_left_pads, + input_right_pads, + {2, 3}); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + ++num_kernel; + } + else + { + if(time_kernel) + { + std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; + LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl; + } + + continue; + } + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = in_n_c_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) + + out_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType); + + if constexpr(OutputIndex) + num_bytes += out_indices_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(IndexDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data()); + + bool pass = ck::utils::check_err(out_n_c_ho_wo_device.mData, + out_n_c_ho_wo_host.mData, + "Error: Incorrect results", + 1e-3, + 1e-3); + + if constexpr(OutputIndex) + { + out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data()); + + pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device, + out_indices_n_c_ho_wo_host); + } + + if(do_log) + { + LogRangeAsType(std::cout << "in_n_c_hi_wi : ", in_n_c_hi_wi.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "out_n_c_ho_wo_host : ", out_n_c_ho_wo_host.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "out_n_c_ho_wo_device : ", out_n_c_ho_wo_device.mData, ",") + << std::endl; + + if constexpr(OutputIndex) + LogRangeAsType(std::cout << "out_indices_n_c_ho_wo_device : ", + out_indices_n_c_ho_wo_device.mData, + ",") + << std::endl; + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl; + return false; + } + else + { + if(time_kernel) + std::cout << "pass" << std::endl; + } + } + } + + if(time_kernel) + { + LogRange(std::cout << "length = ", in_length, ",") << std::endl; + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is applicable" << std::endl; + return false; + } + + return true; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 43bebba8cb..554808cac5 100755 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -9,6 +9,7 @@ set(PROFILER_SOURCES profile_layernorm_bwd_gamma_beta.cpp profile_groupnorm_bwd_gamma_beta.cpp profile_layernorm_fwd.cpp + profile_max_pool2d_fwd.cpp profile_max_pool3d_fwd.cpp profile_avg_pool3d_bwd.cpp profile_max_pool3d_bwd.cpp @@ -98,6 +99,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_ga target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool2d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance) diff --git a/profiler/src/profile_max_pool2d_fwd.cpp b/profiler/src/profile_max_pool2d_fwd.cpp new file mode 100644 index 0000000000..a7ff6b57a4 --- /dev/null +++ b/profiler/src/profile_max_pool2d_fwd.cpp @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "profiler/data_type_enum.hpp" +#include "profiler/profile_pool2d_fwd_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; + +struct maxPoolFwdArgParser +{ + std::unordered_map> long_opts = {{"length", {}}, + {"wsize", {}}, + {"wstride", {}}, + {"wdilation", {}}, + {"pad1", {}}, + {"pad2", {}}}; + + bool parse_opt(int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +void print_help_max_pool2d_fwd() +{ + std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n" + << "arg2: verification (0: no; 1: yes)\n" + << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg4: print tensor value (0: no; 1: yes)\n" + << "arg5: time kernel (0=no, 1=yes)\n" + << "arg6: return index (0=no, 1=yes)\n" + << "--length: input tensor length for NCHW(e.g, --length 2 32 30 30) \n" + << "--wsize: window size for YX (e.g, --wsize 2 2) \n" + << "--wstride: window stride for HW (e.g, --wstride 2 2) \n" + << "--wdilation: window dilation for HW (e.g, --wdilation 1 1) \n" + << "--pad1: left side of padding in HW (e.g, --pad1 1 1) \n" + << "--pad2: right side of padding in HW (e.g, --pad2 1 1) \n" + << "eg: ckProfiler max_pool2d_fwd 0 1 2 0 1 0 --length 2 32 30 30 --wsize 2 2" + "--wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1" + << std::endl; +} + +int profile_max_pool2d_fwd(int argc, char* argv[]) +{ + ck::DataTypeEnum data_type = ck::DataTypeEnum::Half; + bool do_verification = true; + int init_method = 0; + bool do_log = false; + bool time_kernel = true; + bool return_index = false; + + std::vector in_length = {2, 32, 30, 30}; + std::vector wsize = {2, 2}; + std::vector wstride = {2, 2}; + std::vector wdilation = {1, 1}; + std::vector pad1 = {1, 1}; + std::vector pad2 = {1, 1}; + + if(argc != 2 && argc != 28) + { + print_help_max_pool2d_fwd(); + return 0; + } + else if(argc == 28) + { + data_type = static_cast(std::stoi(argv[2])); + do_verification = std::stoi(argv[3]); + init_method = std::stoi(argv[4]); + do_log = std::stoi(argv[5]); + time_kernel = std::stoi(argv[6]); + return_index = std::stoi(argv[7]); + + // parse the long options + maxPoolFwdArgParser arg_parser; + arg_parser(argc, argv); + in_length = arg_parser.long_opts["length"]; + wsize = arg_parser.long_opts["wsize"]; + wstride = arg_parser.long_opts["wstride"]; + wdilation = arg_parser.long_opts["wdilation"]; + pad1 = arg_parser.long_opts["pad1"]; + pad2 = arg_parser.long_opts["pad2"]; + } + + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using F32 = float; + using I32 = int32_t; + using NHWC = ck::tensor_layout::convolution::NHWC; + + constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; + + if(data_type == ck::DataTypeEnum::Half) + { + if(return_index) + { + ck::profiler:: + profile_pool2d_fwd_impl( + do_verification, + init_method, + do_log, + time_kernel, + in_length, + wsize, + wstride, + wdilation, + pad1, + pad2); + } + else + { + ck::profiler:: + profile_pool2d_fwd_impl( + do_verification, + init_method, + do_log, + time_kernel, + in_length, + wsize, + wstride, + wdilation, + pad1, + pad2); + } + } + else if(data_type == ck::DataTypeEnum::BFloat16) + { + if(return_index) + { + ck::profiler:: + profile_pool2d_fwd_impl( + do_verification, + init_method, + do_log, + time_kernel, + in_length, + wsize, + wstride, + wdilation, + pad1, + pad2); + } + else + { + ck::profiler::profile_pool2d_fwd_impl(do_verification, + init_method, + do_log, + time_kernel, + in_length, + wsize, + wstride, + wdilation, + pad1, + pad2); + } + } + else if(data_type == ck::DataTypeEnum::Float) + { + if(return_index) + { + ck::profiler:: + profile_pool2d_fwd_impl( + do_verification, + init_method, + do_log, + time_kernel, + in_length, + wsize, + wstride, + wdilation, + pad1, + pad2); + } + else + { + ck::profiler:: + profile_pool2d_fwd_impl( + do_verification, + init_method, + do_log, + time_kernel, + in_length, + wsize, + wstride, + wdilation, + pad1, + pad2); + } + } + else + { + throw std::runtime_error("not implemented yet"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION("max_pool2d_fwd", "max_pool2d fwd", profile_max_pool2d_fwd); diff --git a/test/pool/CMakeLists.txt b/test/pool/CMakeLists.txt index fac806897a..0118a7591b 100644 --- a/test/pool/CMakeLists.txt +++ b/test/pool/CMakeLists.txt @@ -4,13 +4,19 @@ add_gtest_executable(test_avg_pool3d_bwd test_avg_pool3d_bwd.cpp) add_gtest_executable(test_max_pool3d_bwd test_max_pool3d_bwd.cpp) add_gtest_executable(test_avg_pool3d_fwd test_avg_pool3d_fwd.cpp) add_gtest_executable(test_max_pool3d_fwd test_max_pool3d_fwd.cpp) +add_gtest_executable(test_avg_pool2d_fwd test_avg_pool2d_fwd.cpp) +add_gtest_executable(test_max_pool2d_fwd test_max_pool2d_fwd.cpp) target_link_libraries(test_avg_pool3d_bwd PRIVATE utility device_avg_pool3d_bwd_instance) target_link_libraries(test_max_pool3d_bwd PRIVATE utility device_max_pool_bwd_instance) target_link_libraries(test_avg_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance) target_link_libraries(test_max_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance) +target_link_libraries(test_avg_pool2d_fwd PRIVATE utility device_pool2d_fwd_instance) +target_link_libraries(test_max_pool2d_fwd PRIVATE utility device_pool2d_fwd_instance) add_dependencies(test_pool test_avg_pool3d_bwd) add_dependencies(test_pool test_max_pool3d_bwd) add_dependencies(test_pool test_avg_pool3d_fwd) add_dependencies(test_pool test_max_pool3d_fwd) +add_dependencies(test_pool test_avg_pool2d_fwd) +add_dependencies(test_pool test_max_pool2d_fwd) diff --git a/test/pool/test_avg_pool2d_fwd.cpp b/test/pool/test_avg_pool2d_fwd.cpp new file mode 100644 index 0000000000..358128de8c --- /dev/null +++ b/test/pool/test_avg_pool2d_fwd.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "gtest/gtest.h" +#include "profiler/profile_pool2d_fwd_impl.hpp" +#include "test_pool_fwd_common.hpp" + +template +class TestAvgPool2dFwd : public ::testing::Test +{ + protected: + using InDataType = std::tuple_element_t<0, Tuple>; + using OutDataType = std::tuple_element_t<1, Tuple>; + using ComputeDataType = std::tuple_element_t<2, Tuple>; + using IndexDataType = std::tuple_element_t<3, Tuple>; + + std::vector params; + + void Run() + { + for(auto param : params) + { + // avg pool + bool success = + ck::profiler::profile_pool2d_fwd_impl(true, + 2, + false, + false, + param.length_, + param.window_spatial_lengths_, + param.window_strides_, + param.window_dilations_, + param.input_left_pads_, + param.input_right_pads_); + EXPECT_TRUE(success); + } + } +}; + +using KernelTypes = std::conditional_t< + CK_ENABLE_FP16 && CK_ENABLE_BF16, + ::testing::Types, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple>, + ::testing::Types, std::tuple>>; + +TYPED_TEST_SUITE(TestAvgPool2dFwd, KernelTypes); +TYPED_TEST(TestAvgPool2dFwd, Test_Pool) +{ + // length, window_length, window_stride, window_dilation, left_pad, right_pad + this->params = {{{1, 1, 1, 1}, {1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}, + {{2, 16, 64, 64}, {64, 64}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}, + {{2, 16, 64, 64}, {4, 4}, {4, 4}, {2, 2}, {0, 0}, {0, 0}}, + {{2, 32, 30, 30}, {2, 2}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}}; + + this->Run(); +} diff --git a/test/pool/test_max_pool2d_fwd.cpp b/test/pool/test_max_pool2d_fwd.cpp new file mode 100644 index 0000000000..c9529f4f9b --- /dev/null +++ b/test/pool/test_max_pool2d_fwd.cpp @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "gtest/gtest.h" +#include "profiler/profile_pool2d_fwd_impl.hpp" +#include "test_pool_fwd_common.hpp" + +template +class TestMaxPool2dFwd : public ::testing::Test +{ + protected: + using InDataType = std::tuple_element_t<0, Tuple>; + using OutDataType = std::tuple_element_t<1, Tuple>; + using ComputeDataType = std::tuple_element_t<2, Tuple>; + using IndexDataType = std::tuple_element_t<3, Tuple>; + static constexpr bool ReturnIndex = std::tuple_element_t<4, Tuple>::value; + + std::vector params; + + void Run() + { + for(auto param : params) + { + // max pool + bool success = + ck::profiler::profile_pool2d_fwd_impl(true, + 2, + false, + false, + param.length_, + param.window_spatial_lengths_, + param.window_strides_, + param.window_dilations_, + param.input_left_pads_, + param.input_right_pads_); + EXPECT_TRUE(success); + } + } +}; + +using true_t = std::integral_constant; +using false_t = std::integral_constant; + +using KernelTypes = std::conditional_t, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple>, + ::testing::Types, + std::tuple>>; + +TYPED_TEST_SUITE(TestMaxPool2dFwd, KernelTypes); +TYPED_TEST(TestMaxPool2dFwd, Test_Pool) +{ + // length, window_length, window_stride, window_dilation, left_pad, right_pad + this->params = {{{1, 1, 1, 1}, {1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}, + {{2, 16, 64, 64}, {64, 64}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}, + {{2, 16, 64, 64}, {4, 4}, {4, 4}, {2, 2}, {0, 0}, {0, 0}}, + {{2, 32, 30, 30}, {2, 2}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}}; + + this->Run(); +}