add reviewers changes

2026-06-05 20:55:59 +00:00 · 2024-08-20 15:50:13 +00:00
parent 432cbda3eb
commit 22c82bea0c
43 changed files with 616 additions and 355 deletions
--- a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc_new.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc_new.hpp
@@ -0,0 +1,392 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType, // enable if OutputIndex == true
+          typename ComputeDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          bool OutputIndex,
+          ck::index_t BlockSize,
+          ck::index_t MThreadClusterSize,
+          ck::index_t KThreadClusterSize,
+          ck::index_t MThreadSliceSize,
+          ck::index_t KThreadSliceSize,
+          ck::index_t InSrcOutDstVectorSize>
+struct DevicePool2dFwd_NHWC_NHWC : public DevicePoolFwd<4,
+                                                        2,
+                                                        InDataType,
+                                                        OutDataType,
+                                                        IndexDataType,
+                                                        tensor_layout::convolution::NHWC,
+                                                        tensor_layout::convolution::NHWC,
+                                                        ReduceOpId,
+                                                        OutputIndex>
+{
+    // Compile-time index constants. Within this struct only I0 and I1 are used: to pick the
+    // A/B descriptors out of the descriptor tuple and to query the M length of the A descriptor.
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+
+    // Expected number of entries in the input/output length and stride vectors (N, C, H, W)
+    // and in the pooling-window vectors (Y, X); MakeArgumentPointer validates against these.
+    static constexpr index_t InOutRank  = 4;
+    static constexpr index_t WindowRank = 2;
+
+    // Binary reduction functor selected by ReduceOpId.
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
+
+    // Element-wise operator applied on the input side of the reduction.
+    // NOTE(review): the meaning of the two `true` flags is defined by reduce_unary_operator
+    // and is not visible here -- confirm against that trait.
+    using InElementwiseOperation =
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
+
+    // Element-wise operator applied on the accumulator side of the reduction.
+    using AccElementwiseOperation =
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
+
+    // Tile extents along M and K. The grid descriptors are right-padded to multiples of these
+    // values, and the launch grid size is derived from M_BlockTileSize.
+    static constexpr ck::index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
+    static constexpr ck::index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
+
+    /// Builds the two grid descriptors consumed by the reduction kernel.
+    ///
+    /// The tensor length/stride vectors are indexed in (N, C, H, W) order and must hold four
+    /// entries each; the window/padding vectors are indexed in (Y, X) / (H, W) order and must
+    /// hold two entries each. No bounds checking is done here -- MakeArgumentPointer validates
+    /// the sizes before this function is evaluated.
+    ///
+    /// @return tuple of
+    ///   - A[ReduceM, ReduceK]: the input viewed as a 2D tensor with M = N * Ho * Wo * C and
+    ///     K = Y * X, right-padded to multiples of M_BlockTileSize / K_BlockTileSize;
+    ///   - B[ReduceM]: the output viewed as a 1D tensor of length M, right-padded likewise.
+    static auto MakeABGridDescriptor_A_M_K_B_M(std::vector<ck::index_t> input_nchw_lengths,
+                                               std::vector<ck::index_t> output_nchw_lengths,
+                                               std::vector<ck::index_t> input_nchw_stride,
+                                               std::vector<ck::index_t> output_nchw_stride,
+                                               std::vector<ck::index_t> window_spatial_yx_lengths,
+                                               std::vector<ck::index_t> window_yx_strides,
+                                               std::vector<ck::index_t> window_yx_dilations,
+                                               std::vector<ck::index_t> input_left_hw_pads,
+                                               std::vector<ck::index_t> input_right_hw_pads)
+    {
+        const index_t N  = input_nchw_lengths[0];
+        const index_t C  = input_nchw_lengths[1];
+        const index_t Hi = input_nchw_lengths[2];
+        const index_t Wi = input_nchw_lengths[3];
+
+        const index_t Ho = output_nchw_lengths[2];
+        const index_t Wo = output_nchw_lengths[3];
+
+        // The window vector holds exactly WindowRank (= 2) entries, {Y, X}; indices 1 and 2
+        // would read X as Y and then run past the end of the vector.
+        const index_t Y = window_spatial_yx_lengths[0];
+        const index_t X = window_spatial_yx_lengths[1];
+
+        const index_t WindowStrideH = window_yx_strides[0];
+        const index_t WindowStrideW = window_yx_strides[1];
+
+        const index_t WindowDilationH = window_yx_dilations[0];
+        const index_t WindowDilationW = window_yx_dilations[1];
+
+        const index_t InLeftPadH = input_left_hw_pads[0];
+        const index_t InLeftPadW = input_left_hw_pads[1];
+
+        const index_t InRightPadH = input_right_hw_pads[0];
+        const index_t InRightPadW = input_right_hw_pads[1];
+
+        // Amount of right padding needed to round M and K up to whole block tiles.
+        const index_t MRaw = N * Ho * Wo * C;
+        const index_t MPad = math::integer_least_multiple(MRaw, M_BlockTileSize) - MRaw;
+
+        const index_t KRaw = Y * X;
+        const index_t KPad = math::integer_least_multiple(KRaw, K_BlockTileSize) - KRaw;
+
+        // A[ReduceM, ReduceK]
+        const index_t Ni_stride = input_nchw_stride[0];
+        const index_t Ci_stride = input_nchw_stride[1];
+        const index_t Hi_stride = input_nchw_stride[2];
+        const index_t Wi_stride = input_nchw_stride[3];
+
+        // Lengths and strides arrive in N, C, H, W order; the descriptor is declared in
+        // N, H, W, C order, so the C entries move to the last position.
+        const auto in_grid_desc_n_hi_wi_c = make_naive_tensor_descriptor(
+            make_tuple(N, Hi, Wi, C), make_tuple(Ni_stride, Hi_stride, Wi_stride, Ci_stride));
+
+        // Apply the left/right spatial padding to H and W.
+        const auto in_grid_desc_n_hip_wip_c = transform_tensor_descriptor(
+            in_grid_desc_n_hi_wi_c,
+            make_tuple(make_pass_through_transform(N),
+                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+        // Split each padded spatial dimension into (window position, output position); the
+        // window coordinate is paired with the dilation and the output coordinate with the
+        // window stride.
+        const auto in_grid_desc_n_y_ho_x_wo_c = transform_tensor_descriptor(
+            in_grid_desc_n_hip_wip_c,
+            make_tuple(
+                make_pass_through_transform(N),
+                make_embed_transform(make_tuple(Y, Ho), make_tuple(WindowDilationH, WindowStrideH)),
+                make_embed_transform(make_tuple(X, Wo), make_tuple(WindowDilationW, WindowStrideW)),
+                make_pass_through_transform(C)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+
+        // Merge the kept dimensions (N, Ho, Wo, C) into M and the window dimensions (Y, X)
+        // into K.
+        const auto in_grid_desc_reducemraw_reducekraw =
+            transform_tensor_descriptor(in_grid_desc_n_y_ho_x_wo_c,
+                                        make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, C)),
+                                                   make_merge_transform(make_tuple(Y, X))),
+                                        make_tuple(Sequence<0, 2, 4, 5>{}, Sequence<1, 3>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        const auto in_grid_desc_reducem_reducek = transform_tensor_descriptor(
+            in_grid_desc_reducemraw_reducekraw,
+            make_tuple(make_right_pad_transform(MRaw, MPad), make_right_pad_transform(KRaw, KPad)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        // B[ReduceM]
+        const index_t No_stride = output_nchw_stride[0];
+        const index_t Co_stride = output_nchw_stride[1];
+        const index_t Ho_stride = output_nchw_stride[2];
+        const index_t Wo_stride = output_nchw_stride[3];
+
+        // The output tensor has the output spatial extents (Ho, Wo), matching the merge below.
+        const auto out_grid_desc_n_ho_wo_c = make_naive_tensor_descriptor(
+            make_tuple(N, Ho, Wo, C), make_tuple(No_stride, Ho_stride, Wo_stride, Co_stride));
+
+        const auto out_grid_desc_reducemraw =
+            transform_tensor_descriptor(out_grid_desc_n_ho_wo_c,
+                                        make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, C))),
+                                        make_tuple(Sequence<0, 1, 2, 3>{}),
+                                        make_tuple(Sequence<0>{}));
+
+        const auto out_grid_desc_reducem =
+            transform_tensor_descriptor(out_grid_desc_reducemraw,
+                                        make_tuple(make_right_pad_transform(MRaw, MPad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+
+        return make_tuple(in_grid_desc_reducem_reducek, out_grid_desc_reducem);
+    }
+
+    // Descriptor types are deduced from the factory's return type. The call sits inside
+    // decltype and is therefore never evaluated, so the empty `{}` arguments are never indexed.
+    using ABGridDescs =
+        decltype(MakeABGridDescriptor_A_M_K_B_M({}, {}, {}, {}, {}, {}, {}, {}, {}));
+
+    // Element 0 of the returned tuple is the A (input, M x K) descriptor; element 1 is the
+    // B (output, M) descriptor.
+    using AGridDesc_M_K = remove_cvref_t<decltype(ABGridDescs{}[I0])>;
+    using BGridDesc_M   = remove_cvref_t<decltype(ABGridDescs{}[I1])>;
+
+    /// Bundles everything one kernel launch needs: device pointers, the A/B grid descriptors,
+    /// the element-wise operators, and copies of the tensor lengths/strides that
+    /// IsSupportedArgument inspects.
+    struct Argument : public BaseArgument
+    {
+        /// Builds the grid descriptors and element-wise operators for one pooling problem.
+        ///
+        /// Tensor length/stride vectors are indexed in (N, C, H, W) order; window and padding
+        /// vectors hold two entries each. The indices stride vector is accepted for signature
+        /// compatibility but is not used.
+        Argument(const InDataType* p_in_dev,
+                 OutDataType* p_out_dev,
+                 IndexDataType* p_out_indices_dev,
+                 std::vector<ck::index_t>& input_nchw_lengths,
+                 std::vector<ck::index_t>& output_nchw_lengths,
+                 std::vector<ck::index_t>& input_nchw_stride,
+                 std::vector<ck::index_t>& output_nchw_stride,
+                 std::vector<ck::index_t>&, // indices_nchw_stride
+                 std::vector<ck::index_t>& window_spatial_yx_lengths,
+                 std::vector<ck::index_t>& window_yx_strides,
+                 std::vector<ck::index_t>& window_yx_dilations,
+                 std::vector<ck::index_t>& input_left_hw_pads,
+                 std::vector<ck::index_t>& input_right_hw_pads)
+            : p_in_dev_{p_in_dev},
+              p_out_dev_{p_out_dev},
+              p_out_indices_dev_{p_out_indices_dev},
+              a_grid_desc_m_k_{},
+              b_grid_desc_m_{},
+              // Initializer names must match the members declared below (the *_nchw_* spelling).
+              input_nchw_lengths_{input_nchw_lengths},
+              output_nchw_lengths_{output_nchw_lengths},
+              input_nchw_stride_{input_nchw_stride},
+              output_nchw_stride_{output_nchw_stride}
+        {
+            const auto descs = MakeABGridDescriptor_A_M_K_B_M(input_nchw_lengths,
+                                                              output_nchw_lengths,
+                                                              input_nchw_stride,
+                                                              output_nchw_stride,
+                                                              window_spatial_yx_lengths,
+                                                              window_yx_strides,
+                                                              window_yx_dilations,
+                                                              input_left_hw_pads,
+                                                              input_right_hw_pads);
+
+            a_grid_desc_m_k_ = descs[I0];
+            b_grid_desc_m_   = descs[I1];
+
+            // Number of elements reduced per output value: the window area Y * X.
+            int32_t reduceLength = window_spatial_yx_lengths[0] * window_spatial_yx_lengths[1];
+
+            std::tie(in_element_op_, acc_element_op_) =
+                reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
+        }
+
+        // Device buffers: input values, output values, output indices.
+        const InDataType* p_in_dev_;
+        OutDataType* p_out_dev_;
+        IndexDataType* p_out_indices_dev_;
+
+        // A[ReduceM, ReduceK] and B[ReduceM] grid descriptors.
+        AGridDesc_M_K a_grid_desc_m_k_;
+        BGridDesc_M b_grid_desc_m_;
+
+        InElementwiseOperation in_element_op_;
+        AccElementwiseOperation acc_element_op_;
+
+        // for checking vector load/store
+        std::vector<ck::index_t> input_nchw_lengths_;
+        std::vector<ck::index_t> output_nchw_lengths_;
+        std::vector<ck::index_t> input_nchw_stride_;
+        std::vector<ck::index_t> output_nchw_stride_;
+    };
+
+    /// Launches the reduction kernel for a prepared Argument.
+    struct Invoker : public BaseInvoker
+    {
+        /// Instantiates the reduction kernel for this configuration, launches it through
+        /// launch_and_time_kernel, and returns that call's result.
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            // In NHWC the C dimension is the fastest-varying one and is not reduced, so it
+            // lives in the M dimension of the reduction kernel (0: M, 1: K).
+            static constexpr index_t InSrcOutDstVectorDim = 0;
+
+            using GridwiseReduce =
+                GridwiseReduction_mk_to_m_threadwise<InDataType,
+                                                     OutDataType,
+                                                     ComputeDataType,
+                                                     IndexDataType,
+                                                     AGridDesc_M_K,
+                                                     BGridDesc_M,
+                                                     ReduceOperation,
+                                                     InElementwiseOperation,
+                                                     AccElementwiseOperation,
+                                                     InMemoryDataOperationEnum::Set,
+                                                     false, // propagate_nan
+                                                     BlockSize,
+                                                     MThreadSliceSize,
+                                                     KThreadSliceSize,
+                                                     InSrcOutDstVectorDim,
+                                                     InSrcOutDstVectorSize,
+                                                     InSrcOutDstVectorSize>;
+
+            const auto reduce_kernel =
+                kernel_reduce_threadwise<GridwiseReduce,
+                                         OutputIndex,
+                                         true,  // pooling need to return global index
+                                         false, // don't have index input
+                                         InDataType,
+                                         OutDataType,
+                                         ComputeDataType,
+                                         IndexDataType,
+                                         AGridDesc_M_K,
+                                         BGridDesc_M,
+                                         InElementwiseOperation,
+                                         AccElementwiseOperation>;
+
+            // Length of the M dimension of the A descriptor (right-padded to a multiple of
+            // M_BlockTileSize when the descriptor was built).
+            const ck::index_t padded_m = arg.a_grid_desc_m_k_.GetLength(I0);
+            const index_t num_blocks   = padded_m / M_BlockTileSize;
+
+            return launch_and_time_kernel(stream_config,
+                                          reduce_kernel,
+                                          dim3(num_blocks),
+                                          dim3(BlockSize),
+                                          0,
+                                          arg.a_grid_desc_m_k_,
+                                          arg.b_grid_desc_m_,
+                                          arg.in_element_op_,
+                                          arg.acc_element_op_,
+                                          float(1),
+                                          arg.p_in_dev_,
+                                          nullptr,
+                                          float(0),
+                                          arg.p_out_dev_,
+                                          arg.p_out_indices_dev_);
+        }
+
+        /// Type-erased entry point: casts the argument to Argument and forwards to the typed
+        /// overload. The cast result is dereferenced without a null check.
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            const Argument* typed_arg = dynamic_cast<const Argument*>(p_arg);
+            return Run(*typed_arg, stream_config);
+        }
+    };
+
+    /// Reports whether this instance can run the given argument.
+    ///
+    /// Returns false when
+    ///   - p_arg is null or is not an Argument of this instance,
+    ///   - the input C stride is not 1, or
+    ///   - any input/output dimension with stride 1 has a length that is not a multiple of
+    ///     InSrcOutDstVectorSize.
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+
+        // dynamic_cast yields nullptr for a null or foreign argument; such an argument is
+        // simply unsupported rather than something to dereference.
+        if(pArg == nullptr)
+            return false;
+
+        // C should be fastest dimension
+        if(pArg->input_nchw_stride_[1] != 1)
+            return false;
+
+        for(int i = 0; i < InOutRank; ++i)
+        {
+            if(pArg->input_nchw_stride_[i] == 1 &&
+               pArg->input_nchw_lengths_[i] % InSrcOutDstVectorSize != 0)
+                return false;
+
+            if(pArg->output_nchw_stride_[i] == 1 &&
+               pArg->output_nchw_lengths_[i] % InSrcOutDstVectorSize != 0)
+                return false;
+        }
+
+        return true;
+    }
+
+    /// Validates the problem description and wraps it in an Argument.
+    ///
+    /// Tensor length/stride vectors must hold InOutRank entries and window/padding vectors
+    /// WindowRank entries; pooling_dims must be {2, 3}; the indices strides must equal the
+    /// output strides.
+    ///
+    /// @throws std::runtime_error if any of the above does not hold.
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in_dev,
+                        void* p_out_dev,
+                        void* p_out_indices_dev,
+                        std::vector<ck::index_t> input_nchw_lengths,
+                        std::vector<ck::index_t> window_yx_lengths,
+                        std::vector<ck::index_t> output_nchw_lengths,
+                        std::vector<ck::index_t> input_nchw_stride,
+                        std::vector<ck::index_t> output_nchw_stride,
+                        std::vector<ck::index_t> indices_nchw_stride,
+                        std::vector<ck::index_t> window_yx_strides,
+                        std::vector<ck::index_t> window_yx_dilations,
+                        std::vector<ck::index_t> input_left_hw_pads,
+                        std::vector<ck::index_t> input_right_hw_pads,
+                        std::vector<ck::index_t> pooling_dims) override
+    {
+        // Every vector that is indexed while building the descriptors is size-checked here,
+        // including the output lengths and both stride vectors.
+        if(input_nchw_lengths.size() != InOutRank || output_nchw_lengths.size() != InOutRank ||
+           input_nchw_stride.size() != InOutRank || output_nchw_stride.size() != InOutRank ||
+           window_yx_lengths.size() != WindowRank || window_yx_strides.size() != WindowRank ||
+           window_yx_dilations.size() != WindowRank || input_left_hw_pads.size() != WindowRank ||
+           input_right_hw_pads.size() != WindowRank)
+            throw std::runtime_error("dimension is incorrect");
+
+        if(pooling_dims != std::vector<ck::index_t>{2, 3})
+            throw std::runtime_error("pooling_dims only support {2, 3} in pool2d so far");
+
+        if(output_nchw_stride != indices_nchw_stride)
+            throw std::runtime_error(
+                "output_nchw_stride need to be equal to indices_nchw_stride for now");
+
+        return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_dev),
+                                          static_cast<OutDataType*>(p_out_dev),
+                                          static_cast<IndexDataType*>(p_out_indices_dev),
+                                          input_nchw_lengths,
+                                          output_nchw_lengths,
+                                          input_nchw_stride,
+                                          output_nchw_stride,
+                                          indices_nchw_stride,
+                                          window_yx_lengths,
+                                          window_yx_strides,
+                                          window_yx_dilations,
+                                          input_left_hw_pads,
+                                          input_right_hw_pads);
+    }
+
+    /// Creates a fresh Invoker owned by the caller.
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        // Invoker has no data members, so default-constructing it in place is equivalent to
+        // copying from a temporary.
+        return std::make_unique<Invoker>();
+    }
+
+    /// Returns a string naming this instance together with its block size, M/K thread
+    /// cluster and slice sizes, and vector size.
+    std::string GetTypeString() const override
+    {
+        std::ostringstream type_name;
+
+        // clang-format off
+        type_name << "DevicePool2dFwd_NHWC_NHWC<" << BlockSize << ","
+                  << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","
+                  << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","
+                  << "InSrcOutDstVectorSize_" << InSrcOutDstVectorSize << ">";
+        // clang-format on
+
+        return type_name.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_pool_max_bwd_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool_max_bwd_impl.hpp
--- a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(DEVICE_AVGPOOL_BWD_INSTANCES)
-list(APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_f16_instance.cpp
-                                         device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp
-                                         device_avg_pool3d_bwd_ndhwc_f32_instance.cpp)
-add_instance_library(device_avg_pool3d_bwd_instance ${DEVICE_AVGPOOL_BWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(DEVICE_MAXPOOL_BWD_INSTANCES)
-list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_max_pool_bwd_f16_instance.cpp
-                                         device_max_pool_bwd_bf16_instance.cpp
-                                         device_max_pool_bwd_f32_instance.cpp)
-add_instance_library(device_max_pool_bwd_instance ${DEVICE_MAXPOOL_BWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/CMakeLists.txt
@@ -1,8 +1,8 @@
 set(DEVICE_POOL2D_FWD_INSTANCES)
-list(APPEND DEVICE_POOL2D_FWD_INSTANCES device_avg_pool2d_fwd_nhwc_f16_instance.cpp
-                                        device_max_pool2d_fwd_nhwc_f16_instance.cpp
-                                        device_avg_pool2d_fwd_nhwc_f32_instance.cpp
-                                        device_max_pool2d_fwd_nhwc_f32_instance.cpp
-                                        device_avg_pool2d_fwd_nhwc_bf16_instance.cpp
-                                        device_max_pool2d_fwd_nhwc_bf16_instance.cpp)
+list(APPEND DEVICE_POOL2D_FWD_INSTANCES device_pool2d_avg_fwd_nhwc_f16_instance.cpp
+                                        device_pool2d_max_fwd_nhwc_f16_instance.cpp
+                                        device_pool2d_avg_fwd_nhwc_f32_instance.cpp
+                                        device_pool2d_max_fwd_nhwc_f32_instance.cpp
+                                        device_pool2d_avg_fwd_nhwc_bf16_instance.cpp
+                                        device_pool2d_max_fwd_nhwc_bf16_instance.cpp)
 add_instance_library(device_pool2d_fwd_instance ${DEVICE_POOL2D_FWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_avg_fwd_nhwc_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_avg_fwd_nhwc_bf16_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_avg_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_avg_fwd_nhwc_f16_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_avg_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_avg_fwd_nhwc_f32_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_max_fwd_nhwc_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_max_fwd_nhwc_bf16_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_max_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_max_fwd_nhwc_f16_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_max_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool2d_fwd/device_pool2d_max_fwd_nhwc_f32_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(DEVICE_AVGPOOL_BWD_INSTANCES)
+list(APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_pool3d_avg_bwd_ndhwc_f16_instance.cpp
+                                         device_pool3d_avg_bwd_ndhwc_bf16_instance.cpp
+                                         device_pool3d_avg_bwd_ndhwc_f32_instance.cpp)
+add_instance_library(device_pool3d_avg_bwd_instance ${DEVICE_AVGPOOL_BWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/device_pool3d_avg_bwd_ndhwc_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/device_pool3d_avg_bwd_ndhwc_bf16_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "avg_pool3d_bwd_ndhwc_instance_common.hpp"
+#include "pool3d_avg_bwd_ndhwc_instance_common.hpp"

 namespace ck {
 namespace tensor_operation {
--- a/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/device_pool3d_avg_bwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/device_pool3d_avg_bwd_ndhwc_f16_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "avg_pool3d_bwd_ndhwc_instance_common.hpp"
+#include "pool3d_avg_bwd_ndhwc_instance_common.hpp"

 namespace ck {
 namespace tensor_operation {
--- a/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/device_pool3d_avg_bwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/device_pool3d_avg_bwd_ndhwc_f32_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "avg_pool3d_bwd_ndhwc_instance_common.hpp"
+#include "pool3d_avg_bwd_ndhwc_instance_common.hpp"

 namespace ck {
 namespace tensor_operation {
--- a/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/pool3d_avg_bwd_ndhwc_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_avg_bwd/pool3d_avg_bwd_ndhwc_instance_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
@@ -1,8 +1,8 @@
 set(DEVICE_POOL3D_FWD_INSTANCES)
-list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
-                                        device_max_pool3d_fwd_ndhwc_f16_instance.cpp
-                                        device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
-                                        device_max_pool3d_fwd_ndhwc_f32_instance.cpp
-                                        device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp
-                                        device_max_pool3d_fwd_ndhwc_bf16_instance.cpp)
+list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_pool3d_avg_fwd_ndhwc_f16_instance.cpp
+                                        device_pool3d_max_fwd_ndhwc_f16_instance.cpp
+                                        device_pool3d_avg_fwd_ndhwc_f32_instance.cpp
+                                        device_pool3d_max_fwd_ndhwc_f32_instance.cpp
+                                        device_pool3d_avg_fwd_ndhwc_bf16_instance.cpp
+                                        device_pool3d_max_fwd_ndhwc_bf16_instance.cpp)
 add_instance_library(device_pool3d_fwd_instance ${DEVICE_POOL3D_FWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_avg_fwd_ndhwc_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_avg_fwd_ndhwc_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"

--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_avg_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_avg_fwd_ndhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"

--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_avg_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_avg_fwd_ndhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"

--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_max_fwd_ndhwc_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_max_fwd_ndhwc_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"

--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_max_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_max_fwd_ndhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"

--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_max_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_pool3d_max_fwd_ndhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "pool_fwd_instance_common.hpp"

--- a/library/src/tensor_operation_instance/gpu/pool_max_bwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool_max_bwd/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(DEVICE_MAXPOOL_BWD_INSTANCES)
+list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_pool_max_bwd_f16_instance.cpp
+                                         device_pool_max_bwd_bf16_instance.cpp
+                                         device_pool_max_bwd_f32_instance.cpp)
+add_instance_library(device_pool_max_bwd_instance ${DEVICE_MAXPOOL_BWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/pool_max_bwd/device_pool_max_bwd_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_max_bwd/device_pool_max_bwd_bf16_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "max_pool_bwd_instance_common.hpp"
+#include "pool_max_bwd_instance_common.hpp"

 namespace ck {
 namespace tensor_operation {
--- a/library/src/tensor_operation_instance/gpu/pool_max_bwd/device_pool_max_bwd_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_max_bwd/device_pool_max_bwd_f16_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "max_pool_bwd_instance_common.hpp"
+#include "pool_max_bwd_instance_common.hpp"

 namespace ck {
 namespace tensor_operation {
--- a/library/src/tensor_operation_instance/gpu/pool_max_bwd/device_pool_max_bwd_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_max_bwd/device_pool_max_bwd_f32_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "max_pool_bwd_instance_common.hpp"
+#include "pool_max_bwd_instance_common.hpp"

 namespace ck {
 namespace tensor_operation {
--- a/library/src/tensor_operation_instance/gpu/pool_max_bwd/pool_max_bwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/pool_max_bwd/pool_max_bwd_instance_common.hpp
@@ -1,10 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool_max_bwd_impl.hpp"
 #include "ck/utility/data_type.hpp"

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
--- a/profiler/include/profiler/profile_pool3d_avg_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_avg_bwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -39,7 +39,7 @@ template <typename DOutDataType,
          typename ComputeDataType,
          typename DOutLayout,
          typename DInLayout>
-bool profile_avg_pool3d_bwd_impl(int do_verification,
+bool profile_pool3d_avg_bwd_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
                                 bool time_kernel,
--- a/profiler/include/profiler/profile_pool3d_max_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_max_bwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -25,7 +25,7 @@ template <typename InDataType,
          typename DOutDataType,
          typename DInDataType,
          bool PropagateNan>
-bool profile_max_pool3d_bwd_impl(int do_verification,
+bool profile_pool3d_max_bwd_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
                                 bool time_kernel,
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -9,10 +9,10 @@ set(PROFILER_SOURCES
    profile_layernorm_bwd_gamma_beta.cpp
    profile_groupnorm_bwd_gamma_beta.cpp
    profile_layernorm_fwd.cpp
-    profile_max_pool2d_fwd.cpp
-    profile_max_pool3d_fwd.cpp
-    profile_avg_pool3d_bwd.cpp
-    profile_max_pool3d_bwd.cpp
+    profile_pool2d_max_fwd.cpp
+    profile_pool3d_max_fwd.cpp
+    profile_pool3d_avg_bwd.cpp
+    profile_pool3d_max_bwd.cpp
    profile_softmax.cpp
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
@@ -101,8 +101,8 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool2d_fwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_avg_bwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_max_bwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
--- a/profiler/src/profile_pool2d_max_fwd.cpp
+++ b/profiler/src/profile_pool2d_max_fwd.cpp
@@ -49,7 +49,7 @@ struct maxPoolFwdArgParser
    }
 };

-void print_help_max_pool2d_fwd()
+void print_help_pool2d_max_fwd()
 {
    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
              << "arg2: verification (0: no; 1: yes)\n"
@@ -68,7 +68,7 @@ void print_help_max_pool2d_fwd()
              << std::endl;
 }

-int profile_max_pool2d_fwd(int argc, char* argv[])
+int profile_pool2d_max_fwd(int argc, char* argv[])
 {
    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
    bool do_verification       = true;
@@ -109,30 +109,18 @@ int profile_max_pool2d_fwd(int argc, char* argv[])
        pad2      = arg_parser.long_opts["pad2"];
    }

-#ifdef CK_ENABLE_FP16
-    using F16 = ck::half_t;
-#endif
-#ifdef CK_ENABLE_BF16
+    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-#endif
-#ifdef CK_ENABLE_FP32
-    using F32 = float;
-#endif
+    using F32  = float;
    using I32  = int32_t;
    using NHWC = ck::tensor_layout::convolution::NHWC;

-#if 1
    constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
-#else
-    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
-#endif

-    if(false)
-        ;
-#ifdef CK_ENABLE_FP16
-    else if(data_type == ck::DataTypeEnum::Half)
+    if(data_type == ck::DataTypeEnum::Half)
    {
        if(return_index)
+        {
            ck::profiler::
                profile_pool2d_fwd_impl<F16, F16, F16, I32, NHWC, NHWC, ReduceOpId, false, true>(
                    do_verification,
@@ -145,7 +133,9 @@ int profile_max_pool2d_fwd(int argc, char* argv[])
                    wdilation,
                    pad1,
                    pad2);
+        }
        else
+        {
            ck::profiler::
                profile_pool2d_fwd_impl<F16, F16, F16, I32, NHWC, NHWC, ReduceOpId, false, false>(
                    do_verification,
@@ -158,12 +148,12 @@ int profile_max_pool2d_fwd(int argc, char* argv[])
                    wdilation,
                    pad1,
                    pad2);
+        }
    }
-#endif
-#ifdef CK_ENABLE_BF16
    else if(data_type == ck::DataTypeEnum::BFloat16)
    {
        if(return_index)
+        {
            ck::profiler::
                profile_pool2d_fwd_impl<BF16, BF16, BF16, I32, NHWC, NHWC, ReduceOpId, false, true>(
                    do_verification,
@@ -176,7 +166,9 @@ int profile_max_pool2d_fwd(int argc, char* argv[])
                    wdilation,
                    pad1,
                    pad2);
+        }
        else
+        {
            ck::profiler::profile_pool2d_fwd_impl<BF16,
                                                  BF16,
                                                  BF16,
@@ -195,12 +187,12 @@ int profile_max_pool2d_fwd(int argc, char* argv[])
                                                         wdilation,
                                                         pad1,
                                                         pad2);
+        }
    }
-#endif
-#ifdef CK_ENABLE_FP32
    else if(data_type == ck::DataTypeEnum::Float)
    {
        if(return_index)
+        {
            ck::profiler::
                profile_pool2d_fwd_impl<F32, F32, F32, I32, NHWC, NHWC, ReduceOpId, false, true>(
                    do_verification,
@@ -213,7 +205,9 @@ int profile_max_pool2d_fwd(int argc, char* argv[])
                    wdilation,
                    pad1,
                    pad2);
+        }
        else
+        {
            ck::profiler::
                profile_pool2d_fwd_impl<F32, F32, F32, I32, NHWC, NHWC, ReduceOpId, false, false>(
                    do_verification,
@@ -226,8 +220,8 @@ int profile_max_pool2d_fwd(int argc, char* argv[])
                    wdilation,
                    pad1,
                    pad2);
+        }
    }
-#endif
    else
    {
        throw std::runtime_error("not implemented yet");
@@ -236,4 +230,4 @@ int profile_max_pool2d_fwd(int argc, char* argv[])
    return 0;
 }

-REGISTER_PROFILER_OPERATION("max_pool2d_fwd", "max_pool2d fwd", profile_max_pool2d_fwd);
+REGISTER_PROFILER_OPERATION("pool2d_max_fwd", "pool2d_max fwd", profile_pool2d_max_fwd);
--- a/profiler/src/profile_pool3d_avg_bwd.cpp
+++ b/profiler/src/profile_pool3d_avg_bwd.cpp
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <vector>
 #include <unordered_map>

 #include "profiler/data_type_enum.hpp"
-#include "profiler/profile_avg_pool3d_bwd_impl.hpp"
+#include "profiler/profile_pool3d_avg_bwd_impl.hpp"
 #include "profiler_operation_registry.hpp"

 using ck::index_t;
@@ -49,7 +49,7 @@ struct maxPoolbwdArgParser
    }
 };

-void print_help_avg_pool3d_bwd()
+void print_help_pool3d_avg_bwd()
 {
    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
              << "arg2: verification (0: no; 1: yes)\n"
@@ -62,12 +62,12 @@ void print_help_avg_pool3d_bwd()
              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
-              << "eg: ckProfiler avg_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
+              << "eg: ckProfiler pool3d_avg_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
              << std::endl;
 }

-int profile_avg_pool3d_bwd(int argc, char* argv[])
+int profile_pool3d_avg_bwd(int argc, char* argv[])
 {
    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
    bool do_verification       = true;
@@ -84,7 +84,7 @@ int profile_avg_pool3d_bwd(int argc, char* argv[])

    if(argc != 2 && argc != 33)
    {
-        print_help_avg_pool3d_bwd();
+        print_help_pool3d_avg_bwd();
        return 0;
    }
    else if(argc == 33)
@@ -106,23 +106,14 @@ int profile_avg_pool3d_bwd(int argc, char* argv[])
        pad2      = arg_parser.long_opts["pad2"];
    }

-#ifdef CK_ENABLE_FP16
-    using F16 = ck::half_t;
-#endif
-#ifdef CK_ENABLE_BF16
-    using BF16 = ck::bhalf_t;
-#endif
-#ifdef CK_ENABLE_FP32
-    using F32 = float;
-#endif
+    using F16   = ck::half_t;
+    using BF16  = ck::bhalf_t;
+    using F32   = float;
    using NDHWC = ck::tensor_layout::convolution::NDHWC;

-    if(false)
-        ;
-#ifdef CK_ENABLE_FP16
-    else if(data_type == ck::DataTypeEnum::Half)
+    if(data_type == ck::DataTypeEnum::Half)
    {
-        ck::profiler::profile_avg_pool3d_bwd_impl<F16, F16, F16, NDHWC, NDHWC>(do_verification,
+        ck::profiler::profile_pool3d_avg_bwd_impl<F16, F16, F16, NDHWC, NDHWC>(do_verification,
                                                                               init_method,
                                                                               do_log,
                                                                               time_kernel,
@@ -133,11 +124,9 @@ int profile_avg_pool3d_bwd(int argc, char* argv[])
                                                                               pad1,
                                                                               pad2);
    }
-#endif
-#ifdef CK_ENABLE_BF16
    else if(data_type == ck::DataTypeEnum::BFloat16)
    {
-        ck::profiler::profile_avg_pool3d_bwd_impl<BF16, BF16, BF16, NDHWC, NDHWC>(do_verification,
+        ck::profiler::profile_pool3d_avg_bwd_impl<BF16, BF16, BF16, NDHWC, NDHWC>(do_verification,
                                                                                  init_method,
                                                                                  do_log,
                                                                                  time_kernel,
@@ -148,11 +137,9 @@ int profile_avg_pool3d_bwd(int argc, char* argv[])
                                                                                  pad1,
                                                                                  pad2);
    }
-#endif
-#ifdef CK_ENABLE_FP32
    else if(data_type == ck::DataTypeEnum::Float)
    {
-        ck::profiler::profile_avg_pool3d_bwd_impl<F32, F32, F32, NDHWC, NDHWC>(do_verification,
+        ck::profiler::profile_pool3d_avg_bwd_impl<F32, F32, F32, NDHWC, NDHWC>(do_verification,
                                                                               init_method,
                                                                               do_log,
                                                                               time_kernel,
@@ -163,7 +150,6 @@ int profile_avg_pool3d_bwd(int argc, char* argv[])
                                                                               pad1,
                                                                               pad2);
    }
-#endif
    else
    {
        throw std::runtime_error("not implemented yet");
@@ -172,4 +158,4 @@ int profile_avg_pool3d_bwd(int argc, char* argv[])
    return 0;
 }

-REGISTER_PROFILER_OPERATION("avg_pool3d_bwd", "max_pool bwd", profile_avg_pool3d_bwd);
+REGISTER_PROFILER_OPERATION("pool3d_avg_bwd", "pool3d_avg bwd", profile_pool3d_avg_bwd);
--- a/profiler/src/profile_pool3d_max_bwd.cpp
+++ b/profiler/src/profile_pool3d_max_bwd.cpp
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <vector>
 #include <unordered_map>

 #include "profiler/data_type_enum.hpp"
-#include "profiler/profile_max_pool3d_bwd_impl.hpp"
+#include "profiler/profile_pool3d_max_bwd_impl.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "profiler_operation_registry.hpp"

@@ -50,7 +50,7 @@ struct maxPoolbwdArgParser
    }
 };

-void print_help_max_pool3d_bwd()
+void print_help_pool3d_max_bwd()
 {
    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
              << "arg2: verification (0: no; 1: yes)\n"
@@ -63,12 +63,12 @@ void print_help_max_pool3d_bwd()
              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
-              << "eg: ckProfiler max_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
+              << "eg: ckProfiler pool3d_max_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
              << std::endl;
 }

-int profile_max_pool3d_bwd(int argc, char* argv[])
+int profile_pool3d_max_bwd(int argc, char* argv[])
 {
    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
    bool do_verification       = true;
@@ -85,7 +85,7 @@ int profile_max_pool3d_bwd(int argc, char* argv[])

    if(argc != 2 && argc != 33)
    {
-        print_help_max_pool3d_bwd();
+        print_help_pool3d_max_bwd();
        return 0;
    }
    else if(argc == 33)
@@ -107,23 +107,14 @@ int profile_max_pool3d_bwd(int argc, char* argv[])
        pad2      = arg_parser.long_opts["pad2"];
    }

-#ifdef CK_ENABLE_FP16
-    using F16 = ck::half_t;
-#endif
-#ifdef CK_ENABLE_BF16
+    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-#endif
-#ifdef CK_ENABLE_FP32
-    using F32 = float;
-#endif
-    using I32 = int32_t;
+    using F32  = float;
+    using I32  = int32_t;

-    if(false)
-        ;
-#ifdef CK_ENABLE_FP16
-    else if(data_type == ck::DataTypeEnum::Half)
+    if(data_type == ck::DataTypeEnum::Half)
    {
-        ck::profiler::profile_max_pool3d_bwd_impl<F16, F16, I32, F16, F16, false>(do_verification,
+        ck::profiler::profile_pool3d_max_bwd_impl<F16, F16, I32, F16, F16, false>(do_verification,
                                                                                  init_method,
                                                                                  do_log,
                                                                                  time_kernel,
@@ -134,11 +125,9 @@ int profile_max_pool3d_bwd(int argc, char* argv[])
                                                                                  pad1,
                                                                                  pad2);
    }
-#endif
-#ifdef CK_ENABLE_BF16
    else if(data_type == ck::DataTypeEnum::BFloat16)
    {
-        ck::profiler::profile_max_pool3d_bwd_impl<BF16, BF16, I32, BF16, BF16, false>(
+        ck::profiler::profile_pool3d_max_bwd_impl<BF16, BF16, I32, BF16, BF16, false>(
            do_verification,
            init_method,
            do_log,
@@ -150,11 +139,9 @@ int profile_max_pool3d_bwd(int argc, char* argv[])
            pad1,
            pad2);
    }
-#endif
-#ifdef CK_ENABLE_FP32
    else if(data_type == ck::DataTypeEnum::Float)
    {
-        ck::profiler::profile_max_pool3d_bwd_impl<F32, F32, I32, F32, F32, false>(do_verification,
+        ck::profiler::profile_pool3d_max_bwd_impl<F32, F32, I32, F32, F32, false>(do_verification,
                                                                                  init_method,
                                                                                  do_log,
                                                                                  time_kernel,
@@ -165,7 +152,6 @@ int profile_max_pool3d_bwd(int argc, char* argv[])
                                                                                  pad1,
                                                                                  pad2);
    }
-#endif
    else
    {
        throw std::runtime_error("not implemented yet");
@@ -174,4 +160,4 @@ int profile_max_pool3d_bwd(int argc, char* argv[])
    return 0;
 }

-REGISTER_PROFILER_OPERATION("max_pool3d_bwd", "max_pool3d bwd", profile_max_pool3d_bwd);
+REGISTER_PROFILER_OPERATION("pool3d_max_bwd", "pool3d_max bwd", profile_pool3d_max_bwd);
--- a/profiler/src/profile_pool3d_max_fwd.cpp
+++ b/profiler/src/profile_pool3d_max_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <vector>
@@ -49,7 +49,7 @@ struct maxPoolFwdArgParser
    }
 };

-void print_help_max_pool3d_fwd()
+void print_help_pool3d_max_fwd()
 {
    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
              << "arg2: verification (0: no; 1: yes)\n"
@@ -63,12 +63,12 @@ void print_help_max_pool3d_fwd()
              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
-              << "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
+              << "eg: ckProfiler pool3d_max_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
              << std::endl;
 }

-int profile_max_pool3d_fwd(int argc, char* argv[])
+int profile_pool3d_max_fwd(int argc, char* argv[])
 {
    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
    bool do_verification       = true;
@@ -86,7 +86,7 @@ int profile_max_pool3d_fwd(int argc, char* argv[])

    if(argc != 2 && argc != 34)
    {
-        print_help_max_pool3d_fwd();
+        print_help_pool3d_max_fwd();
        return 0;
    }
    else if(argc == 34)
@@ -109,28 +109,15 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
        pad2      = arg_parser.long_opts["pad2"];
    }

-#ifdef CK_ENABLE_FP16
-    using F16 = ck::half_t;
-#endif
-#ifdef CK_ENABLE_BF16
-    using BF16 = ck::bhalf_t;
-#endif
-#ifdef CK_ENABLE_FP32
-    using F32 = float;
-#endif
+    using F16   = ck::half_t;
+    using BF16  = ck::bhalf_t;
+    using F32   = float;
    using I32   = int32_t;
    using NDHWC = ck::tensor_layout::convolution::NDHWC;

-#if 1
    constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
-#else
-    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
-#endif

-    if(false)
-        ;
-#ifdef CK_ENABLE_FP16
-    else if(data_type == ck::DataTypeEnum::Half)
+    if(data_type == ck::DataTypeEnum::Half)
    {
        if(return_index)
            ck::profiler::
@@ -159,8 +146,6 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
                    pad1,
                    pad2);
    }
-#endif
-#ifdef CK_ENABLE_BF16
    else if(data_type == ck::DataTypeEnum::BFloat16)
    {
        if(return_index)
@@ -202,8 +187,6 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
                                                         pad1,
                                                         pad2);
    }
-#endif
-#ifdef CK_ENABLE_FP32
    else if(data_type == ck::DataTypeEnum::Float)
    {
        if(return_index)
@@ -233,7 +216,6 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
                    pad1,
                    pad2);
    }
-#endif
    else
    {
        throw std::runtime_error("not implemented yet");
@@ -242,4 +224,4 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
    return 0;
 }

-REGISTER_PROFILER_OPERATION("max_pool3d_fwd", "max_pool3d fwd", profile_max_pool3d_fwd);
+REGISTER_PROFILER_OPERATION("pool3d_max_fwd", "pool3d_max fwd", profile_pool3d_max_fwd);
--- a/test/pool/CMakeLists.txt
+++ b/test/pool/CMakeLists.txt
@@ -1,22 +1,22 @@
 add_custom_target(test_pool)

-add_gtest_executable(test_avg_pool3d_bwd test_avg_pool3d_bwd.cpp)
-add_gtest_executable(test_max_pool3d_bwd test_max_pool3d_bwd.cpp)
-add_gtest_executable(test_avg_pool3d_fwd test_avg_pool3d_fwd.cpp)
-add_gtest_executable(test_max_pool3d_fwd test_max_pool3d_fwd.cpp)
-add_gtest_executable(test_avg_pool2d_fwd test_avg_pool2d_fwd.cpp)
-add_gtest_executable(test_max_pool2d_fwd test_max_pool2d_fwd.cpp)
+add_gtest_executable(test_pool3d_avg_bwd test_pool3d_avg_bwd.cpp)
+add_gtest_executable(test_pool3d_max_bwd test_pool3d_max_bwd.cpp)
+add_gtest_executable(test_pool3d_avg_fwd test_pool3d_avg_fwd.cpp)
+add_gtest_executable(test_pool3d_max_fwd test_pool3d_max_fwd.cpp)
+add_gtest_executable(test_pool2d_avg_fwd test_pool2d_avg_fwd.cpp)
+add_gtest_executable(test_pool2d_max_fwd test_pool2d_max_fwd.cpp)

-target_link_libraries(test_avg_pool3d_bwd PRIVATE utility device_avg_pool3d_bwd_instance)
-target_link_libraries(test_max_pool3d_bwd PRIVATE utility device_max_pool_bwd_instance)
-target_link_libraries(test_avg_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
-target_link_libraries(test_max_pool3d_fwd PRIVATE utility device_pool3d_fwd_instance)
-target_link_libraries(test_avg_pool2d_fwd PRIVATE utility device_pool2d_fwd_instance)
-target_link_libraries(test_max_pool2d_fwd PRIVATE utility device_pool2d_fwd_instance)
+target_link_libraries(test_pool3d_avg_bwd PRIVATE utility device_pool3d_avg_bwd_instance)
+target_link_libraries(test_pool3d_max_bwd PRIVATE utility device_pool_max_bwd_instance)
+target_link_libraries(test_pool3d_avg_fwd PRIVATE utility device_pool3d_fwd_instance)
+target_link_libraries(test_pool3d_max_fwd PRIVATE utility device_pool3d_fwd_instance)
+target_link_libraries(test_pool2d_avg_fwd PRIVATE utility device_pool2d_fwd_instance)
+target_link_libraries(test_pool2d_max_fwd PRIVATE utility device_pool2d_fwd_instance)

-add_dependencies(test_pool test_avg_pool3d_bwd)
-add_dependencies(test_pool test_max_pool3d_bwd)
-add_dependencies(test_pool test_avg_pool3d_fwd)
-add_dependencies(test_pool test_max_pool3d_fwd)
-add_dependencies(test_pool test_avg_pool2d_fwd)
-add_dependencies(test_pool test_max_pool2d_fwd)
+add_dependencies(test_pool test_pool3d_avg_bwd)
+add_dependencies(test_pool test_pool3d_max_bwd)
+add_dependencies(test_pool test_pool3d_avg_fwd)
+add_dependencies(test_pool test_pool3d_max_fwd)
+add_dependencies(test_pool test_pool2d_avg_fwd)
+add_dependencies(test_pool test_pool2d_max_fwd)
--- a/test/pool/test_max_pool3d_fwd.cpp
+++ b/test/pool/test_max_pool3d_fwd.cpp
@@ -1,86 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "gtest/gtest.h"
-#include "profiler/profile_pool3d_fwd_impl.hpp"
-#include "test_pool_fwd_common.hpp"
-
-template <typename Tuple>
-class TestMaxPool3dFwd : public ::testing::Test
-{
-    protected:
-    using InDataType      = std::tuple_element_t<0, Tuple>;
-    using OutDataType     = std::tuple_element_t<1, Tuple>;
-    using ComputeDataType = std::tuple_element_t<2, Tuple>;
-    using IndexDataType   = std::tuple_element_t<3, Tuple>;
-
-    std::vector<PoolingParam> params;
-
-    void Run()
-    {
-        for(auto param : params)
-        {
-            // max pool
-            bool success =
-                ck::profiler::profile_pool3d_fwd_impl<InDataType,
-                                                      OutDataType,
-                                                      ComputeDataType,
-                                                      IndexDataType,
-                                                      ck::tensor_layout::convolution::NDHWC,
-                                                      ck::tensor_layout::convolution::NDHWC,
-                                                      ck::ReduceTensorOp::MAX,
-                                                      false,
-                                                      false>(true,
-                                                             2,
-                                                             false,
-                                                             false,
-                                                             param.length_,
-                                                             param.window_spatial_lengths_,
-                                                             param.window_strides_,
-                                                             param.window_dilations_,
-                                                             param.input_left_pads_,
-                                                             param.input_right_pads_);
-            EXPECT_TRUE(success);
-
-            // max pool + index
-            success = ck::profiler::profile_pool3d_fwd_impl<InDataType,
-                                                            OutDataType,
-                                                            ComputeDataType,
-                                                            IndexDataType,
-                                                            ck::tensor_layout::convolution::NDHWC,
-                                                            ck::tensor_layout::convolution::NDHWC,
-                                                            ck::ReduceTensorOp::MAX,
-                                                            false,
-                                                            true>(true,
-                                                                  2,
-                                                                  false,
-                                                                  false,
-                                                                  param.length_,
-                                                                  param.window_spatial_lengths_,
-                                                                  param.window_strides_,
-                                                                  param.window_dilations_,
-                                                                  param.input_left_pads_,
-                                                                  param.input_right_pads_);
-            EXPECT_TRUE(success);
-        }
-    }
-};
-
-#ifdef CK_ENABLE_FP16
-using KernelTypes =
-    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif
-
-TYPED_TEST_SUITE(TestMaxPool3dFwd, KernelTypes);
-TYPED_TEST(TestMaxPool3dFwd, Test_Pool)
-{
-    // length, window_length, window_stride, window_dilation, left_pad, right_pad
-    this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
-                    {{2, 16, 64, 64, 64}, {64, 64, 64}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
-                    {{2, 16, 64, 64, 64}, {4, 4, 4}, {4, 4, 4}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}},
-                    {{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
-
-    this->Run();
-}
--- a/test/pool/test_pool2d_avg_fwd.cpp
+++ b/test/pool/test_pool2d_avg_fwd.cpp
@@ -45,12 +45,8 @@ class TestAvgPool2dFwd : public ::testing::Test
    }
 };

-#ifdef CK_ENABLE_FP16
 using KernelTypes =
    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif

 TYPED_TEST_SUITE(TestAvgPool2dFwd, KernelTypes);
 TYPED_TEST(TestAvgPool2dFwd, Test_Pool)
--- a/test/pool/test_pool2d_max_fwd.cpp
+++ b/test/pool/test_pool2d_max_fwd.cpp
@@ -9,10 +9,11 @@ template <typename Tuple>
 class TestMaxPool2dFwd : public ::testing::Test
 {
    protected:
-    using InDataType      = std::tuple_element_t<0, Tuple>;
-    using OutDataType     = std::tuple_element_t<1, Tuple>;
-    using ComputeDataType = std::tuple_element_t<2, Tuple>;
-    using IndexDataType   = std::tuple_element_t<3, Tuple>;
+    using InDataType                  = std::tuple_element_t<0, Tuple>;
+    using OutDataType                 = std::tuple_element_t<1, Tuple>;
+    using ComputeDataType             = std::tuple_element_t<2, Tuple>;
+    using IndexDataType               = std::tuple_element_t<3, Tuple>;
+    static constexpr bool ReturnIndex = std::tuple_element_t<4, Tuple>::value;

    std::vector<PoolingParam> params;

@@ -30,48 +31,28 @@ class TestMaxPool2dFwd : public ::testing::Test
                                                      ck::tensor_layout::convolution::NHWC,
                                                      ck::ReduceTensorOp::MAX,
                                                      false,
-                                                      false>(true,
-                                                             2,
-                                                             false,
-                                                             false,
-                                                             param.length_,
-                                                             param.window_spatial_lengths_,
-                                                             param.window_strides_,
-                                                             param.window_dilations_,
-                                                             param.input_left_pads_,
-                                                             param.input_right_pads_);
-            EXPECT_TRUE(success);
-
-            // max pool + index
-            success = ck::profiler::profile_pool2d_fwd_impl<InDataType,
-                                                            OutDataType,
-                                                            ComputeDataType,
-                                                            IndexDataType,
-                                                            ck::tensor_layout::convolution::NHWC,
-                                                            ck::tensor_layout::convolution::NHWC,
-                                                            ck::ReduceTensorOp::MAX,
-                                                            false,
-                                                            true>(true,
-                                                                  2,
-                                                                  false,
-                                                                  false,
-                                                                  param.length_,
-                                                                  param.window_spatial_lengths_,
-                                                                  param.window_strides_,
-                                                                  param.window_dilations_,
-                                                                  param.input_left_pads_,
-                                                                  param.input_right_pads_);
+                                                      ReturnIndex>(true,
+                                                                   2,
+                                                                   false,
+                                                                   false,
+                                                                   param.length_,
+                                                                   param.window_spatial_lengths_,
+                                                                   param.window_strides_,
+                                                                   param.window_dilations_,
+                                                                   param.input_left_pads_,
+                                                                   param.input_right_pads_);
            EXPECT_TRUE(success);
        }
    }
 };

-#ifdef CK_ENABLE_FP16
-using KernelTypes =
-    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif
+using true_t  = std::integral_constant<bool, true>;
+using false_t = std::integral_constant<bool, false>;
+// Each data-type combination below is instantiated both with and without ReturnIndex.
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, I32, true_t>,
+                                     std::tuple<F16, F16, F32, I32, false_t>,
+                                     std::tuple<F32, F32, F32, I32, true_t>,
+                                     std::tuple<F32, F32, F32, I32, false_t>>;

 TYPED_TEST_SUITE(TestMaxPool2dFwd, KernelTypes);
 TYPED_TEST(TestMaxPool2dFwd, Test_Pool)
--- a/test/pool/test_pool3d_avg_bwd.cpp
+++ b/test/pool/test_pool3d_avg_bwd.cpp
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_avg_pool3d_bwd_impl.hpp"
+#include "profiler/profile_pool3d_avg_bwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"

 template <typename Tuple>
@@ -22,7 +22,7 @@ class TestAvgPool3dBwd : public ::testing::Test
        for(auto param : params)
        {
            bool success =
-                ck::profiler::profile_avg_pool3d_bwd_impl<DOutDataType,
+                ck::profiler::profile_pool3d_avg_bwd_impl<DOutDataType,
                                                          DInDataType,
                                                          ComputeDataType,
                                                          DOutLayout,
@@ -41,26 +41,9 @@ class TestAvgPool3dBwd : public ::testing::Test
    }
 };

-#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
 using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>,
                                     std::tuple<BF16, BF16, F32, NDHWC, NDHWC>,
                                     std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP32)
-using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>,
-                                     std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
-using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, F32, NDHWC, NDHWC>,
-                                     std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16)
-using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>,
-                                     std::tuple<BF16, BF16, F32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_FP16)
-using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_BF16)
-using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, F32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_FP32)
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, NDHWC, NDHWC>>;
-#endif

 TYPED_TEST_SUITE(TestAvgPool3dBwd, KernelTypes);
 TYPED_TEST(TestAvgPool3dBwd, Test_Pool)
--- a/test/pool/test_pool3d_avg_fwd.cpp
+++ b/test/pool/test_pool3d_avg_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
 #include "profiler/profile_pool3d_fwd_impl.hpp"
@@ -43,12 +43,10 @@ class TestAvgPool3dFwd : public ::testing::Test
        }
    }
 };
-#ifdef CK_ENABLE_FP16
+
 using KernelTypes =
    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif
+
 TYPED_TEST_SUITE(TestAvgPool3dFwd, KernelTypes);
 TYPED_TEST(TestAvgPool3dFwd, Test_Pool)
 {
--- a/test/pool/test_pool3d_max_bwd.cpp
+++ b/test/pool/test_pool3d_max_bwd.cpp
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_max_pool3d_bwd_impl.hpp"
+#include "profiler/profile_pool3d_max_bwd_impl.hpp"
 #include "test_pool_fwd_common.hpp"

 template <typename Tuple>
@@ -23,7 +23,7 @@ class TestMaxPool3dBwd : public ::testing::Test
        for(auto param : params)
        {
            bool success =
-                ck::profiler::profile_max_pool3d_bwd_impl<InDataType,
+                ck::profiler::profile_pool3d_max_bwd_impl<InDataType,
                                                          OutDataType,
                                                          IndexDataType,
                                                          DOutDataType,
@@ -43,26 +43,9 @@ class TestMaxPool3dBwd : public ::testing::Test
    }
 };

-#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
 using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>,
                                     std::tuple<BF16, BF16, I32, NDHWC, NDHWC>,
                                     std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP32)
-using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>,
-                                     std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP32)
-using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, I32, NDHWC, NDHWC>,
-                                     std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_FP16) && defined(CK_ENABLE_BF16)
-using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>,
-                                     std::tuple<BF16, BF16, I32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_FP16)
-using KernelTypes = ::testing::Types<std::tuple<F16, F16, I32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_BF16)
-using KernelTypes = ::testing::Types<std::tuple<BF16, BF16, I32, NDHWC, NDHWC>>;
-#elif defined(CK_ENABLE_FP32)
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, I32, NDHWC, NDHWC>>;
-#endif

 TYPED_TEST_SUITE(TestMaxPool3dBwd, KernelTypes);
 TYPED_TEST(TestMaxPool3dBwd, Test_Pool)
--- a/test/pool/test_pool3d_max_fwd.cpp
+++ b/test/pool/test_pool3d_max_fwd.cpp
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "profiler/profile_pool3d_fwd_impl.hpp"
+#include "test_pool_fwd_common.hpp"
+
+// Typed fixture that runs the 3D pooling forward profiler with ReduceTensorOp::MAX on
+// NDHWC input and output. Tuple is <InDataType, OutDataType, ComputeDataType,
+// IndexDataType, ReturnIndex>, where ReturnIndex is a std::integral_constant<bool, ...>.
+template <typename Tuple>
+class TestMaxPool3dFwd : public ::testing::Test
+{
+    protected:
+    using InDataType                  = std::tuple_element_t<0, Tuple>;
+    using OutDataType                 = std::tuple_element_t<1, Tuple>;
+    using ComputeDataType             = std::tuple_element_t<2, Tuple>;
+    using IndexDataType               = std::tuple_element_t<3, Tuple>;
+    static constexpr bool ReturnIndex = std::tuple_element_t<4, Tuple>::value;
+
+    // Pooling configurations executed by Run(); assigned by the test body.
+    std::vector<PoolingParam> params;
+
+    // Calls profile_pool3d_fwd_impl once per entry of params and expects every call to
+    // return true.
+    void Run()
+    {
+        // Entries are only read here, so bind by const reference instead of copying each one.
+        for(const auto& param : params)
+        {
+            // NOTE(review): the leading (true, 2, false, false) arguments are presumably
+            // do_verification, init_method, do_log and time_kernel -- confirm against
+            // profile_pool3d_fwd_impl.
+            const bool success =
+                ck::profiler::profile_pool3d_fwd_impl<InDataType,
+                                                      OutDataType,
+                                                      ComputeDataType,
+                                                      IndexDataType,
+                                                      ck::tensor_layout::convolution::NDHWC,
+                                                      ck::tensor_layout::convolution::NDHWC,
+                                                      ck::ReduceTensorOp::MAX,
+                                                      false,
+                                                      ReturnIndex>(true,
+                                                                   2,
+                                                                   false,
+                                                                   false,
+                                                                   param.length_,
+                                                                   param.window_spatial_lengths_,
+                                                                   param.window_strides_,
+                                                                   param.window_dilations_,
+                                                                   param.input_left_pads_,
+                                                                   param.input_right_pads_);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+
+// Tag types carried as the fifth tuple element to select ReturnIndex at compile time.
+using true_t  = std::true_type;
+using false_t = std::false_type;
+
+// Every data-type combination is instantiated both with and without ReturnIndex.
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, I32, true_t>,
+                                     std::tuple<F16, F16, F32, I32, false_t>,
+                                     std::tuple<F32, F32, F32, I32, true_t>,
+                                     std::tuple<F32, F32, F32, I32, false_t>>;
+
+TYPED_TEST_SUITE(TestMaxPool3dFwd, KernelTypes);
+TYPED_TEST(TestMaxPool3dFwd, Test_Pool)
+{
+    // length, window_length, window_stride, window_dilation, left_pad, right_pad
+    this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
+                    {{2, 16, 64, 64, 64}, {64, 64, 64}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
+                    {{2, 16, 64, 64, 64}, {4, 4, 4}, {4, 4, 4}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}},
+                    {{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
+
+    this->Run();
+}