From f504e98b5dbfd6deb1d1409646be4736f5a2a6a0 Mon Sep 17 00:00:00 2001
From: aledudek <aleksander.dudek@amd.com>
Date: Tue, 17 Sep 2024 15:57:10 +0200
Subject: [PATCH] Extend pool3d fwd avg, max operations by f8_t, int8_t types
 (#1483)

* Extend pool3d fwd avg, max operations by f8_t, int8_t types

* Pack MaxPool3dFwd params together

* Fix MaxPool3dFwd AVG instances

* Decrease verification precision for bf16

* Adjust tests + review changes

* Adjust threshold for F8

* Adjusted compute types for MAX op instances

* Fix ComputeDataType mismatch in tests and profiler for AVG

* Fix naming from max_pool3d_fwd to pool3d_fwd

* Adjust CMakeLists

---------

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

[ROCm/composable_kernel commit: a793afc961c9ab3a314ffdcacafe6455181d57e1]
---
 .../gpu/pool3d_fwd.hpp                        |  71 +++-
 .../gpu/pool3d_fwd/CMakeLists.txt             |   4 +
 ...vice_avg_pool3d_fwd_ndhwc_f16_instance.cpp |   2 +-
 ...evice_avg_pool3d_fwd_ndhwc_f8_instance.cpp |  24 ++
 ...evice_avg_pool3d_fwd_ndhwc_i8_instance.cpp |  24 ++
 ...ice_max_pool3d_fwd_ndhwc_bf16_instance.cpp |   2 +-
 ...vice_max_pool3d_fwd_ndhwc_f16_instance.cpp |   2 +-
 ...evice_max_pool3d_fwd_ndhwc_f8_instance.cpp |  32 ++
 ...evice_max_pool3d_fwd_ndhwc_i8_instance.cpp |  32 ++
 .../pool3d_fwd/pool_fwd_instance_common.hpp   |   4 +-
 .../profiler/profile_pool3d_fwd_impl.hpp      | 121 ++++---
 profiler/src/CMakeLists.txt                   |   2 +-
 profiler/src/profile_max_pool3d_fwd.cpp       | 245 -------------
 profiler/src/profile_pool3d_fwd.cpp           | 331 ++++++++++++++++++
 test/pool/test_avg_pool3d_fwd.cpp             |  35 +-
 test/pool/test_max_pool3d_fwd.cpp             |  46 +--
 test/pool/test_pool_fwd_common.hpp            |   2 +
 17 files changed, 620 insertions(+), 359 deletions(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp
 delete mode 100644 profiler/src/profile_max_pool3d_fwd.cpp
 create mode 100644 profiler/src/profile_pool3d_fwd.cpp

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
index 94ee68a409..e5e06f9e55 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -22,7 +22,7 @@ static constexpr auto WindowRank = 3;
 
 static constexpr auto MaxOp = ck::ReduceTensorOp::MAX;
 static constexpr auto AvgOp = ck::ReduceTensorOp::AVG;
-#ifdef CK_ENABLE_FP16
+
 // FP16
 void add_device_pool3d_fwd_ndhwc_f16_instances(
     std::vector<std::unique_ptr<
@@ -36,8 +36,22 @@ void add_device_pool3d_fwd_ndhwc_f16_instances(
 void add_device_pool3d_fwd_ndhwc_index_f16_instances(
     std::vector<std::unique_ptr<
         DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, NDHWC, NDHWC, MaxOp, true>>>&);
-#endif
-#ifdef CK_ENABLE_BF16
+
+using F8 = ck::f8_t;
+// F8
+void add_device_pool3d_fwd_ndhwc_f8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NDHWC, NDHWC, MaxOp, false>>>&);
+
+void add_device_pool3d_fwd_ndhwc_f8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NDHWC, NDHWC, AvgOp, false>>>&);
+
+// FP8 - return index
+void add_device_pool3d_fwd_ndhwc_index_f8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NDHWC, NDHWC, MaxOp, true>>>&);
+
 // BF16
 void add_device_pool3d_fwd_ndhwc_bf16_instances(
     std::vector<std::unique_ptr<
@@ -51,8 +65,7 @@ void add_device_pool3d_fwd_ndhwc_bf16_instances(
 void add_device_pool3d_fwd_ndhwc_index_bf16_instances(
     std::vector<std::unique_ptr<
         DevicePoolFwd<InOutRank, WindowRank, BF16, BF16, I32, NDHWC, NDHWC, MaxOp, true>>>&);
-#endif
-#ifdef CK_ENABLE_FP32
+
 // FP32
 void add_device_pool3d_fwd_ndhwc_f32_instances(
     std::vector<std::unique_ptr<
@@ -66,7 +79,21 @@ void add_device_pool3d_fwd_ndhwc_f32_instances(
 void add_device_pool3d_fwd_ndhwc_index_f32_instances(
     std::vector<std::unique_ptr<
         DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, NDHWC, NDHWC, MaxOp, true>>>&);
-#endif
+
+// I8
+void add_device_pool3d_fwd_ndhwc_i8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NDHWC, NDHWC, MaxOp, false>>>&);
+
+void add_device_pool3d_fwd_ndhwc_i8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NDHWC, NDHWC, AvgOp, false>>>&);
+
+// I8 - return index
+void add_device_pool3d_fwd_ndhwc_index_i8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NDHWC, NDHWC, MaxOp, true>>>&);
+
 template <typename InDataType,
           typename OutDataType,
           typename IndexDataType,
@@ -99,7 +126,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
         if constexpr(is_same_v<InLayout, NDHWC> && is_same_v<OutLayout, NDHWC>)
         {
-#ifdef CK_ENABLE_FP16
             if constexpr(is_same_v<InDataType, F16> && is_same_v<OutDataType, F16> &&
                          is_same_v<IndexDataType, I32>)
             {
@@ -112,8 +138,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
                     add_device_pool3d_fwd_ndhwc_f16_instances(op_ptrs);
                 }
             }
-#endif
-#ifdef CK_ENABLE_BF16
             else if constexpr(is_same_v<InDataType, BF16> && is_same_v<OutDataType, BF16> &&
                               is_same_v<IndexDataType, I32>)
             {
@@ -126,8 +150,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
                     add_device_pool3d_fwd_ndhwc_bf16_instances(op_ptrs);
                 }
             }
-#endif
-#ifdef CK_ENABLE_FP32
             else if constexpr(is_same_v<InDataType, F32> && is_same_v<OutDataType, F32> &&
                               is_same_v<IndexDataType, I32>)
             {
@@ -140,7 +162,30 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
                     add_device_pool3d_fwd_ndhwc_f32_instances(op_ptrs);
                 }
             }
-#endif
+            else if constexpr(is_same_v<InDataType, F8> && is_same_v<OutDataType, F8> &&
+                              is_same_v<IndexDataType, I32>)
+            {
+                if constexpr(OutputIndex && ReduceOpId == MaxOp)
+                {
+                    add_device_pool3d_fwd_ndhwc_index_f8_instances(op_ptrs);
+                }
+                else
+                {
+                    add_device_pool3d_fwd_ndhwc_f8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(is_same_v<InDataType, I8> && is_same_v<OutDataType, I8> &&
+                              is_same_v<IndexDataType, I32>)
+            {
+                if constexpr(OutputIndex && ReduceOpId == MaxOp)
+                {
+                    add_device_pool3d_fwd_ndhwc_index_i8_instances(op_ptrs);
+                }
+                else
+                {
+                    add_device_pool3d_fwd_ndhwc_i8_instances(op_ptrs);
+                }
+            }
         }
 
         return op_ptrs;
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
index 533ce89dfd..a40663bf75 100644
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt
@@ -1,6 +1,10 @@
 set(DEVICE_POOL3D_FWD_INSTANCES)
 list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
                                         device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+                                        device_max_pool3d_fwd_ndhwc_f8_instance.cpp
+                                        device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
+                                        device_max_pool3d_fwd_ndhwc_i8_instance.cpp
+                                        device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
                                         device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
                                         device_max_pool3d_fwd_ndhwc_f32_instance.cpp
                                         device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
index 4ebd50bae6..46ef817dd9 100644
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
new file mode 100644
index 0000000000..6069b5a83a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool3d_fwd_ndhwc_f8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
new file mode 100644
index 0000000000..fc286ffccd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool3d_fwd_ndhwc_i8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp
index 5dc504e178..d9da126fcf 100644
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
index 46b16bd005..9644fb992d 100644
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
new file mode 100644
index 0000000000..af31cf8a86
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool3d_fwd_ndhwc_f8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, false>{});
+}
+
+void add_device_pool3d_fwd_ndhwc_index_f8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F8, F8, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp
new file mode 100644
index 0000000000..78278ccc1f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool3d_fwd_ndhwc_i8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, false>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I8, ReduceOpId, false>{});
+}
+
+void add_device_pool3d_fwd_ndhwc_index_i8_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, I8, I8, I32, NDHWC, NDHWC, ReduceOpId, true>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<I8, I8, I32, I8, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
index e8e7813295..e477f1ed73 100644
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -15,6 +15,8 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+using I8    = int8_t;
+using F8    = ck::f8_t;
 using I32   = int32_t;
 using F16   = ck::half_t;
 using BF16  = ck::bhalf_t;
diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
index 02fde48d6e..3bdaa5c838 100644
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -17,6 +17,26 @@
 namespace ck {
 namespace profiler {
 
+struct PoolFwdInputParams
+{
+    int do_verification;
+    int init_method;
+    bool do_log;
+    bool time_kernel;
+    bool return_index;
+    int reduce_op;
+};
+
+struct PoolFwdKernelParams
+{
+    std::vector<index_t> in_length; // NCDHW
+    std::vector<index_t> window_spatial_lengths;
+    std::vector<index_t> window_strides;
+    std::vector<index_t> window_dilations;
+    std::vector<index_t> input_left_pads;
+    std::vector<index_t> input_right_pads;
+};
+
 template <typename InDataType,
           typename OutDataType,
           typename ComputeDataType,
@@ -26,29 +46,23 @@ template <typename InDataType,
           ck::ReduceTensorOp ReduceOpId,
           bool PropagateNan,
           bool OutputIndex>
-bool profile_pool3d_fwd_impl(int do_verification,
-                             int init_method,
-                             bool do_log,
-                             bool time_kernel,
-                             std::vector<index_t> in_length, // NCDHW
-                             std::vector<index_t> window_spatial_lengths,
-                             std::vector<index_t> window_strides,
-                             std::vector<index_t> window_dilations,
-                             std::vector<index_t> input_left_pads,
-                             std::vector<index_t> input_right_pads)
+bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& kernel_params)
 {
     constexpr index_t InOutRank  = 5;
     constexpr index_t WindowRank = 3;
 
-    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
-       window_strides.size() != WindowRank || window_dilations.size() != WindowRank ||
-       input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
+    if(kernel_params.in_length.size() != InOutRank ||
+       kernel_params.window_spatial_lengths.size() != WindowRank ||
+       kernel_params.window_strides.size() != WindowRank ||
+       kernel_params.window_dilations.size() != WindowRank ||
+       kernel_params.input_left_pads.size() != WindowRank ||
+       kernel_params.input_right_pads.size() != WindowRank)
         return false;
 
     std::vector<index_t> out_length(InOutRank);
 
-    int N = in_length[0];
-    int C = in_length[1];
+    int N = kernel_params.in_length[0];
+    int C = kernel_params.in_length[1];
 
     out_length[0] = N;
     out_length[1] = C;
@@ -56,18 +70,18 @@ bool profile_pool3d_fwd_impl(int do_verification,
     // Calculate Do, Ho, Wo
     for(int i = 2; i < InOutRank; ++i)
     {
-        auto pad1             = input_left_pads[i - 2];
-        auto pad2             = input_right_pads[i - 2];
-        auto windows_size     = window_spatial_lengths[i - 2];
-        auto windows_stride   = window_strides[i - 2];
-        auto windows_dilation = window_dilations[i - 2];
+        auto pad1             = kernel_params.input_left_pads[i - 2];
+        auto pad2             = kernel_params.input_right_pads[i - 2];
+        auto windows_size     = kernel_params.window_spatial_lengths[i - 2];
+        auto windows_stride   = kernel_params.window_strides[i - 2];
+        auto windows_dilation = kernel_params.window_dilations[i - 2];
         auto eff              = (windows_size - 1) * windows_dilation + 1;
-        out_length[i]         = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
+        out_length[i] = (kernel_params.in_length[i] + pad1 + pad2 - eff) / windows_stride + 1;
     }
 
-    int Di = in_length[2];
-    int Hi = in_length[3];
-    int Wi = in_length[4];
+    int Di = kernel_params.in_length[2];
+    int Hi = kernel_params.in_length[3];
+    int Wi = kernel_params.in_length[4];
     int Do = out_length[2];
     int Ho = out_length[3];
     int Wo = out_length[4];
@@ -88,7 +102,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
     Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
         f_host_tensor_descriptor(N, C, Do, Ho, Wo));
 
-    switch(init_method)
+    switch(in_params.init_method)
     {
     case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
     case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
@@ -125,7 +139,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
     float best_avg_time   = std::numeric_limits<float>::max();
     float best_gb_per_sec = 0;
 
-    if(do_verification)
+    if(in_params.do_verification)
     {
         using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
                                                                                   WindowRank,
@@ -141,11 +155,11 @@ bool profile_pool3d_fwd_impl(int do_verification,
         auto ref_argument = ref.MakeArgument(in_n_c_di_hi_wi,
                                              out_n_c_do_ho_wo_host,
                                              out_indices_n_c_do_ho_wo_host,
-                                             window_spatial_lengths,
-                                             window_strides,
-                                             window_dilations,
-                                             input_left_pads,
-                                             input_right_pads);
+                                             kernel_params.window_spatial_lengths,
+                                             kernel_params.window_strides,
+                                             kernel_params.window_dilations,
+                                             kernel_params.input_left_pads,
+                                             kernel_params.input_right_pads);
         auto ref_invoker  = ref.MakeInvoker();
         ref_invoker.Run(ref_argument);
     }
@@ -158,16 +172,16 @@ bool profile_pool3d_fwd_impl(int do_verification,
             static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
             static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
             static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
-            in_length,
-            window_spatial_lengths,
+            kernel_params.in_length,
+            kernel_params.window_spatial_lengths,
             out_length,
             {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C},
             {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
             {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
-            window_strides,
-            window_dilations,
-            input_left_pads,
-            input_right_pads,
+            kernel_params.window_strides,
+            kernel_params.window_dilations,
+            kernel_params.input_left_pads,
+            kernel_params.input_right_pads,
             {2, 3, 4});
 
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
@@ -176,10 +190,11 @@ bool profile_pool3d_fwd_impl(int do_verification,
         }
         else
         {
-            if(time_kernel)
+            if(in_params.time_kernel)
             {
                 std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
-                LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
+                LogRange(std::cout << "input lengths = ", kernel_params.in_length, ", ")
+                    << std::endl;
             }
 
             continue;
@@ -187,7 +202,8 @@ bool profile_pool3d_fwd_impl(int do_verification,
 
         auto invoker_ptr = inst_ptr->MakeInvokerPointer();
 
-        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+        float avg_time =
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, in_params.time_kernel});
 
         std::size_t num_bytes = in_n_c_di_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
                                 out_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
@@ -198,7 +214,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
 
         float gb_per_sec = num_bytes / 1.E6 / avg_time;
 
-        if(time_kernel)
+        if(in_params.time_kernel)
             std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
                       << inst_ptr->GetTypeString() << std::endl;
 
@@ -209,25 +225,25 @@ bool profile_pool3d_fwd_impl(int do_verification,
             best_gb_per_sec    = gb_per_sec;
         }
 
-        if(do_verification)
+        if(in_params.do_verification)
         {
             out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
 
-            bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
+            auto tolerance = 1e-3;
+            bool pass      = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
                                              out_n_c_do_ho_wo_host.mData,
                                              "Error: Incorrect results",
-                                             1e-3,
-                                             1e-3);
+                                             tolerance,
+                                             tolerance);
 
             if constexpr(OutputIndex)
             {
                 out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data());
-
                 pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device,
                                                     out_indices_n_c_do_ho_wo_host);
             }
 
-            if(do_log)
+            if(in_params.do_log)
             {
                 LogRangeAsType<float>(
                     std::cout << "in_n_c_di_hi_wi  : ", in_n_c_di_hi_wi.mData, ",")
@@ -249,20 +265,21 @@ bool profile_pool3d_fwd_impl(int do_verification,
             if(!pass)
             {
                 std::cout << inst_ptr->GetTypeString() << " failed verification: ";
-                LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
+                LogRange(std::cout << "lengths = [", kernel_params.in_length, ", ")
+                    << "]." << std::endl;
                 return false;
             }
             else
             {
-                if(time_kernel)
+                if(in_params.time_kernel)
                     std::cout << "pass" << std::endl;
             }
         }
     }
 
-    if(time_kernel)
+    if(in_params.time_kernel)
     {
-        LogRange(std::cout << "length = ", in_length, ",") << std::endl;
+        LogRange(std::cout << "length = ", kernel_params.in_length, ",") << std::endl;
         std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
                   << best_instance_name << std::endl;
     }
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index 8d91c2b5e0..e9528baeb6 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -10,7 +10,7 @@ set(PROFILER_SOURCES
     profile_groupnorm_bwd_gamma_beta.cpp
     profile_layernorm_fwd.cpp
     profile_max_pool2d_fwd.cpp
-    profile_max_pool3d_fwd.cpp
+    profile_pool3d_fwd.cpp
     profile_avg_pool3d_bwd.cpp
     profile_max_pool3d_bwd.cpp
     profile_avg_pool2d_bwd.cpp
diff --git a/profiler/src/profile_max_pool3d_fwd.cpp b/profiler/src/profile_max_pool3d_fwd.cpp
deleted file mode 100644
index 52fdf29fe4..0000000000
--- a/profiler/src/profile_max_pool3d_fwd.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <vector>
-#include <unordered_map>
-
-#include "profiler/data_type_enum.hpp"
-#include "profiler/profile_pool3d_fwd_impl.hpp"
-#include "profiler_operation_registry.hpp"
-
-using ck::index_t;
-
-struct maxPoolFwdArgParser
-{
-    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
-                                                                   {"wsize", {}},
-                                                                   {"wstride", {}},
-                                                                   {"wdilation", {}},
-                                                                   {"pad1", {}},
-                                                                   {"pad2", {}}};
-
-    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
-    {
-        if(std::string("--") + key == argv[i])
-        {
-            int pos = i;
-            while(++i < argc && argv[i][0] != '-') {}
-            int end = i;
-            for(int j = pos + 1; j < end; j++)
-            {
-                long_opts[key].push_back(std::stoi(argv[j]));
-            }
-            return true;
-        }
-        return false;
-    }
-
-    void operator()(int argc, char* argv[])
-    {
-        for(auto& kv : long_opts)
-        {
-            for(int i = 1; i < argc; i++)
-            {
-                if(parse_opt(argc, argv, kv.first, i))
-                    break;
-            }
-        }
-    }
-};
-
-void print_help_max_pool3d_fwd()
-{
-    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
-              << "arg2: verification (0: no; 1: yes)\n"
-              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
-              << "arg4: print tensor value (0: no; 1: yes)\n"
-              << "arg5: time kernel (0=no, 1=yes)\n"
-              << "arg6: return index (0=no, 1=yes)\n"
-              << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
-              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
-              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
-              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
-              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
-              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
-              << "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
-                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
-              << std::endl;
-}
-
-int profile_max_pool3d_fwd(int argc, char* argv[])
-{
-    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
-    bool do_verification       = true;
-    int init_method            = 0;
-    bool do_log                = false;
-    bool time_kernel           = true;
-    bool return_index          = false;
-
-    std::vector<index_t> in_length = {2, 32, 30, 30, 30};
-    std::vector<index_t> wsize     = {2, 2, 2};
-    std::vector<index_t> wstride   = {2, 2, 2};
-    std::vector<index_t> wdilation = {1, 1, 1};
-    std::vector<index_t> pad1      = {1, 1, 1};
-    std::vector<index_t> pad2      = {1, 1, 1};
-
-    if(argc != 2 && argc != 34)
-    {
-        print_help_max_pool3d_fwd();
-        return 0;
-    }
-    else if(argc == 34)
-    {
-        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
-        do_verification = std::stoi(argv[3]);
-        init_method     = std::stoi(argv[4]);
-        do_log          = std::stoi(argv[5]);
-        time_kernel     = std::stoi(argv[6]);
-        return_index    = std::stoi(argv[7]);
-
-        // parse the long options
-        maxPoolFwdArgParser arg_parser;
-        arg_parser(argc, argv);
-        in_length = arg_parser.long_opts["length"];
-        wsize     = arg_parser.long_opts["wsize"];
-        wstride   = arg_parser.long_opts["wstride"];
-        wdilation = arg_parser.long_opts["wdilation"];
-        pad1      = arg_parser.long_opts["pad1"];
-        pad2      = arg_parser.long_opts["pad2"];
-    }
-
-#ifdef CK_ENABLE_FP16
-    using F16 = ck::half_t;
-#endif
-#ifdef CK_ENABLE_BF16
-    using BF16 = ck::bhalf_t;
-#endif
-#ifdef CK_ENABLE_FP32
-    using F32 = float;
-#endif
-    using I32   = int32_t;
-    using NDHWC = ck::tensor_layout::convolution::NDHWC;
-
-#if 1
-    constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
-#else
-    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
-#endif
-
-    if(false)
-        ;
-#ifdef CK_ENABLE_FP16
-    else if(data_type == ck::DataTypeEnum::Half)
-    {
-        if(return_index)
-            ck::profiler::
-                profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
-                    do_verification,
-                    init_method,
-                    do_log,
-                    time_kernel,
-                    in_length,
-                    wsize,
-                    wstride,
-                    wdilation,
-                    pad1,
-                    pad2);
-        else
-            ck::profiler::
-                profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
-                    do_verification,
-                    init_method,
-                    do_log,
-                    time_kernel,
-                    in_length,
-                    wsize,
-                    wstride,
-                    wdilation,
-                    pad1,
-                    pad2);
-    }
-#endif
-#ifdef CK_ENABLE_BF16
-    else if(data_type == ck::DataTypeEnum::BFloat16)
-    {
-        if(return_index)
-            ck::profiler::profile_pool3d_fwd_impl<BF16,
-                                                  BF16,
-                                                  BF16,
-                                                  I32,
-                                                  NDHWC,
-                                                  NDHWC,
-                                                  ReduceOpId,
-                                                  false,
-                                                  true>(do_verification,
-                                                        init_method,
-                                                        do_log,
-                                                        time_kernel,
-                                                        in_length,
-                                                        wsize,
-                                                        wstride,
-                                                        wdilation,
-                                                        pad1,
-                                                        pad2);
-        else
-            ck::profiler::profile_pool3d_fwd_impl<BF16,
-                                                  BF16,
-                                                  BF16,
-                                                  I32,
-                                                  NDHWC,
-                                                  NDHWC,
-                                                  ReduceOpId,
-                                                  false,
-                                                  false>(do_verification,
-                                                         init_method,
-                                                         do_log,
-                                                         time_kernel,
-                                                         in_length,
-                                                         wsize,
-                                                         wstride,
-                                                         wdilation,
-                                                         pad1,
-                                                         pad2);
-    }
-#endif
-#ifdef CK_ENABLE_FP32
-    else if(data_type == ck::DataTypeEnum::Float)
-    {
-        if(return_index)
-            ck::profiler::
-                profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
-                    do_verification,
-                    init_method,
-                    do_log,
-                    time_kernel,
-                    in_length,
-                    wsize,
-                    wstride,
-                    wdilation,
-                    pad1,
-                    pad2);
-        else
-            ck::profiler::
-                profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
-                    do_verification,
-                    init_method,
-                    do_log,
-                    time_kernel,
-                    in_length,
-                    wsize,
-                    wstride,
-                    wdilation,
-                    pad1,
-                    pad2);
-    }
-#endif
-    else
-    {
-        throw std::runtime_error("not implemented yet");
-    }
-
-    return 0;
-}
-
-REGISTER_PROFILER_OPERATION("max_pool3d_fwd", "max_pool3d fwd", profile_max_pool3d_fwd);
diff --git a/profiler/src/profile_pool3d_fwd.cpp b/profiler/src/profile_pool3d_fwd.cpp
new file mode 100644
index 0000000000..4ea1fbcf49
--- /dev/null
+++ b/profiler/src/profile_pool3d_fwd.cpp
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <vector>
+#include <unordered_map>
+
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_pool3d_fwd_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+using ck::index_t;
+
+struct poolFwdArgParser
+{
+    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
+                                                                   {"wsize", {}},
+                                                                   {"wstride", {}},
+                                                                   {"wdilation", {}},
+                                                                   {"pad1", {}},
+                                                                   {"pad2", {}}};
+
+    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
+    {
+        if(std::string("--") + key == argv[i])
+        {
+            int pos = i;
+            while(++i < argc && argv[i][0] != '-') {}
+            int end = i;
+            for(int j = pos + 1; j < end; j++)
+            {
+                long_opts[key].push_back(std::stoi(argv[j]));
+            }
+            return true;
+        }
+        return false;
+    }
+
+    void operator()(int argc, char* argv[])
+    {
+        for(auto& kv : long_opts)
+        {
+            for(int i = 1; i < argc; i++)
+            {
+                if(parse_opt(argc, argv, kv.first, i))
+                    break;
+            }
+        }
+    }
+};
+
+void print_help_pool3d_fwd()
+{
+    std::cout << "arg1: data type (0: fp16; 1: fp32; 3: int8; 5: bf16; 7: fp8)\n"
+              << "arg2: verification (0: no; 1: yes)\n"
+              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+              << "arg4: print tensor value (0: no; 1: yes)\n"
+              << "arg5: time kernel (0=no, 1=yes)\n"
+              << "arg6: return index (0=no, 1=yes)\n"
+              << "arg7: reduce op (0: max; 1: avg)\n"
+              << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
+              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
+              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
+              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
+              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
+              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
+              << "eg: ckProfiler pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
+                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
+              << std::endl;
+}
+
+int profile_pool3d_fwd(int argc, char* argv[])
+{
+    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
+    ck::profiler::PoolFwdInputParams in_params{true, 0, false, true, false, 0};
+    ck::profiler::PoolFwdKernelParams kernel_params{
+        {2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+
+    if(argc != 2 && argc != 35)
+    {
+        print_help_pool3d_fwd();
+        return 0;
+    }
+    else if(argc == 35)
+    {
+        data_type                 = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
+        in_params.do_verification = std::stoi(argv[3]);
+        in_params.init_method     = std::stoi(argv[4]);
+        in_params.do_log          = std::stoi(argv[5]);
+        in_params.time_kernel     = std::stoi(argv[6]);
+        in_params.return_index    = std::stoi(argv[7]);
+        in_params.reduce_op       = std::stoi(argv[8]);
+
+        // parse the long options
+        poolFwdArgParser arg_parser;
+        arg_parser(argc, argv);
+        kernel_params.in_length              = arg_parser.long_opts["length"];
+        kernel_params.window_spatial_lengths = arg_parser.long_opts["wsize"];
+        kernel_params.window_strides         = arg_parser.long_opts["wstride"];
+        kernel_params.window_dilations       = arg_parser.long_opts["wdilation"];
+        kernel_params.input_left_pads        = arg_parser.long_opts["pad1"];
+        kernel_params.input_right_pads       = arg_parser.long_opts["pad2"];
+    }
+
+    using F16   = ck::half_t;
+    using BF16  = ck::bhalf_t;
+    using F32   = float;
+    using I8    = int8_t;
+    using I32   = int32_t;
+    using F8    = ck::f8_t;
+    using NDHWC = ck::tensor_layout::convolution::NDHWC;
+
+    if(data_type == ck::DataTypeEnum::Half)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            ck::profiler::profile_pool3d_fwd_impl<F16,
+                                                  F16,
+                                                  F32,
+                                                  I32,
+                                                  NDHWC,
+                                                  NDHWC,
+                                                  ck::ReduceTensorOp::AVG,
+                                                  false,
+                                                  false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                ck::profiler::profile_pool3d_fwd_impl<F16,
+                                                      F16,
+                                                      F16,
+                                                      I32,
+                                                      NDHWC,
+                                                      NDHWC,
+                                                      ck::ReduceTensorOp::MAX,
+                                                      false,
+                                                      true>(in_params, kernel_params);
+            }
+            else
+            {
+                ck::profiler::profile_pool3d_fwd_impl<F16,
+                                                      F16,
+                                                      F16,
+                                                      I32,
+                                                      NDHWC,
+                                                      NDHWC,
+                                                      ck::ReduceTensorOp::MAX,
+                                                      false,
+                                                      false>(in_params, kernel_params);
+            }
+        }
+    }
+    else if(data_type == ck::DataTypeEnum::BFloat16)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            ck::profiler::profile_pool3d_fwd_impl<BF16,
+                                                  BF16,
+                                                  F32,
+                                                  I32,
+                                                  NDHWC,
+                                                  NDHWC,
+                                                  ck::ReduceTensorOp::AVG,
+                                                  false,
+                                                  false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                ck::profiler::profile_pool3d_fwd_impl<BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      I32,
+                                                      NDHWC,
+                                                      NDHWC,
+                                                      ck::ReduceTensorOp::MAX,
+                                                      false,
+                                                      true>(in_params, kernel_params);
+            }
+            else
+            {
+                ck::profiler::profile_pool3d_fwd_impl<BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      I32,
+                                                      NDHWC,
+                                                      NDHWC,
+                                                      ck::ReduceTensorOp::MAX,
+                                                      false,
+                                                      false>(in_params, kernel_params);
+            }
+        }
+    }
+    else if(data_type == ck::DataTypeEnum::Float)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            ck::profiler::profile_pool3d_fwd_impl<F32,
+                                                  F32,
+                                                  F32,
+                                                  I32,
+                                                  NDHWC,
+                                                  NDHWC,
+                                                  ck::ReduceTensorOp::AVG,
+                                                  false,
+                                                  false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                ck::profiler::profile_pool3d_fwd_impl<F32,
+                                                      F32,
+                                                      F32,
+                                                      I32,
+                                                      NDHWC,
+                                                      NDHWC,
+                                                      ck::ReduceTensorOp::MAX,
+                                                      false,
+                                                      true>(in_params, kernel_params);
+            }
+            else
+            {
+                ck::profiler::profile_pool3d_fwd_impl<F32,
+                                                      F32,
+                                                      F32,
+                                                      I32,
+                                                      NDHWC,
+                                                      NDHWC,
+                                                      ck::ReduceTensorOp::MAX,
+                                                      false,
+                                                      false>(in_params, kernel_params);
+            }
+        }
+    }
+    else if(data_type == ck::DataTypeEnum::Float8)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            return ck::profiler::profile_pool3d_fwd_impl<F8,
+                                                         F8,
+                                                         F32,
+                                                         I32,
+                                                         NDHWC,
+                                                         NDHWC,
+                                                         ck::ReduceTensorOp::AVG,
+                                                         false,
+                                                         false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                return ck::profiler::profile_pool3d_fwd_impl<F8,
+                                                             F8,
+                                                             F8,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             true>(in_params, kernel_params);
+            }
+            else
+            {
+                return ck::profiler::profile_pool3d_fwd_impl<F8,
+                                                             F8,
+                                                             F8,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             false>(in_params, kernel_params);
+            }
+        }
+    }
+    else if(data_type == ck::DataTypeEnum::Int8)
+    {
+        if(in_params.reduce_op == 1)
+        {
+            return ck::profiler::profile_pool3d_fwd_impl<I8,
+                                                         I8,
+                                                         I32,
+                                                         I32,
+                                                         NDHWC,
+                                                         NDHWC,
+                                                         ck::ReduceTensorOp::AVG,
+                                                         false,
+                                                         false>(in_params, kernel_params);
+        }
+        else
+        { // reduce_op == 0
+            if(in_params.return_index)
+            {
+                return ck::profiler::profile_pool3d_fwd_impl<I8,
+                                                             I8,
+                                                             I8,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             true>(in_params, kernel_params);
+            }
+            else
+            {
+                return ck::profiler::profile_pool3d_fwd_impl<I8,
+                                                             I8,
+                                                             I8,
+                                                             I32,
+                                                             NDHWC,
+                                                             NDHWC,
+                                                             ck::ReduceTensorOp::MAX,
+                                                             false,
+                                                             false>(in_params, kernel_params);
+            }
+        }
+    }
+    else
+    {
+        throw std::runtime_error("not implemented yet");
+    }
+
+    return 0;
+}
+
+REGISTER_PROFILER_OPERATION("pool3d_fwd", "pool3d fwd", profile_pool3d_fwd);
diff --git a/test/pool/test_avg_pool3d_fwd.cpp b/test/pool/test_avg_pool3d_fwd.cpp
index fc196a8a07..378b05399e 100644
--- a/test/pool/test_avg_pool3d_fwd.cpp
+++ b/test/pool/test_avg_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool3d_fwd_impl.hpp"
@@ -16,10 +16,19 @@ class TestAvgPool3dFwd : public ::testing::Test
 
     std::vector<PoolingParam> params;
 
+    ck::profiler::PoolFwdInputParams in_params_avg_pool{true, 2, false, false, false, 1};
+
     void Run()
     {
         for(auto param : params)
         {
+            ck::profiler::PoolFwdKernelParams kernel_params{param.length_,
+                                                            param.window_spatial_lengths_,
+                                                            param.window_strides_,
+                                                            param.window_dilations_,
+                                                            param.input_left_pads_,
+                                                            param.input_right_pads_};
+
             bool success =
                 ck::profiler::profile_pool3d_fwd_impl<InDataType,
                                                       OutDataType,
@@ -29,26 +38,18 @@ class TestAvgPool3dFwd : public ::testing::Test
                                                       ck::tensor_layout::convolution::NDHWC,
                                                       ck::ReduceTensorOp::AVG,
                                                       false,
-                                                      false>(true,
-                                                             2,
-                                                             false,
-                                                             false,
-                                                             param.length_,
-                                                             param.window_spatial_lengths_,
-                                                             param.window_strides_,
-                                                             param.window_dilations_,
-                                                             param.input_left_pads_,
-                                                             param.input_right_pads_);
+                                                      false>(in_params_avg_pool, kernel_params);
             EXPECT_TRUE(success);
         }
     }
 };
-#ifdef CK_ENABLE_FP16
-using KernelTypes =
-    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif
+
+using KernelTypes = ::testing::Types<std::tuple<I8, I8, I32, I32>,
+                                     std::tuple<F8, F8, F32, I32>,
+                                     std::tuple<F16, F16, F32, I32>,
+                                     std::tuple<BF16, BF16, F32, I32>,
+                                     std::tuple<F32, F32, F32, I32>>;
+
 TYPED_TEST_SUITE(TestAvgPool3dFwd, KernelTypes);
 TYPED_TEST(TestAvgPool3dFwd, Test_Pool)
 {
diff --git a/test/pool/test_max_pool3d_fwd.cpp b/test/pool/test_max_pool3d_fwd.cpp
index 7189f1b104..d7602f9acd 100644
--- a/test/pool/test_max_pool3d_fwd.cpp
+++ b/test/pool/test_max_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool3d_fwd_impl.hpp"
@@ -16,10 +16,20 @@ class TestMaxPool3dFwd : public ::testing::Test
 
     std::vector<PoolingParam> params;
 
+    ck::profiler::PoolFwdInputParams in_params_max_pool{true, 2, false, false, false, 0};
+    ck::profiler::PoolFwdInputParams in_params_max_pool_indexed{true, 2, false, false, true, 0};
+
     void Run()
     {
         for(auto param : params)
         {
+            ck::profiler::PoolFwdKernelParams kernel_params{param.length_,
+                                                            param.window_spatial_lengths_,
+                                                            param.window_strides_,
+                                                            param.window_dilations_,
+                                                            param.input_left_pads_,
+                                                            param.input_right_pads_};
+
             // max pool
             bool success =
                 ck::profiler::profile_pool3d_fwd_impl<InDataType,
@@ -30,16 +40,7 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                       ck::tensor_layout::convolution::NDHWC,
                                                       ck::ReduceTensorOp::MAX,
                                                       false,
-                                                      false>(true,
-                                                             2,
-                                                             false,
-                                                             false,
-                                                             param.length_,
-                                                             param.window_spatial_lengths_,
-                                                             param.window_strides_,
-                                                             param.window_dilations_,
-                                                             param.input_left_pads_,
-                                                             param.input_right_pads_);
+                                                      false>(in_params_max_pool, kernel_params);
             EXPECT_TRUE(success);
 
             // max pool + index
@@ -51,27 +52,18 @@ class TestMaxPool3dFwd : public ::testing::Test
                                                             ck::tensor_layout::convolution::NDHWC,
                                                             ck::ReduceTensorOp::MAX,
                                                             false,
-                                                            true>(true,
-                                                                  2,
-                                                                  false,
-                                                                  false,
-                                                                  param.length_,
-                                                                  param.window_spatial_lengths_,
-                                                                  param.window_strides_,
-                                                                  param.window_dilations_,
-                                                                  param.input_left_pads_,
-                                                                  param.input_right_pads_);
+                                                            true>(in_params_max_pool_indexed,
+                                                                  kernel_params);
             EXPECT_TRUE(success);
         }
     }
 };
 
-#ifdef CK_ENABLE_FP16
-using KernelTypes =
-    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
-#else
-using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
-#endif
+using KernelTypes = ::testing::Types<std::tuple<I8, I8, I8, I32>,
+                                     std::tuple<F8, F8, F8, I32>,
+                                     std::tuple<F16, F16, F16, I32>,
+                                     std::tuple<BF16, BF16, BF16, I32>,
+                                     std::tuple<F32, F32, F32, I32>>;
 
 TYPED_TEST_SUITE(TestMaxPool3dFwd, KernelTypes);
 TYPED_TEST(TestMaxPool3dFwd, Test_Pool)
diff --git a/test/pool/test_pool_fwd_common.hpp b/test/pool/test_pool_fwd_common.hpp
index 0302c3c337..b510b2f214 100644
--- a/test/pool/test_pool_fwd_common.hpp
+++ b/test/pool/test_pool_fwd_common.hpp
@@ -4,6 +4,8 @@
 #include "gtest/gtest.h"
 #include "ck/ck.hpp"
 
+using I8   = int8_t;
+using F8   = ck::f8_t;
 using F16  = ck::half_t;
 using BF16 = ck::bhalf_t;
 using F32  = float;