From 25aa95786a21af2fc16aeba9f09b106f394dc43f Mon Sep 17 00:00:00 2001
From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com>
Date: Mon, 8 Jul 2024 13:55:54 -0400
Subject: [PATCH] Add ckProfiler support for forward 3D convolutions with OUT
 element-wise operations. (#1354)

[ROCm/composable_kernel commit: eb44e0472a2d58b623007b056173817793c9df3d]
---
 .../ck/library/utility/host_tensor.hpp        |  10 +-
 ...ile_grouped_conv_fwd_outelementop_impl.hpp | 352 ++++++++++++++++++
 profiler/src/CMakeLists.txt                   |   3 +
 .../profile_grouped_conv_fwd_outelementop.cpp | 220 +++++++++++
 .../profile_grouped_conv_fwd_outelementop.sh  |  20 +
 5 files changed, 604 insertions(+), 1 deletion(-)
 create mode 100644 profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp
 create mode 100644 profiler/src/profile_grouped_conv_fwd_outelementop.cpp
 create mode 100755 script/profile_grouped_conv_fwd_outelementop.sh
diff --git a/library/include/ck/library/utility/host_tensor.hpp b/library/include/ck/library/utility/host_tensor.hpp
index ddbd16ad9a..493b992aca 100644
--- a/library/include/ck/library/utility/host_tensor.hpp
+++ b/library/include/ck/library/utility/host_tensor.hpp
@@ -43,7 +43,15 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
             first = false;
         else
             os << delim;
-        os << static_cast<T>(v);
+
+        if constexpr(std::is_same_v<T, ck::f8_t> || std::is_same_v<T, ck::bf8_t>)
+        {
+            os << ck::type_convert<float>(v);
+        }
+        else
+        {
+            os << static_cast<T>(v);
+        }
     }
     return os;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp
new file mode 100644
index 0000000000..bd756eb825
--- /dev/null
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp
@@ -0,0 +1,352 @@
+#pragma once
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convinvscale.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <typename DataType>
+inline constexpr double get_rtol()
+{
+    if constexpr(std::is_same_v<DataType, float>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, int32_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, int8_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+    {
+        return 1e-1; // 240 and 224 are acceptable
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 1.5e-1; // 57344 and 49152 are acceptable
+    }
+    else
+    {
+        return 1e-3;
+    }
+}
+
+template <typename DataType>
+inline constexpr double get_atol()
+{
+    if constexpr(std::is_same_v<DataType, float>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, int32_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, int8_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+    {
+        return 16.1; // 240 and 224 are acceptable
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 8192.1; // 57344 and 49152 are acceptable
+    }
+    else
+    {
+        return 1e-3;
+    }
+}
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename OutElementOp,
+          typename AComputeType = InDataType,
+          typename BComputeType = AComputeType>
+bool profile_grouped_conv_fwd_outelementop_impl(int do_verification,
+                                                int init_method,
+                                                bool do_log,
+                                                bool time_kernel,
+                                                const ck::utils::conv::ConvParam& conv_param)
+{
+    auto pass = true; // return status
+
+    using CShuffleDataType = float;
+
+    using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+    using InElementOp  = PassThrough;
+    using WeiElementOp = PassThrough;
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    Tensor<InDataType> input(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
+    Tensor<CShuffleDataType> c(out_g_n_k_wos_desc);
+    Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
+    Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
+
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "weight: " << weight.mDesc << std::endl;
+    std::cout << "output: " << host_output.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-1, 1});
+        break;
+    default:
+        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
+        weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(input.mData.data());
+    wei_device_buf.ToDevice(weight.mData.data());
+
+    // random scale values
+    auto scale_in = type_convert<float>(
+        type_convert<f8_t>(2.0f * float(RAND_MAX / 2 - std::rand()) / float(RAND_MAX)));
+    auto scale_wei = type_convert<float>(
+        type_convert<f8_t>(2.0f * float(RAND_MAX / 2 - std::rand()) / float(RAND_MAX)));
+    auto scale_out = type_convert<float>(
+        type_convert<f8_t>(2.0f * float(RAND_MAX / 2 - std::rand()) / float(RAND_MAX)));
+
+    // initialize out_element_op for each iteration
+    const auto out_element_op = OutElementOp{scale_in, scale_wei, scale_out};
+
+    std::cout << "scale_in: " << scale_in << std::endl;
+    std::cout << "scale_wei: " << scale_wei << std::endl;
+    std::cout << "scale_out: " << scale_out << std::endl;
+
+    // run reference op
+    if(do_verification)
+    {
+
+        std::cout << "\nVerifying algorithm against reference convolution..." << std::endl;
+        std::cout << "\tUsing (rel_tol,abs_tol) = (" << std::setprecision(7)
+                  << get_rtol<OutDataType>() << ", " << get_atol<OutDataType>() << ")" << std::endl;
+
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     CShuffleDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     PassThrough>{};
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(input,
+                                                  weight,
+                                                  c,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  PassThrough{});
+
+        c.SetZero();
+        ref_invoker.Run(ref_argument);
+
+        host_output.ForEach([&](auto&, auto idx) { out_element_op(host_output(idx), c(idx)); });
+    }
+
+    std::string best_op_name;
+    float best_avg_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-init output to zero before profiling next kernel
+            out_device_buf.SetZero();
+
+            std::string op_name = op_ptr->GetTypeString();
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            std::size_t flop      = conv_param.GetFlops();
+            std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+            float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
+
+            float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                out_device_buf.FromDevice(device_output.mData.data());
+
+                pass = pass & ck::utils::check_err(device_output,
+                                                   host_output,
+                                                   "Error: Device and Host results do not match!",
+                                                   get_rtol<OutDataType>(),
+                                                   get_atol<OutDataType>());
+
+                if(do_log)
+                {
+                    LogRangeAsType<InDataType>(std::cout << "input : ", input.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<WeiDataType>(std::cout << "weight: ", weight.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<OutDataType>(
+                        std::cout << "host_output  : ", host_output.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<OutDataType>(
+                        std::cout << "device_output: ", device_output.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+        }
+    };
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
+                                                                                   InLayout,
+                                                                                   WeiLayout,
+                                                                                   ck::Tuple<>,
+                                                                                   OutLayout,
+                                                                                   InDataType,
+                                                                                   WeiDataType,
+                                                                                   ck::Tuple<>,
+                                                                                   OutDataType,
+                                                                                   InElementOp,
+                                                                                   WeiElementOp,
+                                                                                   OutElementOp,
+                                                                                   AComputeType,
+                                                                                   BComputeType>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl;
+
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                        wei_device_buf.GetDeviceBuffer(),
+                                                        {},
+                                                        out_device_buf.GetDeviceBuffer(),
+                                                        a_g_n_c_wis_lengths,
+                                                        a_g_n_c_wis_strides,
+                                                        b_g_k_c_xs_lengths,
+                                                        b_g_k_c_xs_strides,
+                                                        {},
+                                                        {},
+                                                        e_g_n_k_wos_lengths,
+                                                        e_g_n_k_wos_strides,
+                                                        conv_filter_strides,
+                                                        conv_filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        in_element_op,
+                                                        wei_element_op,
+                                                        out_element_op);
+
+        run_impl(op_ptr, argument_ptr);
+    }
+
+    std::cout << "Best configuration parameters:"
+              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
+              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index c2a9769727..198f49432f 100755
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -57,6 +57,7 @@ if(GPU_TARGETS MATCHES "gfx9")
   list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp)
   list(APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp)
   list(APPEND PROFILER_SOURCES profile_conv_fwd.cpp)
+  list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd_outelementop.cpp)
 
 endif()
 
@@ -134,6 +135,8 @@ if(GPU_TARGETS MATCHES "gfx9")
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convscale_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convinvscale_instance)
 endif()
 
 if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
diff --git a/profiler/src/profile_grouped_conv_fwd_outelementop.cpp b/profiler/src/profile_grouped_conv_fwd_outelementop.cpp
new file mode 100644
index 0000000000..196a2cf3f2
--- /dev/null
+++ b/profiler/src/profile_grouped_conv_fwd_outelementop.cpp
@@ -0,0 +1,220 @@
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp"
+
+#include "ck/utility/data_type.hpp"
+#include "profiler_operation_registry.hpp"
+
+#include <iostream>
+
+enum struct ConvLayout
+{
+    GNHWC_GKYXC_GNHWK = 0,
+    NHWGC_GKYXC_NHWGK = 1
+};
+
+enum struct OutElementOp
+{
+    ConvScale    = 0,
+    ConvInvScale = 1
+};
+
+enum struct ConvDataType
+{
+    F8_F8_F8   = 0,
+    BF8_BF8_F8 = 1,
+    F8_BF8_F8  = 2,
+    BF8_F8_F8  = 3
+};
+
+#define OP_NAME "grouped_conv_fwd_outelementop"
+#define OP_DESC "Grouped Convolution Forward+Elementwise Operation"
+
+static void print_helper_msg()
+{
+    // clang-format off
+    std::cout
+        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
+        << "arg2: data type (0: Input fp8, Weight fp8, Output fp8\n"
+        << "                 1: Input bf8, Weight bf8, Output fp8\n"
+        << "                 2: Input fp8, Weight bf8, Output fp8\n"
+        << "                 3: Input bf8, Weight fp8, Output fp8)\n"
+        << "arg3: element-wise operation (0: ConvScale\n"
+        << "                              1: ConvInvScale)\n"
+        << "arg4: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
+        << "                     1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
+        << "arg5: verification (0: no, 1: yes)\n"
+        << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+        << "arg7: print tensor value (0: no; 1: yes)\n"
+        << "arg8: time kernel (0: no, 1: yes)\n"
+        << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+    // clang-format on
+}
+
+int grouped_conv_fwd_outelementop(int argc, char* argv[])
+{
+
+    // 9 total, 1 for num_dim_spatial
+    if(argc < 10)
+    {
+        print_helper_msg();
+        return 1;
+    }
+
+    const auto data_type       = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const auto op              = static_cast<OutElementOp>(std::stoi(argv[3]));
+    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[4]));
+    const bool do_verification = std::stoi(argv[5]);
+    const int init_method      = std::stoi(argv[6]);
+    const bool do_log          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[8]);
+    const int num_dim_spatial  = std::stoi(argv[9]);
+
+    // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + 1 for argv[0]
+    if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1)
+    {
+        print_helper_msg();
+        return 1;
+    }
+
+    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv);
+
+    using F8  = ck::f8_t;
+    using BF8 = ck::bf8_t;
+
+    using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
+    using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
+    using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
+
+    using ConvScale    = ck::tensor_operation::element_wise::ConvScale;
+    using ConvInvScale = ck::tensor_operation::element_wise::ConvInvscale;
+
+    constexpr auto I3 = ck::Number<3>{};
+
+    auto profile = [&](auto num_dim_spatial_tmp,
+                       auto in_layout,
+                       auto wei_layout,
+                       auto out_layout,
+                       auto in_type,
+                       auto wei_type,
+                       auto out_type,
+                       auto out_element_op,
+                       auto a_compute_type,
+                       auto b_compute_type) {
+        constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
+
+        using InLayout  = decltype(in_layout);
+        using WeiLayout = decltype(wei_layout);
+        using OutLayout = decltype(out_layout);
+
+        using InDataType  = decltype(in_type);
+        using WeiDataType = decltype(wei_type);
+        using OutDataType = decltype(out_type);
+
+        using OutElementOp = decltype(out_element_op);
+
+        using AComputeType = decltype(a_compute_type);
+        using BComputeType = decltype(b_compute_type);
+
+        bool pass = ck::profiler::profile_grouped_conv_fwd_outelementop_impl<NDimSpatial,
+                                                                             InLayout,
+                                                                             WeiLayout,
+                                                                             OutLayout,
+                                                                             InDataType,
+                                                                             WeiDataType,
+                                                                             OutDataType,
+                                                                             OutElementOp,
+                                                                             AComputeType,
+                                                                             BComputeType>(
+            do_verification, init_method, do_log, time_kernel, params);
+
+        return pass ? 0 : 1;
+    };
+
+    if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
+    {
+        if(op == OutElementOp::ConvScale)
+        {
+            if(data_type == ConvDataType::F8_F8_F8)
+            {
+                return profile(
+                    I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, F8{}, F8{}, ConvScale{}, F8{}, F8{});
+            }
+            else if(data_type == ConvDataType::BF8_BF8_F8)
+            {
+                return profile(I3,
+                               NDHWGC{},
+                               GKZYXC{},
+                               NDHWGK{},
+                               BF8{},
+                               BF8{},
+                               F8{},
+                               ConvScale{},
+                               BF8{},
+                               BF8{});
+            }
+            else if(data_type == ConvDataType::F8_BF8_F8)
+            {
+                return profile(
+                    I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, BF8{}, F8{}, ConvScale{}, F8{}, BF8{});
+            }
+            else if(data_type == ConvDataType::BF8_F8_F8)
+            {
+                return profile(
+                    I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, F8{}, F8{}, ConvScale{}, BF8{}, F8{});
+            }
+        }
+        else if(op == OutElementOp::ConvInvScale)
+        {
+            if(data_type == ConvDataType::F8_F8_F8)
+            {
+                return profile(
+                    I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, F8{}, F8{}, ConvInvScale{}, F8{}, F8{});
+            }
+            else if(data_type == ConvDataType::BF8_BF8_F8)
+            {
+                return profile(I3,
+                               NDHWGC{},
+                               GKZYXC{},
+                               NDHWGK{},
+                               BF8{},
+                               BF8{},
+                               F8{},
+                               ConvInvScale{},
+                               BF8{},
+                               BF8{});
+            }
+            else if(data_type == ConvDataType::F8_BF8_F8)
+            {
+                return profile(I3,
+                               NDHWGC{},
+                               GKZYXC{},
+                               NDHWGK{},
+                               F8{},
+                               BF8{},
+                               F8{},
+                               ConvInvScale{},
+                               F8{},
+                               BF8{});
+            }
+            else if(data_type == ConvDataType::BF8_F8_F8)
+            {
+                return profile(I3,
+                               NDHWGC{},
+                               GKZYXC{},
+                               NDHWGK{},
+                               BF8{},
+                               F8{},
+                               F8{},
+                               ConvInvScale{},
+                               BF8{},
+                               F8{});
+            }
+        }
+    }
+
+    std::cout << "this data_type & layout is not implemented" << std::endl;
+
+    return 1;
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, grouped_conv_fwd_outelementop);
diff --git a/script/profile_grouped_conv_fwd_outelementop.sh b/script/profile_grouped_conv_fwd_outelementop.sh
new file mode 100755
index 0000000000..ac444a25c2
--- /dev/null
+++ b/script/profile_grouped_conv_fwd_outelementop.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+## GPU visibility
+export HIP_VISIBLE_DEVICES=0
+DRIVER="../build/bin/ckProfiler"
+
+OP=$1
+DATATYPE=$2
+OUTELEMENTOP=$3
+LAYOUT=$4
+VERIFY=$5
+INIT=$6
+LOG=$7
+TIME=$8
+
+N=$9
+
+#######  op    datatype  OUTELEMENTOP  layout   verify   init   log   time  Ndims  G    N    K     C   Z   Y   X   Di  Hi   Wi  Sz  Sy  Sx  Dz  Dy  Dx  Left Pz LeftPy  LeftPx  RightPz RightPy  RightPx
+$DRIVER $OP   $DATATYPE $OUTELEMENTOP $LAYOUT  $VERIFY  $INIT  $LOG  $TIME      3 32   $N   96    96   3   3   3   28  28   28   1   1   1   1   1   1        1      1       1        1       1        1
+$DRIVER $OP   $DATATYPE $OUTELEMENTOP $LAYOUT  $VERIFY  $INIT  $LOG  $TIME      3 32   $N  192   192   3   3   3   28  28   28   1   1   1   1   1   1        1      1       1        1       1        1