Adding remaining conv, dynamic_op, and scaleadd_scaleadd_relu flavors for grouped conv fwd (#3529)

* Adding remaining flavors for grouped conv fwd

As titled. Following variants are added:
- grouped_conv2d_fwd_dynamic_op
- grouped_conv3d_fwd_dynamic_op
- grouped_conv3d_fwd_bilinear
- grouped_conv3d_fwd_convscale
- grouped_conv3d_fwd_convinvscale
- grouped_conv3d_fwd_convscale_add
- grouped_conv3d_fwd_convscale_relu
- grouped_conv3d_fwd_scale
- grouped_conv3d_fwd_combconvscale
- grouped_conv3d_fwd_scaleadd_scaleadd_relu

* Fix incomplete parsing of types from source names in add_instance_library() cmakelists function so we don't build f8 on RDNA3.

* Do not build f8 / bf8 only flavor tests on RDNA3

* Make sure we have proper generic instances for all instance lists related to the post-ces extra flavors, with scalarPerVector = 1. Then disable all but one generic instance per instance list to reduce compile time.

* Post rebase fix: Template parameters for Grouped Conv Fwd Device Impl got tweaked upstream.

* adding int8 and fp16 overloads to the elementwise operations

* fixed copilot nits

* Addressing review comments:

- removed unnecessary examples for dynamic op
- removed unnecessary conv specializations for all the flavors
- removed spurious bilinear and scale source files

* clang-format

* reduced number of tests

---------

Co-authored-by: Wojciech Laskowski <wojciech.laskowski@streamhpc.com>
This commit is contained in:
Kiefer van Teutem
2026-01-30 17:02:14 +01:00
committed by GitHub
parent 6a6177a246
commit 2377a62837
72 changed files with 5178 additions and 34 deletions

View File

@@ -7,6 +7,9 @@ if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
# Bilinear flavor: only a 3D instance library exists.
add_gtest_executable(test_grouped_convnd_fwd_bilinear test_grouped_convnd_fwd_bilinear.cpp)
target_link_libraries(test_grouped_convnd_fwd_bilinear PRIVATE utility device_grouped_conv3d_fwd_bilinear_instance)
# Dynamic-op flavor: links both the 2D and the 3D instance libraries.
add_gtest_executable(test_grouped_convnd_fwd_dynamic_op test_grouped_convnd_fwd_dynamic_op.cpp)
target_link_libraries(test_grouped_convnd_fwd_dynamic_op PRIVATE utility device_grouped_conv2d_fwd_dynamic_op_instance device_grouped_conv3d_fwd_dynamic_op_instance)
# Scale-add-AB flavor: only a 3D instance library exists.
add_gtest_executable(test_grouped_convnd_fwd_scaleadd_ab test_grouped_convnd_fwd_scaleadd_ab.cpp)
target_link_libraries(test_grouped_convnd_fwd_scaleadd_ab PRIVATE utility device_grouped_conv3d_fwd_scaleadd_ab_instance)

View File

@@ -0,0 +1,180 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "profiler/profile_grouped_conv_fwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
using I8 = int8_t;
using F8 = ck::f8_t;
using BF8 = ck::bf8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
/// Typed fixture for grouped conv fwd with a runtime-selected (dynamic) unary
/// elementwise op. Tuple: (In, Wei, Out, ACompute, BCompute, InLayout,
/// WeiLayout, OutLayout).
template <typename Tuple>
class TestGroupedConvndFwdDynamicOp : public ::testing::Test
{
    protected:
    using InDataType   = std::tuple_element_t<0, Tuple>;
    using WeiDataType  = std::tuple_element_t<1, Tuple>;
    using OutDataType  = std::tuple_element_t<2, Tuple>;
    using AComputeType = std::tuple_element_t<3, Tuple>;
    using BComputeType = std::tuple_element_t<4, Tuple>;
    using InLayout     = std::tuple_element_t<5, Tuple>;
    using WeiLayout    = std::tuple_element_t<6, Tuple>;
    using OutLayout    = std::tuple_element_t<7, Tuple>;
    using IndexType    = ck::index_t;

    // Convolution problems to profile; populated by the TYPED_TESTs.
    std::vector<ck::utils::conv::ConvParam> conv_params;

    /// Profiles every problem in conv_params with DynamicUnaryOp(PassThrough)
    /// and expects all of them to verify.
    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        // The FP8/BF8 capability check depends only on the fixture's data type
        // and the device, not on the individual problem, so do it once up front.
        if(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")
        {
            if(std::is_same<InDataType, F8>::value || std::is_same<InDataType, BF8>::value)
            {
                printf("Skipping FP8 / BF8 tests on CDNA1/2.\n");
                return; // pass stays vacuously true, matching the old per-param `continue`
            }
        }

        const auto dynamic_op = ck::tensor_operation::element_wise::DynamicUnaryOp{
            ck::tensor_operation::element_wise::PassThrough{}};

        bool pass = true;
        for(auto& param : conv_params)
        {
            // Run the profiler before folding into `pass`: the previous
            // `pass = pass && profile(...)` short-circuited, so every
            // configuration after the first failure was silently skipped.
            const bool ok = ck::profiler::profile_grouped_conv_fwd_impl<
                NDimSpatial,
                InLayout,
                WeiLayout,
                OutLayout,
                InDataType,
                WeiDataType,
                OutDataType,
                AComputeType,
                BComputeType,
                IndexType,
                ck::tensor_operation::element_wise::DynamicUnaryOp>(true,  // do_verification
                                                                    1,     // init_method: integer value
                                                                    false, // do_log
                                                                    true,  // time_kernel
                                                                    param,
                                                                    dynamic_op);
            pass = ok && pass;
        }
        EXPECT_TRUE(pass);
    }
};
using namespace ck::tensor_layout::convolution;
// 2D configurations: (In, Wei, Out, ACompute, BCompute, InLayout, WeiLayout, OutLayout).
using KernelTypes2d =
::testing::Types<std::tuple<F16, F16, F16, F16, F16, NHWGC, GKYXC, NHWGK>,
std::tuple<BF16, BF16, BF16, BF16, BF16, NHWGC, GKYXC, NHWGK>,
std::tuple<I8, I8, I8, I8, I8, NHWGC, GKYXC, NHWGK>,
std::tuple<F32, F32, F32, F32, F32, NHWGC, GKYXC, NHWGK>>;
// 3D configurations: same data types with the NDHWGC/GKZYXC/NDHWGK layouts.
using KernelTypes3d =
::testing::Types<std::tuple<F16, F16, F16, F16, F16, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<BF16, BF16, BF16, BF16, BF16, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<I8, I8, I8, I8, I8, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<F32, F32, F32, F32, F32, NDHWGC, GKZYXC, NDHWGK>>;
// Thin fixtures so the 2D and 3D suites can be instantiated separately.
template <typename Tuple>
class TestGroupedConvndFwdDynamicOp2d : public TestGroupedConvndFwdDynamicOp<Tuple>
{
};
template <typename Tuple>
class TestGroupedConvndFwdDynamicOp3d : public TestGroupedConvndFwdDynamicOp<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndFwdDynamicOp2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndFwdDynamicOp3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndFwdDynamicOp2d, Test2D)
{
    // Problem table; fields appear to follow ck::utils::conv::ConvParam's
    // constructor order {ndim, G, N, K, C, filter, spatial, strides, dilations,
    // left pads, right pads} -- confirm against ConvParam if extending.
    this->conv_params = {
        {2, 3, 5, 96, 200, {1, 1}, {73, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
        {2, 1, 1, 32, 32, {1, 1}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
        {2, 1, 1, 32, 32, {2, 2}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
        {2, 1, 1, 32, 32, {3, 3}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
        {2, 1, 1, 32, 32, {5, 5}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
        {2, 1, 1, 32, 32, {9, 9}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
        {2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}},
        {2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}},
        {2, 2, 32, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
        {2, 1, 1, 1, 32, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}},
        {2, 1, 1, 64, 3, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}},
        {2, 1, 1, 1, 1, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}},
        {2, 96, 1, 1, 1, {1, 1}, {120, 160}, {1, 1}, {1, 1}, {1, 1}, {1, 1}},
        {2, 96, 1, 1, 1, {3, 3}, {120, 160}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}};
    this->template Run<2>();
}
TYPED_TEST(TestGroupedConvndFwdDynamicOp3d, Test3D)
{
    // Problem table; fields appear to follow ck::utils::conv::ConvParam's
    // constructor order {ndim, G, N, K, C, filter, spatial, strides, dilations,
    // left pads, right pads} -- confirm against ConvParam if extending.
    this->conv_params = {
        {3, 3, 5, 96, 200, {1, 1, 1}, {37, 37, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {1, 1, 1}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {2, 2, 2}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {5, 5, 5}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {9, 9, 9}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {1, 1, 1}, {16, 16, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
        {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
        {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
        {3, 96, 1, 1, 1, {1, 1, 1}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
        {3, 96, 1, 1, 1, {3, 3, 3}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->template Run<3>();
}

View File

@@ -1,6 +1,26 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
# NOTE: this regex intentionally omits gfx11 (unlike the block below) -- the
# convscale/convinvscale flavors are f8/bf8 only and are not built on RDNA3.
if(GPU_TARGETS MATCHES "gfx9|gfx12")
add_gtest_executable(test_grouped_convnd_fwd_convinvscale test_grouped_convnd_fwd_convinvscale.cpp)
target_link_libraries(test_grouped_convnd_fwd_convinvscale PRIVATE utility device_grouped_conv3d_fwd_convinvscale_instance)
add_gtest_executable(test_grouped_convnd_fwd_convscaleadd test_grouped_convnd_fwd_convscaleadd.cpp)
target_link_libraries(test_grouped_convnd_fwd_convscaleadd PRIVATE utility device_grouped_conv3d_fwd_convscale_add_instance)
add_gtest_executable(test_grouped_convnd_fwd_convscalerelu test_grouped_convnd_fwd_convscalerelu.cpp)
target_link_libraries(test_grouped_convnd_fwd_convscalerelu PRIVATE utility device_grouped_conv3d_fwd_convscale_relu_instance)
add_gtest_executable(test_grouped_convnd_fwd_convscale test_grouped_convnd_fwd_convscale.cpp)
target_link_libraries(test_grouped_convnd_fwd_convscale PRIVATE utility device_grouped_conv3d_fwd_convscale_instance)
# The "comb" variants reuse the plain convscale / convscale_relu instance libraries.
add_gtest_executable(test_grouped_convnd_fwd_combconvscale test_grouped_convnd_fwd_combconvscale.cpp)
target_link_libraries(test_grouped_convnd_fwd_combconvscale PRIVATE utility device_grouped_conv3d_fwd_convscale_instance)
add_gtest_executable(test_grouped_convnd_fwd_combconvscalerelu test_grouped_convnd_fwd_combconvscalerelu.cpp)
target_link_libraries(test_grouped_convnd_fwd_combconvscalerelu PRIVATE utility device_grouped_conv3d_fwd_convscale_relu_instance)
endif()
if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
add_gtest_executable(test_grouped_convnd_fwd_bias_clamp test_grouped_convnd_fwd_bias_clamp.cpp)
target_link_libraries(test_grouped_convnd_fwd_bias_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_clamp_instance device_grouped_conv3d_fwd_bias_clamp_instance)
@@ -23,4 +43,7 @@ if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
add_gtest_executable(test_grouped_convnd_fwd_gk_bias_bnorm_clamp test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp)
target_link_libraries(test_grouped_convnd_fwd_gk_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance)
# scaleadd_scaleadd_relu flavor: only a 3D instance library exists.
add_gtest_executable(test_grouped_convnd_fwd_scaleadd_scaleadd_relu test_grouped_convnd_fwd_scaleadd_scaleadd_relu.cpp)
target_link_libraries(test_grouped_convnd_fwd_scaleadd_scaleadd_relu PRIVATE utility device_grouped_conv3d_fwd_scaleadd_scaleadd_relu_instance)
endif()

View File

@@ -0,0 +1,94 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
using CombConvScale = ck::tensor_operation::element_wise::ScaleScalePass;
/// Typed fixture for grouped conv fwd with the combined ScaleScalePass output
/// elementwise op. Tuple: (In, Wei, Out, InLayout, WeiLayout, OutLayout).
template <typename Tuple>
class TestGroupedConvndFwdCombConvScale : public ::testing::Test
{
    protected:
    using InDataType  = std::tuple_element_t<0, Tuple>;
    using WeiDataType = std::tuple_element_t<1, Tuple>;
    using OutDataType = std::tuple_element_t<2, Tuple>;
    using InLayout    = std::tuple_element_t<3, Tuple>;
    using WeiLayout   = std::tuple_element_t<4, Tuple>;
    using OutLayout   = std::tuple_element_t<5, Tuple>;
    using IndexType   = ck::index_t;

    // Convolution problems to profile; populated by the TYPED_TESTs.
    std::vector<ck::utils::conv::ConvParam> conv_params;

    /// Profiles every problem in conv_params and expects all to verify.
    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        // FP8/BF8 capability depends only on the fixture's data type and the
        // device, not on the individual problem, so check once up front.
        if(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")
        {
            if(std::is_same<InDataType, ck::f8_t>::value ||
               std::is_same<InDataType, ck::bf8_t>::value)
            {
                printf("Skipping FP8 / BF8 tests on CDNA1/2.\n");
                return; // pass stays vacuously true, matching the old per-param `continue`
            }
        }

        bool pass = true;
        for(auto& param : conv_params)
        {
            // Run the profiler before folding into `pass`: the previous
            // `pass = pass && profile(...)` short-circuited, so every
            // configuration after the first failure was silently skipped.
            const bool ok =
                ck::profiler::profile_grouped_conv_fwd_outelementop_impl<NDimSpatial,
                                                                         InLayout,
                                                                         WeiLayout,
                                                                         OutLayout,
                                                                         InDataType,
                                                                         WeiDataType,
                                                                         OutDataType,
                                                                         CombConvScale,
                                                                         InDataType,
                                                                         InDataType>(
                    true,  // do_verification
                    1,     // init_method: integer value
                    false, // do_log
                    true,  // time_kernel
                    param);
            pass = ok && pass;
        }
        EXPECT_TRUE(pass);
    }
};
using namespace ck::tensor_layout::convolution;
// Single f8 configuration with f32 output: (In, Wei, Out, InLayout, WeiLayout, OutLayout).
using CombConvScaleKernelTypes3d =
::testing::Types<std::tuple<ck::f8_t, ck::f8_t, float, NDHWGC, GKZYXC, NDHWGK>>;
// Thin fixture so the 3D suite can be instantiated on its own type list.
template <typename Tuple>
class TestGroupedConvndFwdCombConvScale3d : public TestGroupedConvndFwdCombConvScale<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndFwdCombConvScale3d, CombConvScaleKernelTypes3d);
TYPED_TEST(TestGroupedConvndFwdCombConvScale3d, Test3D)
{
    // Problem table; fields appear to follow ck::utils::conv::ConvParam's
    // constructor order -- confirm against ConvParam if extending.
    this->conv_params = {
        {3, 3, 5, 96, 200, {1, 1, 1}, {37, 37, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {9, 9, 9}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 96, 1, 1, 1, {1, 1, 1}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->template Run<3>();
}

View File

@@ -0,0 +1,95 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
using CombConvScaleRelu = ck::tensor_operation::element_wise::ScaleScaleRelu;
/// Typed fixture for grouped conv fwd with the combined ScaleScaleRelu output
/// elementwise op. Tuple: (In, Wei, Out, InLayout, WeiLayout, OutLayout).
template <typename Tuple>
class TestGroupedConvndFwdCombConvScaleRelu : public ::testing::Test
{
    protected:
    using InDataType  = std::tuple_element_t<0, Tuple>;
    using WeiDataType = std::tuple_element_t<1, Tuple>;
    using OutDataType = std::tuple_element_t<2, Tuple>;
    using InLayout    = std::tuple_element_t<3, Tuple>;
    using WeiLayout   = std::tuple_element_t<4, Tuple>;
    using OutLayout   = std::tuple_element_t<5, Tuple>;
    using IndexType   = ck::index_t;

    // Convolution problems to profile; populated by the TYPED_TESTs.
    std::vector<ck::utils::conv::ConvParam> conv_params;

    /// Profiles every problem in conv_params and expects all to verify.
    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        // FP8/BF8 capability depends only on the fixture's data type and the
        // device, not on the individual problem, so check once up front.
        if(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")
        {
            if(std::is_same<InDataType, ck::f8_t>::value ||
               std::is_same<InDataType, ck::bf8_t>::value)
            {
                printf("Skipping FP8 / BF8 tests on CDNA1/2.\n");
                return; // pass stays vacuously true, matching the old per-param `continue`
            }
        }

        bool pass = true;
        for(auto& param : conv_params)
        {
            // Run the profiler before folding into `pass`: the previous
            // `pass = pass && profile(...)` short-circuited, so every
            // configuration after the first failure was silently skipped.
            const bool ok =
                ck::profiler::profile_grouped_conv_fwd_outelementop_impl<NDimSpatial,
                                                                         InLayout,
                                                                         WeiLayout,
                                                                         OutLayout,
                                                                         InDataType,
                                                                         WeiDataType,
                                                                         OutDataType,
                                                                         CombConvScaleRelu,
                                                                         InDataType,
                                                                         InDataType>(
                    true,  // do_verification
                    1,     // init_method: integer value
                    false, // do_log
                    true,  // time_kernel
                    param);
            pass = ok && pass;
        }
        EXPECT_TRUE(pass);
    }
};
using namespace ck::tensor_layout::convolution;
// Single f8 configuration with f32 output: (In, Wei, Out, InLayout, WeiLayout, OutLayout).
using CombConvScaleReluKernelTypes3d =
::testing::Types<std::tuple<ck::f8_t, ck::f8_t, float, NDHWGC, GKZYXC, NDHWGK>>;
// Thin fixture so the 3D suite can be instantiated on its own type list.
template <typename Tuple>
class TestGroupedConvndFwdCombConvScaleRelu3d : public TestGroupedConvndFwdCombConvScaleRelu<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndFwdCombConvScaleRelu3d, CombConvScaleReluKernelTypes3d);
TYPED_TEST(TestGroupedConvndFwdCombConvScaleRelu3d, Test3D)
{
    // Problem table; fields appear to follow ck::utils::conv::ConvParam's
    // constructor order -- confirm against ConvParam if extending.
    this->conv_params = {
        {3, 3, 5, 96, 200, {1, 1, 1}, {37, 37, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {9, 9, 9}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 96, 1, 1, 1, {1, 1, 1}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->template Run<3>();
}

View File

@@ -0,0 +1,89 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using ConvInvscale = ck::tensor_operation::element_wise::ConvInvscale;
/// Typed fixture for grouped conv fwd with the ConvInvscale output elementwise
/// op; one data type is used for In/Wei/Out. Tuple: (Data, InLayout,
/// WeiLayout, OutLayout).
template <typename Tuple>
class TestGroupedConvndFwdConvInvscale : public ::testing::Test
{
    protected:
    using DataType  = std::tuple_element_t<0, Tuple>;
    using InLayout  = std::tuple_element_t<1, Tuple>;
    using WeiLayout = std::tuple_element_t<2, Tuple>;
    using OutLayout = std::tuple_element_t<3, Tuple>;
    using IndexType = ck::index_t;

    // Convolution problems to profile; populated by the TYPED_TESTs.
    std::vector<ck::utils::conv::ConvParam> conv_params;

    /// Profiles every problem in conv_params and expects all to verify.
    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        // FP8/BF8 capability depends only on the fixture's data type and the
        // device, not on the individual problem, so check once up front.
        if(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")
        {
            if(std::is_same<DataType, ck::f8_t>::value ||
               std::is_same<DataType, ck::bf8_t>::value)
            {
                printf("Skipping FP8 / BF8 tests on CDNA1/2.\n");
                return; // pass stays vacuously true, matching the old per-param `continue`
            }
        }

        bool pass = true;
        for(auto& param : conv_params)
        {
            // Run the profiler before folding into `pass`: the previous
            // `pass = pass && profile(...)` short-circuited, so every
            // configuration after the first failure was silently skipped.
            const bool ok =
                ck::profiler::profile_grouped_conv_fwd_outelementop_impl<NDimSpatial,
                                                                         InLayout,
                                                                         WeiLayout,
                                                                         OutLayout,
                                                                         DataType,
                                                                         DataType,
                                                                         DataType,
                                                                         ConvInvscale>(
                    true,  // do_verification
                    1,     // init_method: integer value
                    false, // do_log
                    true,  // time_kernel
                    param);
            pass = ok && pass;
        }
        EXPECT_TRUE(pass);
    }
};
using namespace ck::tensor_layout::convolution;
// Single f8 configuration: (Data, InLayout, WeiLayout, OutLayout).
using KernelTypes3d = ::testing::Types<std::tuple<ck::f8_t, NDHWGC, GKZYXC, NDHWGK>>;
// Thin fixture so the 3D suite can be instantiated on its own type list.
template <typename Tuple>
class TestGroupedConvndFwdConvInvscale3d : public TestGroupedConvndFwdConvInvscale<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndFwdConvInvscale3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndFwdConvInvscale3d, Test3D)
{
    // Problem table; fields appear to follow ck::utils::conv::ConvParam's
    // constructor order -- confirm against ConvParam if extending.
    this->conv_params = {
        {3, 3, 5, 96, 200, {1, 1, 1}, {37, 37, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {9, 9, 9}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 96, 1, 1, 1, {1, 1, 1}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->template Run<3>();
}

View File

@@ -0,0 +1,97 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using ConvScale = ck::tensor_operation::element_wise::ConvScale;
/// Typed fixture for grouped conv fwd with the ConvScale output elementwise op
/// and independent A/B compute types. Tuple: (In, Wei, Out, ACompute,
/// BCompute, InLayout, WeiLayout, OutLayout).
template <typename Tuple>
class TestGroupedConvndFwdConvScale : public ::testing::Test
{
    protected:
    using InDataType   = std::tuple_element_t<0, Tuple>;
    using WeiDataType  = std::tuple_element_t<1, Tuple>;
    using OutDataType  = std::tuple_element_t<2, Tuple>;
    using AComputeType = std::tuple_element_t<3, Tuple>;
    using BComputeType = std::tuple_element_t<4, Tuple>;
    using InLayout     = std::tuple_element_t<5, Tuple>;
    using WeiLayout    = std::tuple_element_t<6, Tuple>;
    using OutLayout    = std::tuple_element_t<7, Tuple>;
    using IndexType    = ck::index_t;

    // Convolution problems to profile; populated by the TYPED_TESTs.
    std::vector<ck::utils::conv::ConvParam> conv_params;

    /// Profiles every problem in conv_params and expects all to verify.
    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        // FP8/BF8 capability depends only on the fixture's data type and the
        // device, not on the individual problem, so check once up front.
        if(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")
        {
            if(std::is_same<InDataType, ck::f8_t>::value ||
               std::is_same<InDataType, ck::bf8_t>::value)
            {
                printf("Skipping FP8 / BF8 tests on CDNA1/2.\n");
                return; // pass stays vacuously true, matching the old per-param `continue`
            }
        }

        bool pass = true;
        for(auto& param : conv_params)
        {
            // Run the profiler before folding into `pass`: the previous
            // `pass = pass && profile(...)` short-circuited, so every
            // configuration after the first failure was silently skipped.
            const bool ok =
                ck::profiler::profile_grouped_conv_fwd_outelementop_impl<NDimSpatial,
                                                                         InLayout,
                                                                         WeiLayout,
                                                                         OutLayout,
                                                                         InDataType,
                                                                         WeiDataType,
                                                                         OutDataType,
                                                                         ConvScale,
                                                                         AComputeType,
                                                                         BComputeType>(
                    true,  // do_verification
                    1,     // init_method: integer value
                    false, // do_log
                    true,  // time_kernel
                    param);
            pass = ok && pass;
        }
        EXPECT_TRUE(pass);
    }
};
using namespace ck::tensor_layout::convolution;
// f8/bf8 mixes: (In, Wei, Out, ACompute, BCompute, InLayout, WeiLayout, OutLayout).
using KernelTypes3d = ::testing::Types<
std::tuple<ck::f8_t, ck::f8_t, ck::f8_t, ck::f8_t, ck::f8_t, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<ck::bf8_t, ck::bf8_t, ck::f8_t, ck::bf8_t, ck::bf8_t, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<ck::f8_t, ck::bf8_t, ck::f8_t, ck::f8_t, ck::bf8_t, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<ck::bf8_t, ck::f8_t, ck::f8_t, ck::bf8_t, ck::f8_t, NDHWGC, GKZYXC, NDHWGK>>;
// Thin fixture so the 3D suite can be instantiated on its own type list.
template <typename Tuple>
class TestGroupedConvndFwdConvScale3d : public TestGroupedConvndFwdConvScale<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndFwdConvScale3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndFwdConvScale3d, Test3D)
{
    // Problem table; fields appear to follow ck::utils::conv::ConvParam's
    // constructor order -- confirm against ConvParam if extending.
    this->conv_params = {
        {3, 3, 5, 96, 200, {1, 1, 1}, {37, 37, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {9, 9, 9}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 96, 1, 1, 1, {1, 1, 1}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->template Run<3>();
}

View File

@@ -0,0 +1,91 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "profiler/profile_grouped_conv_fwd_convscale_add_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using ConvScaleAdd = ck::tensor_operation::element_wise::ConvScaleAdd;
/// Typed fixture for grouped conv fwd with the ConvScaleAdd output elementwise
/// op (adds a float bias tensor). Tuple: (Data, InLayout, WeiLayout,
/// BiasLayout, OutLayout).
template <typename Tuple>
class TestGroupedConvndFwdConvScaleAdd : public ::testing::Test
{
    protected:
    using DataType   = std::tuple_element_t<0, Tuple>;
    using InLayout   = std::tuple_element_t<1, Tuple>;
    using WeiLayout  = std::tuple_element_t<2, Tuple>;
    using BiasLayout = std::tuple_element_t<3, Tuple>;
    using OutLayout  = std::tuple_element_t<4, Tuple>;
    using IndexType  = ck::index_t;

    // Convolution problems to profile; populated by the TYPED_TESTs.
    std::vector<ck::utils::conv::ConvParam> conv_params;

    /// Profiles every problem in conv_params and expects all to verify.
    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        // FP8/BF8 capability depends only on the fixture's data type and the
        // device, not on the individual problem, so check once up front.
        if(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")
        {
            if(std::is_same<DataType, ck::f8_t>::value ||
               std::is_same<DataType, ck::bf8_t>::value)
            {
                printf("Skipping FP8 / BF8 tests on CDNA1/2.\n");
                return; // pass stays vacuously true, matching the old per-param `continue`
            }
        }

        bool pass = true;
        for(auto& param : conv_params)
        {
            // Run the profiler before folding into `pass`: the previous
            // `pass = pass && profile(...)` short-circuited, so every
            // configuration after the first failure was silently skipped.
            const bool ok =
                ck::profiler::profile_grouped_conv_fwd_convscale_add_impl<NDimSpatial,
                                                                          InLayout,
                                                                          WeiLayout,
                                                                          BiasLayout,
                                                                          OutLayout,
                                                                          DataType,
                                                                          DataType,
                                                                          float,
                                                                          DataType>(
                    true,  // do_verification
                    1,     // init_method: integer value
                    false, // do_log
                    true,  // time_kernel
                    param);
            pass = ok && pass;
        }
        EXPECT_TRUE(pass);
    }
};
using namespace ck::tensor_layout::convolution;
// Single f8 configuration: (Data, InLayout, WeiLayout, BiasLayout, OutLayout).
using KernelTypes3d = ::testing::Types<std::tuple<ck::f8_t, NDHWGC, GKZYXC, NDHWGK, NDHWGK>>;
// Thin fixture so the 3D suite can be instantiated on its own type list.
template <typename Tuple>
class TestGroupedConvndFwdConvScaleAdd3d : public TestGroupedConvndFwdConvScaleAdd<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndFwdConvScaleAdd3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndFwdConvScaleAdd3d, Test3D)
{
    // Problem table; fields appear to follow ck::utils::conv::ConvParam's
    // constructor order -- confirm against ConvParam if extending.
    this->conv_params = {
        {3, 3, 5, 96, 200, {1, 1, 1}, {37, 37, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {9, 9, 9}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 96, 1, 1, 1, {1, 1, 1}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->template Run<3>();
}

View File

@@ -0,0 +1,89 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using ConvScaleRelu = ck::tensor_operation::element_wise::ConvScaleRelu;
/// Typed fixture for grouped conv fwd with the ConvScaleRelu output
/// elementwise op; one data type is used for In/Wei/Out. Tuple: (Data,
/// InLayout, WeiLayout, OutLayout).
template <typename Tuple>
class TestGroupedConvndFwdConvScaleRelu : public ::testing::Test
{
    protected:
    using DataType  = std::tuple_element_t<0, Tuple>;
    using InLayout  = std::tuple_element_t<1, Tuple>;
    using WeiLayout = std::tuple_element_t<2, Tuple>;
    using OutLayout = std::tuple_element_t<3, Tuple>;
    using IndexType = ck::index_t;

    // Convolution problems to profile; populated by the TYPED_TESTs.
    std::vector<ck::utils::conv::ConvParam> conv_params;

    /// Profiles every problem in conv_params and expects all to verify.
    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        // FP8/BF8 capability depends only on the fixture's data type and the
        // device, not on the individual problem, so check once up front.
        if(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")
        {
            if(std::is_same<DataType, ck::f8_t>::value ||
               std::is_same<DataType, ck::bf8_t>::value)
            {
                printf("Skipping FP8 / BF8 tests on CDNA1/2.\n");
                return; // pass stays vacuously true, matching the old per-param `continue`
            }
        }

        bool pass = true;
        for(auto& param : conv_params)
        {
            // Run the profiler before folding into `pass`: the previous
            // `pass = pass && profile(...)` short-circuited, so every
            // configuration after the first failure was silently skipped.
            const bool ok =
                ck::profiler::profile_grouped_conv_fwd_outelementop_impl<NDimSpatial,
                                                                         InLayout,
                                                                         WeiLayout,
                                                                         OutLayout,
                                                                         DataType,
                                                                         DataType,
                                                                         DataType,
                                                                         ConvScaleRelu>(
                    true,  // do_verification
                    1,     // init_method: integer value
                    false, // do_log
                    true,  // time_kernel
                    param);
            pass = ok && pass;
        }
        EXPECT_TRUE(pass);
    }
};
using namespace ck::tensor_layout::convolution;
// Single f8 configuration: (Data, InLayout, WeiLayout, OutLayout).
using KernelTypes3d = ::testing::Types<std::tuple<ck::f8_t, NDHWGC, GKZYXC, NDHWGK>>;
// Thin fixture so the 3D suite can be instantiated on its own type list.
template <typename Tuple>
class TestGroupedConvndFwdConvScaleRelu3d : public TestGroupedConvndFwdConvScaleRelu<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndFwdConvScaleRelu3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndFwdConvScaleRelu3d, Test3D)
{
    // Problem table; fields appear to follow ck::utils::conv::ConvParam's
    // constructor order -- confirm against ConvParam if extending.
    this->conv_params = {
        {3, 3, 5, 96, 200, {1, 1, 1}, {37, 37, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {9, 9, 9}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 96, 1, 1, 1, {1, 1, 1}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->template Run<3>();
}

View File

@@ -0,0 +1,99 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "profiler/profile_grouped_conv_fwd_scaleadd_scaleadd_relu_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using I8 = int8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
/// Typed fixture for grouped conv fwd with the ScaleAddScaleAddRelu output
/// elementwise op. Tuple: (In, Wei, Out, InLayout, WeiLayout, OutLayout).
template <typename Tuple>
class TestGroupedConvndFwdScaleAddScaleAddRelu : public ::testing::Test
{
    protected:
    using InDataType  = std::tuple_element_t<0, Tuple>;
    using WeiDataType = std::tuple_element_t<1, Tuple>;
    using OutDataType = std::tuple_element_t<2, Tuple>;
    using InLayout    = std::tuple_element_t<3, Tuple>;
    using WeiLayout   = std::tuple_element_t<4, Tuple>;
    using OutLayout   = std::tuple_element_t<5, Tuple>;

    // Convolution problems to profile; populated by the TYPED_TESTs.
    std::vector<ck::utils::conv::ConvParam> conv_params;

    /// Profiles every problem in conv_params and expects all to verify.
    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        // FP8/BF8 capability depends only on the fixture's data type and the
        // device, not on the individual problem, so check once up front.
        if(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")
        {
            if(std::is_same<InDataType, ck::f8_t>::value ||
               std::is_same<InDataType, ck::bf8_t>::value)
            {
                printf("Skipping FP8 / BF8 tests on CDNA1/2.\n");
                return; // pass stays vacuously true, matching the old per-param `continue`
            }
        }

        bool pass = true;
        for(auto& param : conv_params)
        {
            // Run the profiler before folding into `pass`: the previous
            // `pass = pass && profile(...)` short-circuited, so every
            // configuration after the first failure was silently skipped.
            const bool ok = ck::profiler::profile_grouped_conv_fwd_scaleadd_scaleadd_relu_impl<
                NDimSpatial,
                InLayout,
                WeiLayout,
                OutLayout,
                InDataType,
                WeiDataType,
                OutDataType,
                ck::tensor_operation::element_wise::ScaleAddScaleAddRelu,
                InDataType,
                InDataType>(true,  // do_verification
                            1,     // init_method: integer value
                            false, // do_log
                            true,  // time_kernel
                            param);
            pass = ok && pass;
        }
        EXPECT_TRUE(pass);
    }
};
using namespace ck::tensor_layout::convolution;
// Tested configurations: (In, Wei, Out, InLayout, WeiLayout, OutLayout).
using CombConvScaleAddScaleAddReluKernelTypes3d =
::testing::Types<std::tuple<F16, F16, F16, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<BF16, BF16, BF16, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<I8, I8, I8, NDHWGC, GKZYXC, NDHWGK>>;
// Thin fixture so the 3D suite can be instantiated on its own type list.
template <typename Tuple>
class TestGroupedConvndFwdScaleAddScaleAddRelu3d
: public TestGroupedConvndFwdScaleAddScaleAddRelu<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndFwdScaleAddScaleAddRelu3d,
CombConvScaleAddScaleAddReluKernelTypes3d);
TYPED_TEST(TestGroupedConvndFwdScaleAddScaleAddRelu3d, Test3D)
{
    // Problem table; fields appear to follow ck::utils::conv::ConvParam's
    // constructor order -- confirm against ConvParam if extending.
    this->conv_params = {
        {3, 3, 5, 96, 200, {1, 1, 1}, {37, 37, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 32, 32, {5, 5, 5}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->template Run<3>();
}